//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
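//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as save-on-call under both the allocator's and the C
// calling convention, spilled as a 32-bit float (Op_RegF), with hardware
// encoding 0, and backed by the VMReg slot returned by xmm0->as_VMReg().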

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
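
// Note: each 64-bit opmask register is modeled as a pair of adjacent
// 32-bit VMReg slots, hence the Kn_H ("high half") definitions above.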


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// Flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
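
// A reg_class_dynamic selects between its two constituent classes at code
// generation time: the first (EVEX) class is used when the guard predicate
// holds, otherwise the legacy fallback is used. float_reg therefore widens
// to XMM16-XMM31 only on EVEX-capable CPUs, and float_reg_vl additionally
// requires AVX512VL support.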

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
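
// The *_vl and *_vlbwdq dynamic classes select their EVEX variants (and thus
// expose XMM16-31) only when the corresponding AVX512 subfeatures are present
// (VL, plus BW and DQ for *_vlbwdq), so sub-512-bit operands drawn from them
// can always be EVEX-encoded.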
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
// To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(C2_MacroAssembler *masm);
 1191   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
  switch (bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   address base = __ start_a_stub(size_exception_handler());
 1314   if (base == nullptr) {
 1315     ciEnv::current()->record_failure("CodeCache is full");
 1316     return 0;  // CodeBuffer::expand failed
 1317   }
 1318   int offset = __ offset();
 1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1321   __ end_a_stub();
 1322   return offset;
 1323 }
 1324 
 1325 // Emit deopt handler code.
 1326 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1327 
 1328   // Note that the code buffer's insts_mark is always relative to insts.
 1329   // That's why we must use the macroassembler to generate a handler.
 1330   address base = __ start_a_stub(size_deopt_handler());
 1331   if (base == nullptr) {
 1332     ciEnv::current()->record_failure("CodeCache is full");
 1333     return 0;  // CodeBuffer::expand failed
 1334   }
 1335   int offset = __ offset();
 1336 
 1337 #ifdef _LP64
 1338   address the_pc = (address) __ pc();
 1339   Label next;
 1340   // push a "the_pc" on the stack without destroying any registers
 1341   // as they all may be live.
 1342 
 1343   // push address of "next"
 1344   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1345   __ bind(next);
 1346   // adjust it so it matches "the_pc"
 1347   __ subptr(Address(rsp, 0), __ offset() - offset);
 1348 #else
 1349   InternalAddress here(__ pc());
 1350   __ pushptr(here.addr(), noreg);
 1351 #endif
 1352 
 1353   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1354   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1355   __ end_a_stub();
 1356   return offset;
 1357 }
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
 1374   // Float masks come from different places depending on platform.
 1375 #ifdef _LP64
 1376   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1377   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1378   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1379   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1380 #else
 1381   static address float_signmask()  { return (address)float_signmask_pool; }
 1382   static address float_signflip()  { return (address)float_signflip_pool; }
 1383   static address double_signmask() { return (address)double_signmask_pool; }
 1384   static address double_signflip() { return (address)double_signflip_pool; }
 1385 #endif
 1386   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1387   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1388   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1389   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1390   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1391   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1392   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1393   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1394   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1395   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1396   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1397   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1398   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1399   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1400   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
 1403 bool Matcher::match_rule_supported(int opcode) {
 1404   if (!has_match_rule(opcode)) {
 1405     return false; // no match rule present
 1406   }
 1407   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1408   switch (opcode) {
 1409     case Op_AbsVL:
 1410     case Op_StoreVectorScatter:
 1411       if (UseAVX < 3) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountI:
 1416     case Op_PopCountL:
 1417       if (!UsePopCountInstruction) {
 1418         return false;
 1419       }
 1420       break;
 1421     case Op_PopCountVI:
 1422       if (UseAVX < 2) {
 1423         return false;
 1424       }
 1425       break;
 1426     case Op_CompressV:
 1427     case Op_ExpandV:
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
      if (!VM_Version::supports_avx512dq()) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
      if (UseAVX < 3) { // enabled for AVX3 only
        return false;
      }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
      if (!VM_Version::supports_on_spin_wait()) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
      if (!VM_Version::supports_avx()) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572     case Op_LoadVectorGatherMasked:
 1573       if (UseAVX < 2) {
 1574         return false;
 1575       }
 1576       break;
 1577     case Op_FmaF:
 1578     case Op_FmaD:
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
      if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64) {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_SqrtF:
 1664       if (UseSSE < 1) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtD:
 1669 #ifdef _LP64
 1670       if (UseSSE < 2) {
 1671         return false;
 1672       }
 1673 #else
 1674       // x86_32.ad has a special match rule for SqrtD.
 1675       // Together with common x86 rules, this handles all UseSSE cases.
 1676 #endif
 1677       break;
 1678     case Op_ConvF2HF:
 1679     case Op_ConvHF2F:
 1680       if (!VM_Version::supports_float16()) {
 1681         return false;
 1682       }
 1683       break;
 1684     case Op_VectorCastF2HF:
 1685     case Op_VectorCastHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1687         return false;
 1688       }
 1689       break;
 1690   }
 1691   return true;  // Match rules are supported by default.
 1692 }
 1693 
 1694 //------------------------------------------------------------------------
 1695 
 1696 static inline bool is_pop_count_instr_target(BasicType bt) {
 1697   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1698          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1699 }
 1700 
 1701 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1702   return match_rule_supported_vector(opcode, vlen, bt);
 1703 }
 1704 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics guarded by vector length (vlen) and element type (bt).
 1707 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1708   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1709   if (!match_rule_supported(opcode)) {
 1710     return false;
 1711   }
 1712   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1713   //   * SSE2 supports 128bit vectors for all types;
 1714   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1715   //   * AVX2 supports 256bit vectors for all types;
 1716   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1717   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1718   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1719   // And MaxVectorSize is taken into account as well.
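  // For example, with UseAVX == 2 a 256bit vector of 8 ints (vlen == 8,
  // bt == T_INT) passes the check below, while on AVX1-only hardware it fails,
  // since AVX1 256bit vectors are limited to FLOAT and DOUBLE.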
 1720   if (!vector_size_supported(bt, vlen)) {
 1721     return false;
 1722   }
 1723   // Special cases which require vector length follow:
 1724   //   * implementation limitations
 1725   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1726   //   * 128bit vroundpd instruction is present only in AVX1
 1727   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1728   switch (opcode) {
 1729     case Op_AbsVF:
 1730     case Op_NegVF:
      if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
 1732         return false; // 512bit vandps and vxorps are not available
 1733       }
 1734       break;
 1735     case Op_AbsVD:
 1736     case Op_NegVD:
      if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
 1738         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1739       }
 1740       break;
 1741     case Op_RotateRightV:
 1742     case Op_RotateLeftV:
 1743       if (bt != T_INT && bt != T_LONG) {
 1744         return false;
 1745       } // fallthrough
 1746     case Op_MacroLogicV:
 1747       if (!VM_Version::supports_evex() ||
 1748           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1749         return false;
 1750       }
 1751       break;
 1752     case Op_ClearArray:
 1753     case Op_VectorMaskGen:
 1754     case Op_VectorCmpMasked:
 1755       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1756         return false;
 1757       }
 1758       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1759         return false;
 1760       }
 1761       break;
 1762     case Op_LoadVectorMasked:
 1763     case Op_StoreVectorMasked:
 1764       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1765         return false;
 1766       }
 1767       break;
 1768     case Op_MaxV:
 1769     case Op_MinV:
 1770       if (UseSSE < 4 && is_integral_type(bt)) {
 1771         return false;
 1772       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family only, currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && size_in_bits == 512 && !VM_Version::supports_avx512dq()) { // 512bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
 1782       break;
 1783     case Op_CallLeafVector:
 1784       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1785         return false;
 1786       }
 1787       break;
 1788     case Op_AddReductionVI:
 1789       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1790         return false;
 1791       }
 1792       // fallthrough
 1793     case Op_AndReductionV:
 1794     case Op_OrReductionV:
 1795     case Op_XorReductionV:
 1796       if (is_subword_type(bt) && (UseSSE < 4)) {
 1797         return false;
 1798       }
 1799 #ifndef _LP64
 1800       if (bt == T_BYTE || bt == T_LONG) {
 1801         return false;
 1802       }
 1803 #endif
 1804       break;
 1805 #ifndef _LP64
 1806     case Op_VectorInsert:
 1807       if (bt == T_LONG || bt == T_DOUBLE) {
 1808         return false;
 1809       }
 1810       break;
 1811 #endif
 1812     case Op_MinReductionV:
 1813     case Op_MaxReductionV:
 1814       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1815         return false;
 1816       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1817         return false;
 1818       }
 1819       // Float/Double intrinsics enabled for AVX family.
 1820       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1821         return false;
 1822       }
 1823       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1824         return false;
 1825       }
 1826 #ifndef _LP64
 1827       if (bt == T_BYTE || bt == T_LONG) {
 1828         return false;
 1829       }
 1830 #endif
 1831       break;
 1832     case Op_VectorTest:
 1833       if (UseSSE < 4) {
 1834         return false; // Implementation limitation
 1835       } else if (size_in_bits < 32) {
 1836         return false; // Implementation limitation
 1837       }
 1838       break;
 1839     case Op_VectorLoadShuffle:
 1840     case Op_VectorRearrange:
      if (vlen == 2) {
 1842         return false; // Implementation limitation due to how shuffle is loaded
 1843       } else if (size_in_bits == 256 && UseAVX < 2) {
 1844         return false; // Implementation limitation
 1845       }
 1846       break;
 1847     case Op_VectorLoadMask:
 1848     case Op_VectorMaskCast:
 1849       if (size_in_bits == 256 && UseAVX < 2) {
 1850         return false; // Implementation limitation
 1851       }
 1852       // fallthrough
 1853     case Op_VectorStoreMask:
 1854       if (vlen == 2) {
 1855         return false; // Implementation limitation
 1856       }
 1857       break;
 1858     case Op_PopulateIndex:
 1859       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_VectorCastB2X:
 1864     case Op_VectorCastS2X:
 1865     case Op_VectorCastI2X:
 1866       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1867         return false;
 1868       }
 1869       break;
 1870     case Op_VectorCastL2X:
 1871       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1872         return false;
 1873       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1874         return false;
 1875       }
 1876       break;
 1877     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1881         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1882         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1883           return false;
 1884         }
 1885       }
 1886       // fallthrough
 1887     case Op_VectorCastD2X:
 1888       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1889         return false;
 1890       }
 1891       break;
 1892     case Op_VectorCastF2HF:
 1893     case Op_VectorCastHF2F:
 1894       if (!VM_Version::supports_f16c() &&
 1895          ((!VM_Version::supports_evex() ||
 1896          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1897         return false;
 1898       }
 1899       break;
 1900     case Op_RoundVD:
 1901       if (!VM_Version::supports_avx512dq()) {
 1902         return false;
 1903       }
 1904       break;
 1905     case Op_MulReductionVI:
 1906       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1907         return false;
 1908       }
 1909       break;
 1910     case Op_LoadVectorGatherMasked:
 1911       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1912         return false;
 1913       }
 1914       if (is_subword_type(bt) &&
 1915          (!is_LP64                                                ||
 1916          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1917          (size_in_bits < 64)                                      ||
 1918          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1919         return false;
 1920       }
 1921       break;
 1922     case Op_StoreVectorScatterMasked:
 1923     case Op_StoreVectorScatter:
 1924       if (is_subword_type(bt)) {
 1925         return false;
 1926       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1927         return false;
 1928       }
 1929       // fallthrough
 1930     case Op_LoadVectorGather:
 1931       if (!is_subword_type(bt) && size_in_bits == 64) {
 1932         return false;
 1933       }
 1934       if (is_subword_type(bt) && size_in_bits < 64) {
 1935         return false;
 1936       }
 1937       break;
 1938     case Op_SelectFromTwoVector:
 1939        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1940          return false;
 1941        }
 1942        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1943          return false;
 1944        }
 1945        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1946          return false;
 1947        }
 1948        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1949          return false;
 1950        }
 1951        break;
 1952     case Op_MaskAll:
 1953       if (!VM_Version::supports_evex()) {
 1954         return false;
 1955       }
 1956       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1957         return false;
 1958       }
 1959       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1960         return false;
 1961       }
 1962       break;
 1963     case Op_VectorMaskCmp:
 1964       if (vlen < 2 || size_in_bits < 32) {
 1965         return false;
 1966       }
 1967       break;
 1968     case Op_CompressM:
 1969       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_CompressV:
 1974     case Op_ExpandV:
 1975       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1976         return false;
 1977       }
 1978       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1979         return false;
 1980       }
      if (size_in_bits < 128) {
        return false;
      }
      break;
 1984     case Op_VectorLongToMask:
 1985       if (UseAVX < 1 || !is_LP64) {
 1986         return false;
 1987       }
 1988       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1989         return false;
 1990       }
 1991       break;
 1992     case Op_SignumVD:
 1993     case Op_SignumVF:
 1994       if (UseAVX < 1) {
 1995         return false;
 1996       }
 1997       break;
 1998     case Op_PopCountVI:
 1999     case Op_PopCountVL: {
 2000         if (!is_pop_count_instr_target(bt) &&
 2001             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2002           return false;
 2003         }
 2004       }
 2005       break;
 2006     case Op_ReverseV:
 2007     case Op_ReverseBytesV:
 2008       if (UseAVX < 2) {
 2009         return false;
 2010       }
 2011       break;
 2012     case Op_CountTrailingZerosV:
 2013     case Op_CountLeadingZerosV:
 2014       if (UseAVX < 2) {
 2015         return false;
 2016       }
 2017       break;
 2018   }
  return true;  // Match rules are supported by default.
 2020 }
 2021 
 2022 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
  // nodes of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine therefore does a strict check on the existence of masked operation
  // patterns: it returns false by default for all opcodes apart from the ones whose
  // masked instruction patterns are defined in this file.
 2029   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2030     return false;
 2031   }
 2032 
 2033   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2034   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2035   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2036     return false;
 2037   }
  switch (opcode) {
 2039     // Unary masked operations
 2040     case Op_AbsVB:
 2041     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
 2045     case Op_AbsVI:
 2046     case Op_AbsVL:
 2047       return true;
 2048 
 2049     // Ternary masked operations
 2050     case Op_FmaVF:
 2051     case Op_FmaVD:
 2052       return true;
 2053 
 2054     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2056         return false;
 2057       }
 2058       return true;
 2059 
 2060     // Binary masked operations
 2061     case Op_AddVB:
 2062     case Op_AddVS:
 2063     case Op_SubVB:
 2064     case Op_SubVS:
 2065     case Op_MulVS:
 2066     case Op_LShiftVS:
 2067     case Op_RShiftVS:
 2068     case Op_URShiftVS:
 2069       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2070       if (!VM_Version::supports_avx512bw()) {
 2071         return false;  // Implementation limitation
 2072       }
 2073       return true;
 2074 
 2075     case Op_MulVL:
 2076       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2077       if (!VM_Version::supports_avx512dq()) {
 2078         return false;  // Implementation limitation
 2079       }
 2080       return true;
 2081 
 2082     case Op_AndV:
 2083     case Op_OrV:
 2084     case Op_XorV:
 2085     case Op_RotateRightV:
 2086     case Op_RotateLeftV:
 2087       if (bt != T_INT && bt != T_LONG) {
 2088         return false; // Implementation limitation
 2089       }
 2090       return true;
 2091 
 2092     case Op_VectorLoadMask:
 2093       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2094       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2095         return false;
 2096       }
 2097       return true;
 2098 
 2099     case Op_AddVI:
 2100     case Op_AddVL:
 2101     case Op_AddVF:
 2102     case Op_AddVD:
 2103     case Op_SubVI:
 2104     case Op_SubVL:
 2105     case Op_SubVF:
 2106     case Op_SubVD:
 2107     case Op_MulVI:
 2108     case Op_MulVF:
 2109     case Op_MulVD:
 2110     case Op_DivVF:
 2111     case Op_DivVD:
 2112     case Op_SqrtVF:
 2113     case Op_SqrtVD:
 2114     case Op_LShiftVI:
 2115     case Op_LShiftVL:
 2116     case Op_RShiftVI:
 2117     case Op_RShiftVL:
 2118     case Op_URShiftVI:
 2119     case Op_URShiftVL:
 2120     case Op_LoadVectorMasked:
 2121     case Op_StoreVectorMasked:
 2122     case Op_LoadVectorGatherMasked:
 2123     case Op_StoreVectorScatterMasked:
 2124       return true;
 2125 
 2126     case Op_MaxV:
 2127     case Op_MinV:
 2128       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2129         return false; // Implementation limitation
 2130       }
 2131       if (is_floating_point_type(bt)) {
 2132         return false; // Implementation limitation
 2133       }
 2134       return true;
 2135 
 2136     case Op_VectorMaskCmp:
 2137       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2138         return false; // Implementation limitation
 2139       }
 2140       return true;
 2141 
 2142     case Op_VectorRearrange:
 2143       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2144         return false; // Implementation limitation
 2145       }
 2146       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2147         return false; // Implementation limitation
 2148       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2149         return false; // Implementation limitation
 2150       }
 2151       return true;
 2152 
 2153     // Binary Logical operations
 2154     case Op_AndVMask:
 2155     case Op_OrVMask:
 2156     case Op_XorVMask:
 2157       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2158         return false; // Implementation limitation
 2159       }
 2160       return true;
 2161 
 2162     case Op_PopCountVI:
 2163     case Op_PopCountVL:
 2164       if (!is_pop_count_instr_target(bt)) {
 2165         return false;
 2166       }
 2167       return true;
 2168 
 2169     case Op_MaskAll:
 2170       return true;
 2171 
 2172     case Op_CountLeadingZerosV:
 2173       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2174         return true;
 2175       }
 2176     default:
 2177       return false;
 2178   }
 2179 }
 2180 
 2181 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2182   return false;
 2183 }
 2184 
 2185 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2186   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2187   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2188   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2189       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2190     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2191     return new legVecZOper();
 2192   }
 2193   if (legacy) {
 2194     switch (ideal_reg) {
 2195       case Op_VecS: return new legVecSOper();
 2196       case Op_VecD: return new legVecDOper();
 2197       case Op_VecX: return new legVecXOper();
 2198       case Op_VecY: return new legVecYOper();
 2199       case Op_VecZ: return new legVecZOper();
 2200     }
 2201   } else {
 2202     switch (ideal_reg) {
 2203       case Op_VecS: return new vecSOper();
 2204       case Op_VecD: return new vecDOper();
 2205       case Op_VecX: return new vecXOper();
 2206       case Op_VecY: return new vecYOper();
 2207       case Op_VecZ: return new vecZOper();
 2208     }
 2209   }
 2210   ShouldNotReachHere();
 2211   return nullptr;
 2212 }
 2213 
 2214 bool Matcher::is_reg2reg_move(MachNode* m) {
 2215   switch (m->rule()) {
 2216     case MoveVec2Leg_rule:
 2217     case MoveLeg2Vec_rule:
 2218     case MoveF2VL_rule:
 2219     case MoveF2LEG_rule:
 2220     case MoveVL2F_rule:
 2221     case MoveLEG2F_rule:
 2222     case MoveD2VL_rule:
 2223     case MoveD2LEG_rule:
 2224     case MoveVL2D_rule:
 2225     case MoveLEG2D_rule:
 2226       return true;
 2227     default:
 2228       return false;
 2229   }
 2230 }
 2231 
 2232 bool Matcher::is_generic_vector(MachOper* opnd) {
 2233   switch (opnd->opcode()) {
 2234     case VEC:
 2235     case LEGVEC:
 2236       return true;
 2237     default:
 2238       return false;
 2239   }
 2240 }
 2241 
 2242 //------------------------------------------------------------------------
 2243 
 2244 const RegMask* Matcher::predicate_reg_mask(void) {
 2245   return &_VECTMASK_REG_mask;
 2246 }
 2247 
 2248 // Max vector size in bytes. 0 if not supported.
 2249 int Matcher::vector_width_in_bytes(BasicType bt) {
 2250   assert(is_java_primitive(bt), "only primitive type vectors");
 2251   if (UseSSE < 2) return 0;
 2252   // SSE2 supports 128bit vectors for all types.
 2253   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2255   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2256   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2257   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2258     size = (UseAVX > 2) ? 64 : 32;
 2259   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2260     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2261   // Use flag to limit vector size.
 2262   size = MIN2(size,(int)MaxVectorSize);
 2263   // Minimum 2 values in vector (or 4 for bytes).
 2264   switch (bt) {
 2265   case T_DOUBLE:
 2266   case T_LONG:
 2267     if (size < 16) return 0;
 2268     break;
 2269   case T_FLOAT:
 2270   case T_INT:
 2271     if (size < 8) return 0;
 2272     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
 2285   default:
 2286     ShouldNotReachHere();
 2287   }
 2288   return size;
 2289 }
 2290 
 2291 // Limits on vector size (number of elements) loaded into vector.
 2292 int Matcher::max_vector_size(const BasicType bt) {
 2293   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2294 }
 2295 int Matcher::min_vector_size(const BasicType bt) {
 2296   int max_size = max_vector_size(bt);
  // Min number of elements which can be loaded into a vector: 4 for byte
  // elements (at least 4 bytes), 2 for all other element types.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors so SVML double64 stubs can be called.
 2300   if (bt == T_DOUBLE) {
 2301     size = 1;
 2302   }
 2303   return MIN2(size,max_size);
 2304 }
 2305 
 2306 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2307   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2308   // by default on Cascade Lake
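  // (512-bit instructions can trigger frequency scaling on that
  // microarchitecture, which tends to cost auto-vectorized code more
  // than the wider vectors gain).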
 2309   if (VM_Version::is_default_intel_cascade_lake()) {
 2310     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2311   }
 2312   return Matcher::max_vector_size(bt);
 2313 }
 2314 
 2315 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2316   return -1;
 2317 }
 2318 
 2319 // Vector ideal reg corresponding to specified size in bytes
 2320 uint Matcher::vector_ideal_reg(int size) {
 2321   assert(MaxVectorSize >= size, "");
 2322   switch(size) {
 2323     case  4: return Op_VecS;
 2324     case  8: return Op_VecD;
 2325     case 16: return Op_VecX;
 2326     case 32: return Op_VecY;
 2327     case 64: return Op_VecZ;
 2328   }
 2329   ShouldNotReachHere();
 2330   return 0;
 2331 }
 2332 
 2333 // Check for shift by small constant as well
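// A left shift by 0-3 maps onto the scale field (1/2/4/8) of an x86 memory
// operand, e.g. (AddP base (LShiftX idx 3)) folds into [base + idx*8], so such
// shifts are cloned next to each address expression they feed instead of being
// computed once into a register.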
 2334 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2335   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2336       shift->in(2)->get_int() <= 3 &&
 2337       // Are there other uses besides address expressions?
 2338       !matcher->is_visited(shift)) {
 2339     address_visited.set(shift->_idx); // Flag as address_visited
 2340     mstack.push(shift->in(2), Matcher::Visit);
 2341     Node *conv = shift->in(1);
 2342 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
 2346     if (conv->Opcode() == Op_ConvI2L &&
 2347         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2348         // Are there other uses besides address expressions?
 2349         !matcher->is_visited(conv)) {
 2350       address_visited.set(conv->_idx); // Flag as address_visited
 2351       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2352     } else
 2353 #endif
 2354       mstack.push(conv, Matcher::Pre_Visit);
 2355     return true;
 2356   }
 2357   return false;
 2358 }
 2359 
// This function identifies sub-graphs in which a 'load' node is
// the input to two different nodes, such that the whole pattern can
// be matched with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2366 //
 2367 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2368 // This is a temporary solution until we make DAGs expressible in ADL.
 2369 template<typename ConType>
 2370 class FusedPatternMatcher {
 2371   Node* _op1_node;
 2372   Node* _mop_node;
 2373   int _con_op;
 2374 
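  // Returns the input index (1 or 2) of 'n' whose opcode is 'next_op', or -1
  // if neither input matches. If next_op_idx == -1, 'n' is treated as
  // commutative and both inputs are checked; otherwise only in(next_op_idx).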
 2375   static int match_next(Node* n, int next_op, int next_op_idx) {
 2376     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2377       return -1;
 2378     }
 2379 
    if (next_op_idx == -1) { // n is commutative, check both input orders
 2381       if (n->in(1)->Opcode() == next_op) {
 2382         return 1;
 2383       } else if (n->in(2)->Opcode() == next_op) {
 2384         return 2;
 2385       }
 2386     } else {
 2387       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2388       if (n->in(next_op_idx)->Opcode() == next_op) {
 2389         return next_op_idx;
 2390       }
 2391     }
 2392     return -1;
 2393   }
 2394 
 2395  public:
 2396   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2397     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2398 
 2399   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2400              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2401              typename ConType::NativeType con_value) {
 2402     if (_op1_node->Opcode() != op1) {
 2403       return false;
 2404     }
    // The load must have no uses beyond the two fused operations
    if (_mop_node->outcnt() > 2) {
      return false;
    }
 2408     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2409     if (op1_op2_idx == -1) {
 2410       return false;
 2411     }
 2412     // Memory operation must be the other edge
 2413     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2414 
 2415     // Check that the mop node is really what we want
 2416     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2417       Node* op2_node = _op1_node->in(op1_op2_idx);
      if (op2_node->outcnt() > 1) { // op2 must have no other uses
 2419         return false;
 2420       }
 2421       assert(op2_node->Opcode() == op2, "Should be");
 2422       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2423       if (op2_con_idx == -1) {
 2424         return false;
 2425       }
 2426       // Memory operation must be the other edge
 2427       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2428       // Check that the memory operation is the same node
 2429       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2430         // Now check the constant
 2431         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2432         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2433           return true;
 2434         }
 2435       }
 2436     }
 2437     return false;
 2438   }
 2439 };
 2440 
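// Returns true if (n, m) matches one of the BMI1 idioms, with the load 'm'
// feeding both sides of the expression:
//   x & -x       -> blsi (extract lowest set bit)
//   x & (x - 1)  -> blsr (reset lowest set bit)
//   x ^ (x - 1)  -> blsmsk (mask up to lowest set bit)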
 2441 static bool is_bmi_pattern(Node* n, Node* m) {
 2442   assert(UseBMI1Instructions, "sanity");
 2443   if (n != nullptr && m != nullptr) {
 2444     if (m->Opcode() == Op_LoadI) {
 2445       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2446       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2447              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2448              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2449     } else if (m->Opcode() == Op_LoadL) {
 2450       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2451       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2452              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2453              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2454     }
 2455   }
 2456   return false;
 2457 }
 2458 
 2459 // Should the matcher clone input 'm' of node 'n'?
 2460 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2461   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2462   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2463     mstack.push(m, Visit);
 2464     return true;
 2465   }
 2466   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2467     mstack.push(m, Visit);           // m = ShiftCntV
 2468     return true;
 2469   }
 2470   if (is_encode_and_store_pattern(n, m)) {
 2471     mstack.push(m, Visit);
 2472     return true;
 2473   }
 2474   return false;
 2475 }
 2476 
 2477 // Should the Matcher clone shifts on addressing modes, expecting them
 2478 // to be subsumed into complex addressing expressions or compute them
 2479 // into registers?
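// On x86 a chain like (AddP (AddP base (LShiftX idx con)) off) folds into a
// single [base + idx*scale + disp] addressing mode, so the inner AddP and the
// shift are cloned into each address expression rather than computed into a
// register.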
 2480 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2481   Node *off = m->in(AddPNode::Offset);
 2482   if (off->is_Con()) {
 2483     address_visited.test_set(m->_idx); // Flag as address_visited
 2484     Node *adr = m->in(AddPNode::Address);
 2485 
 2486     // Intel can handle 2 adds in addressing mode
 2487     // AtomicAdd is not an addressing expression.
 2488     // Cheap to find it by looking for screwy base.
 2489     if (adr->is_AddP() &&
 2490         !adr->in(AddPNode::Base)->is_top() &&
 2491         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2492         // Are there other uses besides address expressions?
 2493         !is_visited(adr)) {
 2494       address_visited.set(adr->_idx); // Flag as address_visited
 2495       Node *shift = adr->in(AddPNode::Offset);
 2496       if (!clone_shift(shift, this, mstack, address_visited)) {
 2497         mstack.push(shift, Pre_Visit);
 2498       }
 2499       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2500       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2501     } else {
 2502       mstack.push(adr, Pre_Visit);
 2503     }
 2504 
 2505     // Clone X+offset as it also folds into most addressing expressions
 2506     mstack.push(off, Visit);
 2507     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2508     return true;
 2509   } else if (clone_shift(off, this, mstack, address_visited)) {
 2510     address_visited.test_set(m->_idx); // Flag as address_visited
 2511     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2512     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2513     return true;
 2514   }
 2515   return false;
 2516 }
 2517 
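// Maps a BoolTest condition to the corresponding integer comparison predicate.
// Signed and unsigned variants map to the same predicate code here; signedness
// is handled by selecting a signed vs. unsigned compare instruction instead.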
 2518 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2519   switch (bt) {
 2520     case BoolTest::eq:
 2521       return Assembler::eq;
 2522     case BoolTest::ne:
 2523       return Assembler::neq;
 2524     case BoolTest::le:
 2525     case BoolTest::ule:
 2526       return Assembler::le;
 2527     case BoolTest::ge:
 2528     case BoolTest::uge:
 2529       return Assembler::nlt;
 2530     case BoolTest::lt:
 2531     case BoolTest::ult:
 2532       return Assembler::lt;
 2533     case BoolTest::gt:
 2534     case BoolTest::ugt:
 2535       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2537   }
 2538 }
 2539 
 2540 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2541   switch (bt) {
 2542   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2543   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2544   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2545   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2546   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2547   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2548   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2549   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2550   }
 2551 }
 2552 
 2553 // Helper methods for MachSpillCopyNode::implementation().
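// Copies between two vector registers. With AVX-512 but without AVX512VL,
// plain (v)movdqu cannot encode xmm16-xmm31 at 128/256 bits, so the copy is
// emitted as a vextractf32x4/vextractf64x4 of lane 0 instead.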
 2554 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2555                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2556   assert(ireg == Op_VecS || // 32bit vector
 2557          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2558           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2559          "no non-adjacent vector moves" );
 2560   if (masm) {
 2561     switch (ireg) {
 2562     case Op_VecS: // copy whole register
 2563     case Op_VecD:
 2564     case Op_VecX:
 2565 #ifndef _LP64
 2566       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2567 #else
 2568       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2569         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2570       } else {
 2571         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2572      }
 2573 #endif
 2574       break;
 2575     case Op_VecY:
 2576 #ifndef _LP64
 2577       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2578 #else
 2579       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2580         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2581       } else {
 2582         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2583      }
 2584 #endif
 2585       break;
 2586     case Op_VecZ:
 2587       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2588       break;
 2589     default:
 2590       ShouldNotReachHere();
 2591     }
 2592 #ifndef PRODUCT
 2593   } else {
 2594     switch (ireg) {
 2595     case Op_VecS:
 2596     case Op_VecD:
 2597     case Op_VecX:
 2598       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2599       break;
 2600     case Op_VecY:
 2601     case Op_VecZ:
 2602       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2603       break;
 2604     default:
 2605       ShouldNotReachHere();
 2606     }
 2607 #endif
 2608   }
 2609 }
 2610 
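// Loads or stores a vector spill slot at [rsp + stack_offset]. Mirrors
// vec_mov_helper: without AVX512VL, the vinsertf32x4/64x4 and
// vextractf32x4/64x4 forms are used so that registers xmm16-xmm31 remain
// reachable.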
 2611 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2612                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2613   if (masm) {
 2614     if (is_load) {
 2615       switch (ireg) {
 2616       case Op_VecS:
 2617         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2618         break;
 2619       case Op_VecD:
 2620         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2621         break;
 2622       case Op_VecX:
 2623 #ifndef _LP64
 2624         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2625 #else
 2626         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2627           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2628         } else {
 2629           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2631         }
 2632 #endif
 2633         break;
 2634       case Op_VecY:
 2635 #ifndef _LP64
 2636         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2637 #else
 2638         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2639           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2640         } else {
 2641           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2643         }
 2644 #endif
 2645         break;
 2646       case Op_VecZ:
 2647         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2648         break;
 2649       default:
 2650         ShouldNotReachHere();
 2651       }
 2652     } else { // store
 2653       switch (ireg) {
 2654       case Op_VecS:
 2655         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2656         break;
 2657       case Op_VecD:
 2658         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2659         break;
 2660       case Op_VecX:
 2661 #ifndef _LP64
 2662         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2663 #else
 2664         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2665           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2666         }
 2667         else {
 2668           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2669         }
 2670 #endif
 2671         break;
 2672       case Op_VecY:
 2673 #ifndef _LP64
 2674         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2675 #else
 2676         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2677           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2678         }
 2679         else {
 2680           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2681         }
 2682 #endif
 2683         break;
 2684       case Op_VecZ:
 2685         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2686         break;
 2687       default:
 2688         ShouldNotReachHere();
 2689       }
 2690     }
 2691 #ifndef PRODUCT
 2692   } else {
 2693     if (is_load) {
 2694       switch (ireg) {
 2695       case Op_VecS:
 2696         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2697         break;
 2698       case Op_VecD:
 2699         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2700         break;
 2701        case Op_VecX:
 2702         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2703         break;
 2704       case Op_VecY:
 2705       case Op_VecZ:
 2706         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2707         break;
 2708       default:
 2709         ShouldNotReachHere();
 2710       }
 2711     } else { // store
 2712       switch (ireg) {
 2713       case Op_VecS:
 2714         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2715         break;
 2716       case Op_VecD:
 2717         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2718         break;
 2719        case Op_VecX:
 2720         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2721         break;
 2722       case Op_VecY:
 2723       case Op_VecZ:
 2724         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2725         break;
 2726       default:
 2727         ShouldNotReachHere();
 2728       }
 2729     }
 2730 #endif
 2731   }
 2732 }
 2733 
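// Builds a GrowableArray holding 'len' copies of the immediate 'con' viewed as
// basic type 'bt'; used when materializing replicated vector constants.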
 2734 template <class T>
 2735 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2736   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2737   jvalue ele;
 2738   switch (bt) {
 2739     case T_BYTE:   ele.b = con; break;
 2740     case T_SHORT:  ele.s = con; break;
 2741     case T_INT:    ele.i = con; break;
 2742     case T_LONG:   ele.j = con; break;
 2743     case T_FLOAT:  ele.f = con; break;
 2744     case T_DOUBLE: ele.d = con; break;
 2745     default: ShouldNotReachHere();
 2746   }
 2747   for (int i = 0; i < len; i++) {
 2748     val->append(ele);
 2749   }
 2750   return val;
 2751 }
 2752 
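// Returns a 64-bit pattern with the sign (highest) bit of every element of the
// given type set, e.g. 0x80 repeated eight times for T_BYTE.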
 2753 static inline jlong high_bit_set(BasicType bt) {
 2754   switch (bt) {
 2755     case T_BYTE:  return 0x8080808080808080;
 2756     case T_SHORT: return 0x8000800080008000;
 2757     case T_INT:   return 0x8000000080000000;
 2758     case T_LONG:  return 0x8000000000000000;
 2759     default:
 2760       ShouldNotReachHere();
 2761       return 0;
 2762   }
 2763 }
 2764 
 2765 #ifndef PRODUCT
 2766   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2767     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2768   }
 2769 #endif
 2770 
 2771   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2772     __ nop(_count);
 2773   }
 2774 
 2775   uint MachNopNode::size(PhaseRegAlloc*) const {
 2776     return _count;
 2777   }
 2778 
 2779 #ifndef PRODUCT
 2780   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2781     st->print("# breakpoint");
 2782   }
 2783 #endif
 2784 
 2785   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2786     __ int3();
 2787   }
 2788 
 2789   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2790     return MachNode::size(ra_);
 2791   }
 2792 
 2793 %}
 2794 
 2795 encode %{
 2796 
 2797   enc_class call_epilog %{
 2798     if (VerifyStackAtCalls) {
 2799       // Check that stack depth is unchanged: find majik cookie on stack
 2800       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2801       Label L;
 2802       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2803       __ jccb(Assembler::equal, L);
 2804       // Die if stack mismatch
 2805       __ int3();
 2806       __ bind(L);
 2807     }
 2808     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2809       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
      // Search for the corresponding projection, get the register and emit code that initializes it.
 2811       uint con = (tf()->range_cc()->cnt() - 1);
 2812       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2813         ProjNode* proj = fast_out(i)->as_Proj();
 2814         if (proj->_con == con) {
 2815           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2816           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2817           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2818           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2819           __ testq(rax, rax);
 2820           __ setb(Assembler::notZero, toReg);
 2821           __ movzbl(toReg, toReg);
 2822           if (reg->is_stack()) {
 2823             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2824             __ movq(Address(rsp, st_off), toReg);
 2825           }
 2826           break;
 2827         }
 2828       }
 2829       if (return_value_is_used()) {
 2830         // An inline type is returned as fields in multiple registers.
        // Rax contains either an oop, if the inline type is buffered, or a pointer
        // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
        // if the lowest bit is set so that C2 can use the oop after null checking.
 2834         // rax &= (rax & 1) - 1
 2835         __ movptr(rscratch1, rax);
 2836         __ andptr(rscratch1, 0x1);
 2837         __ subptr(rscratch1, 0x1);
 2838         __ andptr(rax, rscratch1);
 2839       }
 2840     }
 2841   %}
 2842 
 2843 %}
 2844 
 2845 // Operands for bound floating pointer register arguments
 2846 operand rxmm0() %{
 2847   constraint(ALLOC_IN_RC(xmm0_reg));
 2848   match(VecX);
  format %{ %}
 2850   interface(REG_INTER);
 2851 %}
 2852 
 2853 //----------OPERANDS-----------------------------------------------------------
// Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
// instruction definitions.
 2857 
 2858 // Vectors
 2859 
 2860 // Dummy generic vector class. Should be used for all vector operands.
 2861 // Replaced with vec[SDXYZ] during post-selection pass.
 2862 operand vec() %{
 2863   constraint(ALLOC_IN_RC(dynamic));
 2864   match(VecX);
 2865   match(VecY);
 2866   match(VecZ);
 2867   match(VecS);
 2868   match(VecD);
 2869 
 2870   format %{ %}
 2871   interface(REG_INTER);
 2872 %}
 2873 
 2874 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2875 // Replaced with legVec[SDXYZ] during post-selection cleanup.
// Note: the legacy register class is used to avoid the extra runtime code
// generation via reg_class_dynamic (which is unneeded in the 32-bit VM).
 2878 operand legVec() %{
 2879   constraint(ALLOC_IN_RC(dynamic));
 2880   match(VecX);
 2881   match(VecY);
 2882   match(VecZ);
 2883   match(VecS);
 2884   match(VecD);
 2885 
 2886   format %{ %}
 2887   interface(REG_INTER);
 2888 %}
 2889 
 2890 // Replaces vec during post-selection cleanup. See above.
 2891 operand vecS() %{
 2892   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2893   match(VecS);
 2894 
 2895   format %{ %}
 2896   interface(REG_INTER);
 2897 %}
 2898 
 2899 // Replaces legVec during post-selection cleanup. See above.
 2900 operand legVecS() %{
 2901   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2902   match(VecS);
 2903 
 2904   format %{ %}
 2905   interface(REG_INTER);
 2906 %}
 2907 
 2908 // Replaces vec during post-selection cleanup. See above.
 2909 operand vecD() %{
 2910   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2911   match(VecD);
 2912 
 2913   format %{ %}
 2914   interface(REG_INTER);
 2915 %}
 2916 
 2917 // Replaces legVec during post-selection cleanup. See above.
 2918 operand legVecD() %{
 2919   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2920   match(VecD);
 2921 
 2922   format %{ %}
 2923   interface(REG_INTER);
 2924 %}
 2925 
 2926 // Replaces vec during post-selection cleanup. See above.
 2927 operand vecX() %{
 2928   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2929   match(VecX);
 2930 
 2931   format %{ %}
 2932   interface(REG_INTER);
 2933 %}
 2934 
 2935 // Replaces legVec during post-selection cleanup. See above.
 2936 operand legVecX() %{
 2937   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2938   match(VecX);
 2939 
 2940   format %{ %}
 2941   interface(REG_INTER);
 2942 %}
 2943 
 2944 // Replaces vec during post-selection cleanup. See above.
 2945 operand vecY() %{
 2946   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2947   match(VecY);
 2948 
 2949   format %{ %}
 2950   interface(REG_INTER);
 2951 %}
 2952 
 2953 // Replaces legVec during post-selection cleanup. See above.
 2954 operand legVecY() %{
 2955   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2956   match(VecY);
 2957 
 2958   format %{ %}
 2959   interface(REG_INTER);
 2960 %}
 2961 
 2962 // Replaces vec during post-selection cleanup. See above.
 2963 operand vecZ() %{
 2964   constraint(ALLOC_IN_RC(vectorz_reg));
 2965   match(VecZ);
 2966 
 2967   format %{ %}
 2968   interface(REG_INTER);
 2969 %}
 2970 
 2971 // Replaces legVec during post-selection cleanup. See above.
 2972 operand legVecZ() %{
 2973   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2974   match(VecZ);
 2975 
 2976   format %{ %}
 2977   interface(REG_INTER);
 2978 %}
 2979 
 2980 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2981 
 2982 // ============================================================================
 2983 
 2984 instruct ShouldNotReachHere() %{
 2985   match(Halt);
 2986   format %{ "stop\t# ShouldNotReachHere" %}
 2987   ins_encode %{
 2988     if (is_reachable()) {
 2989       __ stop(_halt_reason);
 2990     }
 2991   %}
 2992   ins_pipe(pipe_slow);
 2993 %}
 2994 
 2995 // ============================================================================
 2996 
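// Scalar float/double arithmetic. The SSE rules below are two-operand
// (dst op= src), while the AVX rules use the three-operand VEX encodings so
// dst does not have to alias an input.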
 2997 instruct addF_reg(regF dst, regF src) %{
 2998   predicate((UseSSE>=1) && (UseAVX == 0));
 2999   match(Set dst (AddF dst src));
 3000 
 3001   format %{ "addss   $dst, $src" %}
 3002   ins_cost(150);
 3003   ins_encode %{
 3004     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3005   %}
 3006   ins_pipe(pipe_slow);
 3007 %}
 3008 
 3009 instruct addF_mem(regF dst, memory src) %{
 3010   predicate((UseSSE>=1) && (UseAVX == 0));
 3011   match(Set dst (AddF dst (LoadF src)));
 3012 
 3013   format %{ "addss   $dst, $src" %}
 3014   ins_cost(150);
 3015   ins_encode %{
 3016     __ addss($dst$$XMMRegister, $src$$Address);
 3017   %}
 3018   ins_pipe(pipe_slow);
 3019 %}
 3020 
 3021 instruct addF_imm(regF dst, immF con) %{
 3022   predicate((UseSSE>=1) && (UseAVX == 0));
 3023   match(Set dst (AddF dst con));
 3024   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ addss($dst$$XMMRegister, $constantaddress($con));
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3033   predicate(UseAVX > 0);
 3034   match(Set dst (AddF src1 src2));
 3035 
 3036   format %{ "vaddss  $dst, $src1, $src2" %}
 3037   ins_cost(150);
 3038   ins_encode %{
 3039     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3040   %}
 3041   ins_pipe(pipe_slow);
 3042 %}
 3043 
 3044 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3045   predicate(UseAVX > 0);
 3046   match(Set dst (AddF src1 (LoadF src2)));
 3047 
 3048   format %{ "vaddss  $dst, $src1, $src2" %}
 3049   ins_cost(150);
 3050   ins_encode %{
 3051     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3052   %}
 3053   ins_pipe(pipe_slow);
 3054 %}
 3055 
 3056 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3057   predicate(UseAVX > 0);
 3058   match(Set dst (AddF src con));
 3059 
 3060   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3061   ins_cost(150);
 3062   ins_encode %{
 3063     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3064   %}
 3065   ins_pipe(pipe_slow);
 3066 %}
 3067 
 3068 instruct addD_reg(regD dst, regD src) %{
 3069   predicate((UseSSE>=2) && (UseAVX == 0));
 3070   match(Set dst (AddD dst src));
 3071 
 3072   format %{ "addsd   $dst, $src" %}
 3073   ins_cost(150);
 3074   ins_encode %{
 3075     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3076   %}
 3077   ins_pipe(pipe_slow);
 3078 %}
 3079 
 3080 instruct addD_mem(regD dst, memory src) %{
 3081   predicate((UseSSE>=2) && (UseAVX == 0));
 3082   match(Set dst (AddD dst (LoadD src)));
 3083 
 3084   format %{ "addsd   $dst, $src" %}
 3085   ins_cost(150);
 3086   ins_encode %{
 3087     __ addsd($dst$$XMMRegister, $src$$Address);
 3088   %}
 3089   ins_pipe(pipe_slow);
 3090 %}
 3091 
 3092 instruct addD_imm(regD dst, immD con) %{
 3093   predicate((UseSSE>=2) && (UseAVX == 0));
 3094   match(Set dst (AddD dst con));
 3095   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3104   predicate(UseAVX > 0);
 3105   match(Set dst (AddD src1 src2));
 3106 
 3107   format %{ "vaddsd  $dst, $src1, $src2" %}
 3108   ins_cost(150);
 3109   ins_encode %{
 3110     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3111   %}
 3112   ins_pipe(pipe_slow);
 3113 %}
 3114 
 3115 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3116   predicate(UseAVX > 0);
 3117   match(Set dst (AddD src1 (LoadD src2)));
 3118 
 3119   format %{ "vaddsd  $dst, $src1, $src2" %}
 3120   ins_cost(150);
 3121   ins_encode %{
 3122     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3123   %}
 3124   ins_pipe(pipe_slow);
 3125 %}
 3126 
 3127 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3128   predicate(UseAVX > 0);
 3129   match(Set dst (AddD src con));
 3130 
 3131   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3132   ins_cost(150);
 3133   ins_encode %{
 3134     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3135   %}
 3136   ins_pipe(pipe_slow);
 3137 %}
 3138 
 3139 instruct subF_reg(regF dst, regF src) %{
 3140   predicate((UseSSE>=1) && (UseAVX == 0));
 3141   match(Set dst (SubF dst src));
 3142 
 3143   format %{ "subss   $dst, $src" %}
 3144   ins_cost(150);
 3145   ins_encode %{
 3146     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3147   %}
 3148   ins_pipe(pipe_slow);
 3149 %}
 3150 
 3151 instruct subF_mem(regF dst, memory src) %{
 3152   predicate((UseSSE>=1) && (UseAVX == 0));
 3153   match(Set dst (SubF dst (LoadF src)));
 3154 
 3155   format %{ "subss   $dst, $src" %}
 3156   ins_cost(150);
 3157   ins_encode %{
 3158     __ subss($dst$$XMMRegister, $src$$Address);
 3159   %}
 3160   ins_pipe(pipe_slow);
 3161 %}
 3162 
 3163 instruct subF_imm(regF dst, immF con) %{
 3164   predicate((UseSSE>=1) && (UseAVX == 0));
 3165   match(Set dst (SubF dst con));
 3166   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ subss($dst$$XMMRegister, $constantaddress($con));
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3175   predicate(UseAVX > 0);
 3176   match(Set dst (SubF src1 src2));
 3177 
 3178   format %{ "vsubss  $dst, $src1, $src2" %}
 3179   ins_cost(150);
 3180   ins_encode %{
 3181     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3182   %}
 3183   ins_pipe(pipe_slow);
 3184 %}
 3185 
 3186 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3187   predicate(UseAVX > 0);
 3188   match(Set dst (SubF src1 (LoadF src2)));
 3189 
 3190   format %{ "vsubss  $dst, $src1, $src2" %}
 3191   ins_cost(150);
 3192   ins_encode %{
 3193     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3194   %}
 3195   ins_pipe(pipe_slow);
 3196 %}
 3197 
 3198 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3199   predicate(UseAVX > 0);
 3200   match(Set dst (SubF src con));
 3201 
 3202   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3203   ins_cost(150);
 3204   ins_encode %{
 3205     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3206   %}
 3207   ins_pipe(pipe_slow);
 3208 %}
 3209 
 3210 instruct subD_reg(regD dst, regD src) %{
 3211   predicate((UseSSE>=2) && (UseAVX == 0));
 3212   match(Set dst (SubD dst src));
 3213 
 3214   format %{ "subsd   $dst, $src" %}
 3215   ins_cost(150);
 3216   ins_encode %{
 3217     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3218   %}
 3219   ins_pipe(pipe_slow);
 3220 %}
 3221 
 3222 instruct subD_mem(regD dst, memory src) %{
 3223   predicate((UseSSE>=2) && (UseAVX == 0));
 3224   match(Set dst (SubD dst (LoadD src)));
 3225 
 3226   format %{ "subsd   $dst, $src" %}
 3227   ins_cost(150);
 3228   ins_encode %{
 3229     __ subsd($dst$$XMMRegister, $src$$Address);
 3230   %}
 3231   ins_pipe(pipe_slow);
 3232 %}
 3233 
 3234 instruct subD_imm(regD dst, immD con) %{
 3235   predicate((UseSSE>=2) && (UseAVX == 0));
 3236   match(Set dst (SubD dst con));
 3237   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3246   predicate(UseAVX > 0);
 3247   match(Set dst (SubD src1 src2));
 3248 
 3249   format %{ "vsubsd  $dst, $src1, $src2" %}
 3250   ins_cost(150);
 3251   ins_encode %{
 3252     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3253   %}
 3254   ins_pipe(pipe_slow);
 3255 %}
 3256 
 3257 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3258   predicate(UseAVX > 0);
 3259   match(Set dst (SubD src1 (LoadD src2)));
 3260 
 3261   format %{ "vsubsd  $dst, $src1, $src2" %}
 3262   ins_cost(150);
 3263   ins_encode %{
 3264     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3265   %}
 3266   ins_pipe(pipe_slow);
 3267 %}
 3268 
 3269 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3270   predicate(UseAVX > 0);
 3271   match(Set dst (SubD src con));
 3272 
 3273   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3274   ins_cost(150);
 3275   ins_encode %{
 3276     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3277   %}
 3278   ins_pipe(pipe_slow);
 3279 %}
 3280 
 3281 instruct mulF_reg(regF dst, regF src) %{
 3282   predicate((UseSSE>=1) && (UseAVX == 0));
 3283   match(Set dst (MulF dst src));
 3284 
 3285   format %{ "mulss   $dst, $src" %}
 3286   ins_cost(150);
 3287   ins_encode %{
 3288     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3289   %}
 3290   ins_pipe(pipe_slow);
 3291 %}
 3292 
 3293 instruct mulF_mem(regF dst, memory src) %{
 3294   predicate((UseSSE>=1) && (UseAVX == 0));
 3295   match(Set dst (MulF dst (LoadF src)));
 3296 
 3297   format %{ "mulss   $dst, $src" %}
 3298   ins_cost(150);
 3299   ins_encode %{
 3300     __ mulss($dst$$XMMRegister, $src$$Address);
 3301   %}
 3302   ins_pipe(pipe_slow);
 3303 %}
 3304 
 3305 instruct mulF_imm(regF dst, immF con) %{
 3306   predicate((UseSSE>=1) && (UseAVX == 0));
 3307   match(Set dst (MulF dst con));
 3308   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3317   predicate(UseAVX > 0);
 3318   match(Set dst (MulF src1 src2));
 3319 
 3320   format %{ "vmulss  $dst, $src1, $src2" %}
 3321   ins_cost(150);
 3322   ins_encode %{
 3323     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3324   %}
 3325   ins_pipe(pipe_slow);
 3326 %}
 3327 
 3328 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3329   predicate(UseAVX > 0);
 3330   match(Set dst (MulF src1 (LoadF src2)));
 3331 
 3332   format %{ "vmulss  $dst, $src1, $src2" %}
 3333   ins_cost(150);
 3334   ins_encode %{
 3335     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3336   %}
 3337   ins_pipe(pipe_slow);
 3338 %}
 3339 
 3340 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3341   predicate(UseAVX > 0);
 3342   match(Set dst (MulF src con));
 3343 
 3344   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3345   ins_cost(150);
 3346   ins_encode %{
 3347     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3348   %}
 3349   ins_pipe(pipe_slow);
 3350 %}
 3351 
 3352 instruct mulD_reg(regD dst, regD src) %{
 3353   predicate((UseSSE>=2) && (UseAVX == 0));
 3354   match(Set dst (MulD dst src));
 3355 
 3356   format %{ "mulsd   $dst, $src" %}
 3357   ins_cost(150);
 3358   ins_encode %{
 3359     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3360   %}
 3361   ins_pipe(pipe_slow);
 3362 %}
 3363 
 3364 instruct mulD_mem(regD dst, memory src) %{
 3365   predicate((UseSSE>=2) && (UseAVX == 0));
 3366   match(Set dst (MulD dst (LoadD src)));
 3367 
 3368   format %{ "mulsd   $dst, $src" %}
 3369   ins_cost(150);
 3370   ins_encode %{
 3371     __ mulsd($dst$$XMMRegister, $src$$Address);
 3372   %}
 3373   ins_pipe(pipe_slow);
 3374 %}
 3375 
 3376 instruct mulD_imm(regD dst, immD con) %{
 3377   predicate((UseSSE>=2) && (UseAVX == 0));
 3378   match(Set dst (MulD dst con));
 3379   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3388   predicate(UseAVX > 0);
 3389   match(Set dst (MulD src1 src2));
 3390 
 3391   format %{ "vmulsd  $dst, $src1, $src2" %}
 3392   ins_cost(150);
 3393   ins_encode %{
 3394     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3395   %}
 3396   ins_pipe(pipe_slow);
 3397 %}
 3398 
 3399 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3400   predicate(UseAVX > 0);
 3401   match(Set dst (MulD src1 (LoadD src2)));
 3402 
 3403   format %{ "vmulsd  $dst, $src1, $src2" %}
 3404   ins_cost(150);
 3405   ins_encode %{
 3406     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3407   %}
 3408   ins_pipe(pipe_slow);
 3409 %}
 3410 
 3411 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3412   predicate(UseAVX > 0);
 3413   match(Set dst (MulD src con));
 3414 
 3415   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3416   ins_cost(150);
 3417   ins_encode %{
 3418     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3419   %}
 3420   ins_pipe(pipe_slow);
 3421 %}
 3422 
 3423 instruct divF_reg(regF dst, regF src) %{
 3424   predicate((UseSSE>=1) && (UseAVX == 0));
 3425   match(Set dst (DivF dst src));
 3426 
 3427   format %{ "divss   $dst, $src" %}
 3428   ins_cost(150);
 3429   ins_encode %{
 3430     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3431   %}
 3432   ins_pipe(pipe_slow);
 3433 %}
 3434 
 3435 instruct divF_mem(regF dst, memory src) %{
 3436   predicate((UseSSE>=1) && (UseAVX == 0));
 3437   match(Set dst (DivF dst (LoadF src)));
 3438 
 3439   format %{ "divss   $dst, $src" %}
 3440   ins_cost(150);
 3441   ins_encode %{
 3442     __ divss($dst$$XMMRegister, $src$$Address);
 3443   %}
 3444   ins_pipe(pipe_slow);
 3445 %}
 3446 
 3447 instruct divF_imm(regF dst, immF con) %{
 3448   predicate((UseSSE>=1) && (UseAVX == 0));
 3449   match(Set dst (DivF dst con));
 3450   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3451   ins_cost(150);
 3452   ins_encode %{
 3453     __ divss($dst$$XMMRegister, $constantaddress($con));
 3454   %}
 3455   ins_pipe(pipe_slow);
 3456 %}
 3457 
 3458 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3459   predicate(UseAVX > 0);
 3460   match(Set dst (DivF src1 src2));
 3461 
 3462   format %{ "vdivss  $dst, $src1, $src2" %}
 3463   ins_cost(150);
 3464   ins_encode %{
 3465     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3466   %}
 3467   ins_pipe(pipe_slow);
 3468 %}
 3469 
 3470 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3471   predicate(UseAVX > 0);
 3472   match(Set dst (DivF src1 (LoadF src2)));
 3473 
 3474   format %{ "vdivss  $dst, $src1, $src2" %}
 3475   ins_cost(150);
 3476   ins_encode %{
 3477     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3478   %}
 3479   ins_pipe(pipe_slow);
 3480 %}
 3481 
 3482 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3483   predicate(UseAVX > 0);
 3484   match(Set dst (DivF src con));
 3485 
 3486   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3487   ins_cost(150);
 3488   ins_encode %{
 3489     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3490   %}
 3491   ins_pipe(pipe_slow);
 3492 %}
 3493 
 3494 instruct divD_reg(regD dst, regD src) %{
 3495   predicate((UseSSE>=2) && (UseAVX == 0));
 3496   match(Set dst (DivD dst src));
 3497 
 3498   format %{ "divsd   $dst, $src" %}
 3499   ins_cost(150);
 3500   ins_encode %{
 3501     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3502   %}
 3503   ins_pipe(pipe_slow);
 3504 %}
 3505 
 3506 instruct divD_mem(regD dst, memory src) %{
 3507   predicate((UseSSE>=2) && (UseAVX == 0));
 3508   match(Set dst (DivD dst (LoadD src)));
 3509 
 3510   format %{ "divsd   $dst, $src" %}
 3511   ins_cost(150);
 3512   ins_encode %{
 3513     __ divsd($dst$$XMMRegister, $src$$Address);
 3514   %}
 3515   ins_pipe(pipe_slow);
 3516 %}
 3517 
 3518 instruct divD_imm(regD dst, immD con) %{
 3519   predicate((UseSSE>=2) && (UseAVX == 0));
 3520   match(Set dst (DivD dst con));
 3521   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3522   ins_cost(150);
 3523   ins_encode %{
 3524     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3525   %}
 3526   ins_pipe(pipe_slow);
 3527 %}
 3528 
 3529 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3530   predicate(UseAVX > 0);
 3531   match(Set dst (DivD src1 src2));
 3532 
 3533   format %{ "vdivsd  $dst, $src1, $src2" %}
 3534   ins_cost(150);
 3535   ins_encode %{
 3536     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3537   %}
 3538   ins_pipe(pipe_slow);
 3539 %}
 3540 
 3541 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3542   predicate(UseAVX > 0);
 3543   match(Set dst (DivD src1 (LoadD src2)));
 3544 
 3545   format %{ "vdivsd  $dst, $src1, $src2" %}
 3546   ins_cost(150);
 3547   ins_encode %{
 3548     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3549   %}
 3550   ins_pipe(pipe_slow);
 3551 %}
 3552 
 3553 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3554   predicate(UseAVX > 0);
 3555   match(Set dst (DivD src con));
 3556 
 3557   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3558   ins_cost(150);
 3559   ins_encode %{
 3560     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3561   %}
 3562   ins_pipe(pipe_slow);
 3563 %}
 3564 
 3565 instruct absF_reg(regF dst) %{
 3566   predicate((UseSSE>=1) && (UseAVX == 0));
 3567   match(Set dst (AbsF dst));
 3568   ins_cost(150);
 3569   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3570   ins_encode %{
 3571     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3572   %}
 3573   ins_pipe(pipe_slow);
 3574 %}
 3575 
 3576 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3577   predicate(UseAVX > 0);
 3578   match(Set dst (AbsF src));
 3579   ins_cost(150);
 3580   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3581   ins_encode %{
 3582     int vlen_enc = Assembler::AVX_128bit;
 3583     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3584               ExternalAddress(float_signmask()), vlen_enc);
 3585   %}
 3586   ins_pipe(pipe_slow);
 3587 %}
 3588 
 3589 instruct absD_reg(regD dst) %{
 3590   predicate((UseSSE>=2) && (UseAVX == 0));
 3591   match(Set dst (AbsD dst));
 3592   ins_cost(150);
 3593   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3594             "# abs double by sign masking" %}
 3595   ins_encode %{
 3596     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3597   %}
 3598   ins_pipe(pipe_slow);
 3599 %}
 3600 
 3601 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3602   predicate(UseAVX > 0);
 3603   match(Set dst (AbsD src));
 3604   ins_cost(150);
 3605   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3606             "# abs double by sign masking" %}
 3607   ins_encode %{
 3608     int vlen_enc = Assembler::AVX_128bit;
 3609     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3610               ExternalAddress(double_signmask()), vlen_enc);
 3611   %}
 3612   ins_pipe(pipe_slow);
 3613 %}
 3614 
 3615 instruct negF_reg(regF dst) %{
 3616   predicate((UseSSE>=1) && (UseAVX == 0));
 3617   match(Set dst (NegF dst));
 3618   ins_cost(150);
 3619   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3620   ins_encode %{
 3621     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3622   %}
 3623   ins_pipe(pipe_slow);
 3624 %}
 3625 
 3626 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3627   predicate(UseAVX > 0);
 3628   match(Set dst (NegF src));
 3629   ins_cost(150);
 3630   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3631   ins_encode %{
 3632     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3633                  ExternalAddress(float_signflip()));
 3634   %}
 3635   ins_pipe(pipe_slow);
 3636 %}
 3637 
 3638 instruct negD_reg(regD dst) %{
 3639   predicate((UseSSE>=2) && (UseAVX == 0));
 3640   match(Set dst (NegD dst));
 3641   ins_cost(150);
 3642   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3643             "# neg double by sign flipping" %}
 3644   ins_encode %{
 3645     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3646   %}
 3647   ins_pipe(pipe_slow);
 3648 %}
 3649 
 3650 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3651   predicate(UseAVX > 0);
 3652   match(Set dst (NegD src));
 3653   ins_cost(150);
 3654   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3655             "# neg double by sign flipping" %}
 3656   ins_encode %{
 3657     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3658                  ExternalAddress(double_signflip()));
 3659   %}
 3660   ins_pipe(pipe_slow);
 3661 %}
 3662 
// The sqrtss instruction needs its destination register to be pre-initialized
// for best performance, therefore only the instruct rule where the input is
// pre-loaded into the dst register is defined below.
 3665 instruct sqrtF_reg(regF dst) %{
 3666   predicate(UseSSE>=1);
 3667   match(Set dst (SqrtF dst));
 3668   format %{ "sqrtss  $dst, $dst" %}
 3669   ins_encode %{
 3670     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3671   %}
 3672   ins_pipe(pipe_slow);
 3673 %}
 3674 
// The sqrtsd instruction needs its destination register to be pre-initialized
// for best performance, therefore only the instruct rule where the input is
// pre-loaded into the dst register is defined below.
 3677 instruct sqrtD_reg(regD dst) %{
 3678   predicate(UseSSE>=2);
 3679   match(Set dst (SqrtD dst));
 3680   format %{ "sqrtsd  $dst, $dst" %}
 3681   ins_encode %{
 3682     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3683   %}
 3684   ins_pipe(pipe_slow);
 3685 %}
 3686 
 3687 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3688   effect(TEMP tmp);
 3689   match(Set dst (ConvF2HF src));
 3690   ins_cost(125);
 3691   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3692   ins_encode %{
 3693     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3694   %}
 3695   ins_pipe( pipe_slow );
 3696 %}
 3697 
 3698 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3699   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3700   effect(TEMP ktmp, TEMP rtmp);
 3701   match(Set mem (StoreC mem (ConvF2HF src)));
 3702   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3703   ins_encode %{
 3704     __ movl($rtmp$$Register, 0x1);
 3705     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3706     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3707   %}
 3708   ins_pipe( pipe_slow );
 3709 %}
 3710 
 3711 instruct vconvF2HF(vec dst, vec src) %{
 3712   match(Set dst (VectorCastF2HF src));
 3713   format %{ "vector_conv_F2HF $dst $src" %}
 3714   ins_encode %{
 3715     int vlen_enc = vector_length_encoding(this, $src);
 3716     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3717   %}
 3718   ins_pipe( pipe_slow );
 3719 %}
 3720 
 3721 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3722   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3723   format %{ "vcvtps2ph $mem,$src" %}
 3724   ins_encode %{
 3725     int vlen_enc = vector_length_encoding(this, $src);
 3726     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3732   match(Set dst (ConvHF2F src));
 3733   format %{ "vcvtph2ps $dst,$src" %}
 3734   ins_encode %{
 3735     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3736   %}
 3737   ins_pipe( pipe_slow );
 3738 %}
 3739 
 3740 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3741   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3742   format %{ "vcvtph2ps $dst,$mem" %}
 3743   ins_encode %{
 3744     int vlen_enc = vector_length_encoding(this);
 3745     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3746   %}
 3747   ins_pipe( pipe_slow );
 3748 %}
 3749 
 3750 instruct vconvHF2F(vec dst, vec src) %{
 3751   match(Set dst (VectorCastHF2F src));
 3752   ins_cost(125);
 3753   format %{ "vector_conv_HF2F $dst,$src" %}
 3754   ins_encode %{
 3755     int vlen_enc = vector_length_encoding(this);
 3756     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 // ---------------------------------------- VectorReinterpret ------------------------------------
 3762 instruct reinterpret_mask(kReg dst) %{
 3763   predicate(n->bottom_type()->isa_vectmask() &&
 3764             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3765   match(Set dst (VectorReinterpret dst));
 3766   ins_cost(125);
 3767   format %{ "vector_reinterpret $dst\t!" %}
 3768   ins_encode %{
 3769     // empty
 3770   %}
 3771   ins_pipe( pipe_slow );
 3772 %}
 3773 
 3774 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3775   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3776             n->bottom_type()->isa_vectmask() &&
 3777             n->in(1)->bottom_type()->isa_vectmask() &&
 3778             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3779             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3780   match(Set dst (VectorReinterpret src));
 3781   effect(TEMP xtmp);
 3782   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3783   ins_encode %{
 3784      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3785      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
     int vlen_enc = vector_length_encoding(src_sz);
     __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3790   %}
 3791   ins_pipe( pipe_slow );
 3792 %}
 3793 
 3794 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3795   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3796             n->bottom_type()->isa_vectmask() &&
 3797             n->in(1)->bottom_type()->isa_vectmask() &&
 3798             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3799              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3800             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3801   match(Set dst (VectorReinterpret src));
 3802   effect(TEMP xtmp);
 3803   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3804   ins_encode %{
 3805      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3806      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
     int vlen_enc = vector_length_encoding(src_sz);
     __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3811   %}
 3812   ins_pipe( pipe_slow );
 3813 %}
 3814 
 3815 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3816   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3817             n->bottom_type()->isa_vectmask() &&
 3818             n->in(1)->bottom_type()->isa_vectmask() &&
 3819             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3820              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst masks cover the same number of bytes
 3822   match(Set dst (VectorReinterpret src));
 3823   effect(TEMP xtmp);
 3824   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3825   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_LONG);
    int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3832   %}
 3833   ins_pipe( pipe_slow );
 3834 %}
 3835 
 3836 instruct reinterpret(vec dst) %{
 3837   predicate(!n->bottom_type()->isa_vectmask() &&
 3838             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3839   match(Set dst (VectorReinterpret dst));
 3840   ins_cost(125);
 3841   format %{ "vector_reinterpret $dst\t!" %}
 3842   ins_encode %{
 3843     // empty
 3844   %}
 3845   ins_pipe( pipe_slow );
 3846 %}
 3847 
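// Reinterpreting to a wider vector must zero the bytes beyond the original
// source size. Without AVX this is done by AND-ing the source with a constant
// mask that keeps only its low 4 or 8 bytes.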
 3848 instruct reinterpret_expand(vec dst, vec src) %{
 3849   predicate(UseAVX == 0 &&
 3850             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3851   match(Set dst (VectorReinterpret src));
 3852   ins_cost(125);
 3853   effect(TEMP dst);
 3854   format %{ "vector_reinterpret_expand $dst,$src" %}
 3855   ins_encode %{
 3856     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3857     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3858 
 3859     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3860     if (src_vlen_in_bytes == 4) {
 3861       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3862     } else {
 3863       assert(src_vlen_in_bytes == 8, "");
 3864       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3865     }
 3866     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3867   %}
 3868   ins_pipe( pipe_slow );
 3869 %}
 3870 
 3871 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3872   predicate(UseAVX > 0 &&
 3873             !n->bottom_type()->isa_vectmask() &&
 3874             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3875             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3876   match(Set dst (VectorReinterpret src));
 3877   ins_cost(125);
 3878   format %{ "vector_reinterpret_expand $dst,$src" %}
 3879   ins_encode %{
 3880     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3881   %}
 3882   ins_pipe( pipe_slow );
 3883 %}
 3884 
 3886 instruct vreinterpret_expand(legVec dst, vec src) %{
 3887   predicate(UseAVX > 0 &&
 3888             !n->bottom_type()->isa_vectmask() &&
 3889             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3890             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3891   match(Set dst (VectorReinterpret src));
 3892   ins_cost(125);
 3893   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3894   ins_encode %{
 3895     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3896       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3897       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3898       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3899       default: ShouldNotReachHere();
 3900     }
 3901   %}
 3902   ins_pipe( pipe_slow );
 3903 %}
 3904 
 3905 instruct reinterpret_shrink(vec dst, legVec src) %{
 3906   predicate(!n->bottom_type()->isa_vectmask() &&
 3907             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3908   match(Set dst (VectorReinterpret src));
 3909   ins_cost(125);
 3910   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3911   ins_encode %{
 3912     switch (Matcher::vector_length_in_bytes(this)) {
 3913       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3914       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3915       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3916       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3917       default: ShouldNotReachHere();
 3918     }
 3919   %}
 3920   ins_pipe( pipe_slow );
 3921 %}
 3922 
 3923 // ----------------------------------------------------------------------------------------------------
 3924 
 3925 #ifdef _LP64
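// The rmode immediate is passed straight through as the SSE4.1/AVX rounding
// immediate, whose low two bits select the mode: 00 = to nearest even,
// 01 = toward -infinity, 10 = toward +infinity, 11 = toward zero.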
 3926 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3927   match(Set dst (RoundDoubleMode src rmode));
  format %{ "roundsd $dst,$src,$rmode" %}
 3929   ins_cost(150);
 3930   ins_encode %{
 3931     assert(UseSSE >= 4, "required");
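    // roundsd writes only the low 64 bits of dst and preserves the rest,
    // creating a false dependency on dst's previous contents; clear dst
    // first when it is a different register than src.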
 3932     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3933       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3934     }
 3935     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3936   %}
 3937   ins_pipe(pipe_slow);
 3938 %}
 3939 
 3940 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3941   match(Set dst (RoundDoubleMode con rmode));
 3942   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3943   ins_cost(150);
 3944   ins_encode %{
 3945     assert(UseSSE >= 4, "required");
 3946     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3947   %}
 3948   ins_pipe(pipe_slow);
 3949 %}
 3950 
 3951 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3952   predicate(Matcher::vector_length(n) < 8);
 3953   match(Set dst (RoundDoubleModeV src rmode));
 3954   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3955   ins_encode %{
 3956     assert(UseAVX > 0, "required");
 3957     int vlen_enc = vector_length_encoding(this);
 3958     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3959   %}
 3960   ins_pipe( pipe_slow );
 3961 %}
 3962 
 3963 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3964   predicate(Matcher::vector_length(n) == 8);
 3965   match(Set dst (RoundDoubleModeV src rmode));
 3966   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3967   ins_encode %{
 3968     assert(UseAVX > 2, "required");
 3969     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3970   %}
 3971   ins_pipe( pipe_slow );
 3972 %}
 3973 
 3974 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3975   predicate(Matcher::vector_length(n) < 8);
 3976   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3977   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3978   ins_encode %{
 3979     assert(UseAVX > 0, "required");
 3980     int vlen_enc = vector_length_encoding(this);
 3981     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3982   %}
 3983   ins_pipe( pipe_slow );
 3984 %}
 3985 
 3986 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3987   predicate(Matcher::vector_length(n) == 8);
 3988   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3989   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3990   ins_encode %{
 3991     assert(UseAVX > 2, "required");
 3992     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3993   %}
 3994   ins_pipe( pipe_slow );
 3995 %}
 3996 #endif // _LP64
 3997 
 3998 instruct onspinwait() %{
 3999   match(OnSpinWait);
 4000   ins_cost(200);
 4001 
 4002   format %{
 4003     $$template
 4004     $$emit$$"pause\t! membar_onspinwait"
 4005   %}
 4006   ins_encode %{
 4007     __ pause();
 4008   %}
 4009   ins_pipe(pipe_slow);
 4010 %}
 4011 
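// FMA computes a * b + c with a single rounding at the end, which is why
// these patterns are guarded by UseFMA instead of being matched as mul+add.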
 4012 // a * b + c
 4013 instruct fmaD_reg(regD a, regD b, regD c) %{
 4014   match(Set c (FmaD  c (Binary a b)));
 4015   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4016   ins_cost(150);
 4017   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
 4019     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4020   %}
 4021   ins_pipe( pipe_slow );
 4022 %}
 4023 
 4024 // a * b + c
 4025 instruct fmaF_reg(regF a, regF b, regF c) %{
 4026   match(Set c (FmaF  c (Binary a b)));
 4027   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4028   ins_cost(150);
 4029   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
 4031     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4032   %}
 4033   ins_pipe( pipe_slow );
 4034 %}
 4035 
 4036 // ====================VECTOR INSTRUCTIONS=====================================
 4037 
 4038 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4039 instruct MoveVec2Leg(legVec dst, vec src) %{
 4040   match(Set dst src);
 4041   format %{ "" %}
 4042   ins_encode %{
 4043     ShouldNotReachHere();
 4044   %}
 4045   ins_pipe( fpu_reg_reg );
 4046 %}
 4047 
 4048 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4049   match(Set dst src);
 4050   format %{ "" %}
 4051   ins_encode %{
 4052     ShouldNotReachHere();
 4053   %}
 4054   ins_pipe( fpu_reg_reg );
 4055 %}
 4056 
 4057 // ============================================================================
 4058 
// Load vector: generic operand pattern.
 4060 instruct loadV(vec dst, memory mem) %{
 4061   match(Set dst (LoadVector mem));
 4062   ins_cost(125);
 4063   format %{ "load_vector $dst,$mem" %}
 4064   ins_encode %{
 4065     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4066   %}
 4067   ins_pipe( pipe_slow );
 4068 %}
 4069 
// Store vector: generic operand pattern.
 4071 instruct storeV(memory mem, vec src) %{
 4072   match(Set mem (StoreVector mem src));
 4073   ins_cost(145);
  format %{ "store_vector $mem,$src" %}
 4075   ins_encode %{
 4076     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4077       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4078       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4079       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4080       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4081       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4082       default: ShouldNotReachHere();
 4083     }
 4084   %}
 4085   ins_pipe( pipe_slow );
 4086 %}
 4087 
 4088 // ---------------------------------------- Gather ------------------------------------
 4089 
 4090 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4091 
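// A gather loads dst[i] = mem[idx[i]] for every unmasked lane; x86 gathers
// support only 32- and 64-bit elements. The AVX2 form (vgather) predicates on
// a vector mask, set to all-ones below for an unconditional gather, while the
// AVX-512 form (evgather) predicates on an opmask register.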
 4092 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4093   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4094             Matcher::vector_length_in_bytes(n) <= 32);
 4095   match(Set dst (LoadVectorGather mem idx));
 4096   effect(TEMP dst, TEMP tmp, TEMP mask);
 4097   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4098   ins_encode %{
 4099     int vlen_enc = vector_length_encoding(this);
 4100     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4101     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4102     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4103     __ lea($tmp$$Register, $mem$$Address);
 4104     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4105   %}
 4106   ins_pipe( pipe_slow );
 4107 %}
 4108 
 4110 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4111   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4112             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4113   match(Set dst (LoadVectorGather mem idx));
 4114   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4116   ins_encode %{
 4117     int vlen_enc = vector_length_encoding(this);
 4118     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4119     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4120     __ lea($tmp$$Register, $mem$$Address);
 4121     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4122   %}
 4123   ins_pipe( pipe_slow );
 4124 %}
 4125 
 4126 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4127   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4128             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4129   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4130   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4132   ins_encode %{
 4133     assert(UseAVX > 2, "sanity");
 4134     int vlen_enc = vector_length_encoding(this);
 4135     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4136     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is copied into a temporary register.
 4139     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4140     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4141     __ lea($tmp$$Register, $mem$$Address);
 4142     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4143   %}
 4144   ins_pipe( pipe_slow );
 4145 %}
 4146 
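// There is no subword (byte/short) gather instruction on x86, so the patterns
// below emulate one with scalar loads, packing the result 8 bytes at a time
// (vgather8b*) and assembling longer vectors piecewise (vgather_subword).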
 4147 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4148   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4149   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4150   effect(TEMP tmp, TEMP rtmp);
 4151   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4152   ins_encode %{
 4153     int vlen_enc = vector_length_encoding(this);
 4154     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4155     __ lea($tmp$$Register, $mem$$Address);
 4156     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4157   %}
 4158   ins_pipe( pipe_slow );
 4159 %}
 4160 
 4161 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4162                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4163   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4164   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4165   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4166   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4167   ins_encode %{
 4168     int vlen_enc = vector_length_encoding(this);
 4169     int vector_len = Matcher::vector_length(this);
 4170     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4171     __ lea($tmp$$Register, $mem$$Address);
 4172     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4173     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4174                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4175   %}
 4176   ins_pipe( pipe_slow );
 4177 %}
 4178 
 4179 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4180   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4181   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4182   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4183   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4184   ins_encode %{
 4185     int vlen_enc = vector_length_encoding(this);
 4186     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4187     __ lea($tmp$$Register, $mem$$Address);
 4188     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4189   %}
 4190   ins_pipe( pipe_slow );
 4191 %}
 4192 
 4194 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4195                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4196   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4197   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4198   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4199   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4200   ins_encode %{
 4201     int vlen_enc = vector_length_encoding(this);
 4202     int vector_len = Matcher::vector_length(this);
 4203     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4204     __ lea($tmp$$Register, $mem$$Address);
 4205     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4206     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4207                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4208   %}
 4209   ins_pipe( pipe_slow );
 4210 %}
 4211 
 4213 #ifdef _LP64
 4214 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4215   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4216   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4217   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4218   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4219   ins_encode %{
 4220     int vlen_enc = vector_length_encoding(this);
 4221     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4222     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4223     __ lea($tmp$$Register, $mem$$Address);
 4224     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4225     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4226   %}
 4227   ins_pipe( pipe_slow );
 4228 %}
 4229 
 4230 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4231                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4232   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4233   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4234   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4235   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4236   ins_encode %{
 4237     int vlen_enc = vector_length_encoding(this);
 4238     int vector_len = Matcher::vector_length(this);
 4239     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4240     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4241     __ lea($tmp$$Register, $mem$$Address);
 4242     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4243     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4244     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4245                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4246   %}
 4247   ins_pipe( pipe_slow );
 4248 %}
 4249 
 4250 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4251   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4252   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4253   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4254   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4255   ins_encode %{
 4256     int vlen_enc = vector_length_encoding(this);
 4257     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4258     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4259     __ lea($tmp$$Register, $mem$$Address);
 4260     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4261     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4262                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4263   %}
 4264   ins_pipe( pipe_slow );
 4265 %}
 4266 
 4267 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4268                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4269   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4270   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4271   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4272   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4273   ins_encode %{
 4274     int vlen_enc = vector_length_encoding(this);
 4275     int vector_len = Matcher::vector_length(this);
 4276     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4277     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4278     __ lea($tmp$$Register, $mem$$Address);
 4279     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4280     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4281     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4282                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4283   %}
 4284   ins_pipe( pipe_slow );
 4285 %}
 4286 
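// The AVX2 variants below carry the mask in a vector register. vpmovmskb
// extracts one mask bit per byte, so a short-lane mask yields two identical
// bits per lane; pext with the constant 0x55555555 keeps every other bit,
// compressing the result to one bit per short lane.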
 4287 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4288   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4289   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4290   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4291   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4292   ins_encode %{
 4293     int vlen_enc = vector_length_encoding(this);
 4294     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4295     __ lea($tmp$$Register, $mem$$Address);
 4296     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4297     if (elem_bt == T_SHORT) {
 4298       __ movl($mask_idx$$Register, 0x55555555);
 4299       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4300     }
 4301     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4302     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4303   %}
 4304   ins_pipe( pipe_slow );
 4305 %}
 4306 
 4307 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4308                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4309   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4310   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4311   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4312   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4313   ins_encode %{
 4314     int vlen_enc = vector_length_encoding(this);
 4315     int vector_len = Matcher::vector_length(this);
 4316     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4317     __ lea($tmp$$Register, $mem$$Address);
 4318     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4319     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4320     if (elem_bt == T_SHORT) {
 4321       __ movl($mask_idx$$Register, 0x55555555);
 4322       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4323     }
 4324     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4325     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4326                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4327   %}
 4328   ins_pipe( pipe_slow );
 4329 %}
 4330 
 4331 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4332   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4333   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4334   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4335   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4336   ins_encode %{
 4337     int vlen_enc = vector_length_encoding(this);
 4338     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4339     __ lea($tmp$$Register, $mem$$Address);
 4340     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4341     if (elem_bt == T_SHORT) {
 4342       __ movl($mask_idx$$Register, 0x55555555);
 4343       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4344     }
 4345     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4346     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4347                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4348   %}
 4349   ins_pipe( pipe_slow );
 4350 %}
 4351 
 4352 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4353                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4354   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4355   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4356   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4357   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4358   ins_encode %{
 4359     int vlen_enc = vector_length_encoding(this);
 4360     int vector_len = Matcher::vector_length(this);
 4361     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4363     __ lea($tmp$$Register, $mem$$Address);
 4364     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4365     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4366     if (elem_bt == T_SHORT) {
 4367       __ movl($mask_idx$$Register, 0x55555555);
 4368       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4369     }
 4370     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4371     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4372                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4373   %}
 4374   ins_pipe( pipe_slow );
 4375 %}
 4376 #endif
 4377 
 4378 // ====================Scatter=======================================
 4379 
 4380 // Scatter INT, LONG, FLOAT, DOUBLE
 4381 
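// A scatter stores src[i] to mem[idx[i]] for every unmasked lane. Only
// AVX-512 provides scatter instructions, hence the UseAVX > 2 predicate on
// the unmasked form, which simply installs an all-ones opmask.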
 4382 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4383   predicate(UseAVX > 2);
 4384   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4385   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4387   ins_encode %{
 4388     int vlen_enc = vector_length_encoding(this, $src);
 4389     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4390 
 4391     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4392     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4393 
 4394     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4395     __ lea($tmp$$Register, $mem$$Address);
 4396     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4397   %}
 4398   ins_pipe( pipe_slow );
 4399 %}
 4400 
 4401 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4402   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4403   effect(TEMP tmp, TEMP ktmp);
 4404   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4405   ins_encode %{
 4406     int vlen_enc = vector_length_encoding(this, $src);
 4407     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4408     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4409     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is copied into a temporary register.
 4412     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4413     __ lea($tmp$$Register, $mem$$Address);
 4414     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4415   %}
 4416   ins_pipe( pipe_slow );
 4417 %}
 4418 
 4419 // ====================REPLICATE=======================================
 4420 
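// Replicate broadcasts a single scalar (register, memory or immediate
// operand) into every lane of the destination vector. The patterns are split
// by element type and by the ISA level needed to broadcast in one instruction.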
 4421 // Replicate byte scalar to be vector
 4422 instruct vReplB_reg(vec dst, rRegI src) %{
 4423   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4424   match(Set dst (Replicate src));
 4425   format %{ "replicateB $dst,$src" %}
 4426   ins_encode %{
 4427     uint vlen = Matcher::vector_length(this);
 4428     if (UseAVX >= 2) {
 4429       int vlen_enc = vector_length_encoding(this);
      if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4431         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4432         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4433       } else {
 4434         __ movdl($dst$$XMMRegister, $src$$Register);
 4435         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4436       }
 4437     } else {
      assert(UseAVX < 2, "");
 4439       __ movdl($dst$$XMMRegister, $src$$Register);
 4440       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4441       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4442       if (vlen >= 16) {
 4443         assert(vlen == 16, "");
 4444         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4445       }
 4446     }
 4447   %}
 4448   ins_pipe( pipe_slow );
 4449 %}
 4450 
 4451 instruct ReplB_mem(vec dst, memory mem) %{
 4452   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4453   match(Set dst (Replicate (LoadB mem)));
 4454   format %{ "replicateB $dst,$mem" %}
 4455   ins_encode %{
 4456     int vlen_enc = vector_length_encoding(this);
 4457     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 // ====================ReplicateS=======================================
 4463 
 4464 instruct vReplS_reg(vec dst, rRegI src) %{
 4465   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4466   match(Set dst (Replicate src));
 4467   format %{ "replicateS $dst,$src" %}
 4468   ins_encode %{
 4469     uint vlen = Matcher::vector_length(this);
 4470     int vlen_enc = vector_length_encoding(this);
 4471     if (UseAVX >= 2) {
      if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4473         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4474         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4475       } else {
 4476         __ movdl($dst$$XMMRegister, $src$$Register);
 4477         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4478       }
 4479     } else {
 4480       assert(UseAVX < 2, "");
 4481       __ movdl($dst$$XMMRegister, $src$$Register);
 4482       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4483       if (vlen >= 8) {
 4484         assert(vlen == 8, "");
 4485         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4486       }
 4487     }
 4488   %}
 4489   ins_pipe( pipe_slow );
 4490 %}
 4491 
 4492 instruct ReplS_mem(vec dst, memory mem) %{
 4493   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4494   match(Set dst (Replicate (LoadS mem)));
 4495   format %{ "replicateS $dst,$mem" %}
 4496   ins_encode %{
 4497     int vlen_enc = vector_length_encoding(this);
 4498     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4499   %}
 4500   ins_pipe( pipe_slow );
 4501 %}
 4502 
 4503 // ====================ReplicateI=======================================
 4504 
 4505 instruct ReplI_reg(vec dst, rRegI src) %{
 4506   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4507   match(Set dst (Replicate src));
 4508   format %{ "replicateI $dst,$src" %}
 4509   ins_encode %{
 4510     uint vlen = Matcher::vector_length(this);
 4511     int vlen_enc = vector_length_encoding(this);
 4512     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4513       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4514     } else if (VM_Version::supports_avx2()) {
 4515       __ movdl($dst$$XMMRegister, $src$$Register);
 4516       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4517     } else {
 4518       __ movdl($dst$$XMMRegister, $src$$Register);
 4519       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4520     }
 4521   %}
 4522   ins_pipe( pipe_slow );
 4523 %}
 4524 
 4525 instruct ReplI_mem(vec dst, memory mem) %{
 4526   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4527   match(Set dst (Replicate (LoadI mem)));
 4528   format %{ "replicateI $dst,$mem" %}
 4529   ins_encode %{
 4530     int vlen_enc = vector_length_encoding(this);
 4531     if (VM_Version::supports_avx2()) {
 4532       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4533     } else if (VM_Version::supports_avx()) {
 4534       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4535     } else {
 4536       __ movdl($dst$$XMMRegister, $mem$$Address);
 4537       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4538     }
 4539   %}
 4540   ins_pipe( pipe_slow );
 4541 %}
 4542 
 4543 instruct ReplI_imm(vec dst, immI con) %{
 4544   predicate(Matcher::is_non_long_integral_vector(n));
 4545   match(Set dst (Replicate con));
 4546   format %{ "replicateI $dst,$con" %}
 4547   ins_encode %{
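    // The immediate is replicated into the constant table until it fills one
    // 32-bit word with AVX (which can broadcast from 4 bytes) or one 64-bit
    // word otherwise; load_constant_vector then loads or broadcasts it to the
    // full vector width.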
 4548     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4549         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4550             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4551                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4552     BasicType bt = Matcher::vector_element_basic_type(this);
 4553     int vlen = Matcher::vector_length_in_bytes(this);
 4554     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4555   %}
 4556   ins_pipe( pipe_slow );
 4557 %}
 4558 
 4559 // Replicate scalar zero to be vector
 4560 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4561   predicate(Matcher::is_non_long_integral_vector(n));
 4562   match(Set dst (Replicate zero));
 4563   format %{ "replicateI $dst,$zero" %}
 4564   ins_encode %{
 4565     int vlen_enc = vector_length_encoding(this);
 4566     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4567       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4568     } else {
 4569       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4570     }
 4571   %}
 4572   ins_pipe( fpu_reg_reg );
 4573 %}
 4574 
 4575 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4576   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4577   match(Set dst (Replicate con));
 4578   format %{ "vallones $dst" %}
 4579   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vlen_enc);
 4582   %}
 4583   ins_pipe( pipe_slow );
 4584 %}
 4585 
 4586 // ====================ReplicateL=======================================
 4587 
 4588 #ifdef _LP64
 4589 // Replicate long (8 byte) scalar to be vector
 4590 instruct ReplL_reg(vec dst, rRegL src) %{
 4591   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4592   match(Set dst (Replicate src));
 4593   format %{ "replicateL $dst,$src" %}
 4594   ins_encode %{
 4595     int vlen = Matcher::vector_length(this);
 4596     int vlen_enc = vector_length_encoding(this);
 4597     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4598       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4599     } else if (VM_Version::supports_avx2()) {
 4600       __ movdq($dst$$XMMRegister, $src$$Register);
 4601       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4602     } else {
 4603       __ movdq($dst$$XMMRegister, $src$$Register);
 4604       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4605     }
 4606   %}
 4607   ins_pipe( pipe_slow );
 4608 %}
 4609 #else // _LP64
 4610 // Replicate long (8 byte) scalar to be vector
 4611 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4612   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4613   match(Set dst (Replicate src));
 4614   effect(TEMP dst, USE src, TEMP tmp);
 4615   format %{ "replicateL $dst,$src" %}
 4616   ins_encode %{
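    // Without _LP64 the long value lives in a pair of 32-bit registers: move
    // the low and high halves into XMM registers separately and glue them
    // together with punpckldq before broadcasting.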
 4617     uint vlen = Matcher::vector_length(this);
 4618     if (vlen == 2) {
 4619       __ movdl($dst$$XMMRegister, $src$$Register);
 4620       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4621       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4622       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4623     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4624       int vlen_enc = Assembler::AVX_256bit;
 4625       __ movdl($dst$$XMMRegister, $src$$Register);
 4626       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4627       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4628       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4629     } else {
 4630       __ movdl($dst$$XMMRegister, $src$$Register);
 4631       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4632       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4633       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4634       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4635     }
 4636   %}
 4637   ins_pipe( pipe_slow );
 4638 %}
 4639 
 4640 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4641   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4642   match(Set dst (Replicate src));
 4643   effect(TEMP dst, USE src, TEMP tmp);
 4644   format %{ "replicateL $dst,$src" %}
 4645   ins_encode %{
 4646     if (VM_Version::supports_avx512vl()) {
 4647       __ movdl($dst$$XMMRegister, $src$$Register);
 4648       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4649       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4650       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4651       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4652       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4653     } else {
 4654       int vlen_enc = Assembler::AVX_512bit;
 4655       __ movdl($dst$$XMMRegister, $src$$Register);
 4656       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4657       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4658       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4659     }
 4660   %}
 4661   ins_pipe( pipe_slow );
 4662 %}
 4663 #endif // _LP64
 4664 
 4665 instruct ReplL_mem(vec dst, memory mem) %{
 4666   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4667   match(Set dst (Replicate (LoadL mem)));
 4668   format %{ "replicateL $dst,$mem" %}
 4669   ins_encode %{
 4670     int vlen_enc = vector_length_encoding(this);
 4671     if (VM_Version::supports_avx2()) {
 4672       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4673     } else if (VM_Version::supports_sse3()) {
 4674       __ movddup($dst$$XMMRegister, $mem$$Address);
 4675     } else {
 4676       __ movq($dst$$XMMRegister, $mem$$Address);
 4677       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4678     }
 4679   %}
 4680   ins_pipe( pipe_slow );
 4681 %}
 4682 
 4683 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4684 instruct ReplL_imm(vec dst, immL con) %{
 4685   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4686   match(Set dst (Replicate con));
 4687   format %{ "replicateL $dst,$con" %}
 4688   ins_encode %{
 4689     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4690     int vlen = Matcher::vector_length_in_bytes(this);
 4691     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4692   %}
 4693   ins_pipe( pipe_slow );
 4694 %}
 4695 
 4696 instruct ReplL_zero(vec dst, immL0 zero) %{
 4697   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4698   match(Set dst (Replicate zero));
 4699   format %{ "replicateL $dst,$zero" %}
 4700   ins_encode %{
 4701     int vlen_enc = vector_length_encoding(this);
 4702     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4703       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4704     } else {
 4705       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4706     }
 4707   %}
 4708   ins_pipe( fpu_reg_reg );
 4709 %}
 4710 
 4711 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4712   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4713   match(Set dst (Replicate con));
 4714   format %{ "vallones $dst" %}
 4715   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vlen_enc);
 4718   %}
 4719   ins_pipe( pipe_slow );
 4720 %}
 4721 
 4722 // ====================ReplicateF=======================================
 4723 
 4724 instruct vReplF_reg(vec dst, vlRegF src) %{
 4725   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4726   match(Set dst (Replicate src));
 4727   format %{ "replicateF $dst,$src" %}
 4728   ins_encode %{
 4729     uint vlen = Matcher::vector_length(this);
 4730     int vlen_enc = vector_length_encoding(this);
 4731     if (vlen <= 4) {
 4732       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4733     } else if (VM_Version::supports_avx2()) {
 4734       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4735     } else {
 4736       assert(vlen == 8, "sanity");
 4737       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4738       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4739     }
 4740   %}
 4741   ins_pipe( pipe_slow );
 4742 %}
 4743 
 4744 instruct ReplF_reg(vec dst, vlRegF src) %{
 4745   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4746   match(Set dst (Replicate src));
 4747   format %{ "replicateF $dst,$src" %}
 4748   ins_encode %{
 4749     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4750   %}
 4751   ins_pipe( pipe_slow );
 4752 %}
 4753 
 4754 instruct ReplF_mem(vec dst, memory mem) %{
 4755   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4756   match(Set dst (Replicate (LoadF mem)));
 4757   format %{ "replicateF $dst,$mem" %}
 4758   ins_encode %{
 4759     int vlen_enc = vector_length_encoding(this);
 4760     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4761   %}
 4762   ins_pipe( pipe_slow );
 4763 %}
 4764 
 4765 // Replicate float scalar immediate to be vector by loading from const table.
 4766 instruct ReplF_imm(vec dst, immF con) %{
 4767   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4768   match(Set dst (Replicate con));
 4769   format %{ "replicateF $dst,$con" %}
 4770   ins_encode %{
 4771     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4772         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4773     int vlen = Matcher::vector_length_in_bytes(this);
 4774     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4775   %}
 4776   ins_pipe( pipe_slow );
 4777 %}
 4778 
 4779 instruct ReplF_zero(vec dst, immF0 zero) %{
 4780   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4781   match(Set dst (Replicate zero));
 4782   format %{ "replicateF $dst,$zero" %}
 4783   ins_encode %{
 4784     int vlen_enc = vector_length_encoding(this);
 4785     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4786       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4787     } else {
 4788       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4789     }
 4790   %}
 4791   ins_pipe( fpu_reg_reg );
 4792 %}
 4793 
 4794 // ====================ReplicateD=======================================
 4795 
 4796 // Replicate double (8 bytes) scalar to be vector
 4797 instruct vReplD_reg(vec dst, vlRegD src) %{
 4798   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4799   match(Set dst (Replicate src));
 4800   format %{ "replicateD $dst,$src" %}
 4801   ins_encode %{
 4802     uint vlen = Matcher::vector_length(this);
 4803     int vlen_enc = vector_length_encoding(this);
 4804     if (vlen <= 2) {
 4805       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4806     } else if (VM_Version::supports_avx2()) {
 4807       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4808     } else {
 4809       assert(vlen == 4, "sanity");
 4810       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4811       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4812     }
 4813   %}
 4814   ins_pipe( pipe_slow );
 4815 %}
 4816 
 4817 instruct ReplD_reg(vec dst, vlRegD src) %{
 4818   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4819   match(Set dst (Replicate src));
 4820   format %{ "replicateD $dst,$src" %}
 4821   ins_encode %{
 4822     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4823   %}
 4824   ins_pipe( pipe_slow );
 4825 %}
 4826 
 4827 instruct ReplD_mem(vec dst, memory mem) %{
 4828   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4829   match(Set dst (Replicate (LoadD mem)));
 4830   format %{ "replicateD $dst,$mem" %}
 4831   ins_encode %{
 4832     if (Matcher::vector_length(this) >= 4) {
 4833       int vlen_enc = vector_length_encoding(this);
 4834       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4835     } else {
 4836       __ movddup($dst$$XMMRegister, $mem$$Address);
 4837     }
 4838   %}
 4839   ins_pipe( pipe_slow );
 4840 %}
 4841 
 4842 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4843 instruct ReplD_imm(vec dst, immD con) %{
 4844   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4845   match(Set dst (Replicate con));
 4846   format %{ "replicateD $dst,$con" %}
 4847   ins_encode %{
 4848     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4849     int vlen = Matcher::vector_length_in_bytes(this);
 4850     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4851   %}
 4852   ins_pipe( pipe_slow );
 4853 %}
 4854 
 4855 instruct ReplD_zero(vec dst, immD0 zero) %{
 4856   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4857   match(Set dst (Replicate zero));
 4858   format %{ "replicateD $dst,$zero" %}
 4859   ins_encode %{
 4860     int vlen_enc = vector_length_encoding(this);
 4861     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4862       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4863     } else {
 4864       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4865     }
 4866   %}
 4867   ins_pipe( fpu_reg_reg );
 4868 %}
 4869 
 4870 // ====================VECTOR INSERT=======================================
 4871 
 4872 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4873   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4874   match(Set dst (VectorInsert (Binary dst val) idx));
 4875   format %{ "vector_insert $dst,$val,$idx" %}
 4876   ins_encode %{
 4877     assert(UseSSE >= 4, "required");
 4878     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4879 
 4880     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4881 
 4882     assert(is_integral_type(elem_bt), "");
 4883     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4884 
 4885     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4886   %}
 4887   ins_pipe( pipe_slow );
 4888 %}
 4889 
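// For vectors wider than 128 bits there is no direct scalar insert, so the
// index is decomposed: x_idx selects the element within a 128-bit lane and
// y_idx selects the lane, which is extracted, updated and re-inserted.
// E.g. for 16 shorts (8 per 128-bit lane), idx 11 gives x_idx 3 in lane 1.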
 4890 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4891   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4892   match(Set dst (VectorInsert (Binary src val) idx));
 4893   effect(TEMP vtmp);
 4894   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4895   ins_encode %{
 4896     int vlen_enc = Assembler::AVX_256bit;
 4897     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4898     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4899     int log2epr = log2(elem_per_lane);
 4900 
 4901     assert(is_integral_type(elem_bt), "sanity");
 4902     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4903 
 4904     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4905     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4906     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4907     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4908     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4909   %}
 4910   ins_pipe( pipe_slow );
 4911 %}
 4912 
 4913 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4914   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4915   match(Set dst (VectorInsert (Binary src val) idx));
 4916   effect(TEMP vtmp);
 4917   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4918   ins_encode %{
 4919     assert(UseAVX > 2, "sanity");
 4920 
 4921     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4922     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4923     int log2epr = log2(elem_per_lane);
 4924 
 4925     assert(is_integral_type(elem_bt), "");
 4926     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4927 
 4928     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4929     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4930     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4931     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4932     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4933   %}
 4934   ins_pipe( pipe_slow );
 4935 %}
 4936 
 4937 #ifdef _LP64
 4938 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4939   predicate(Matcher::vector_length(n) == 2);
 4940   match(Set dst (VectorInsert (Binary dst val) idx));
 4941   format %{ "vector_insert $dst,$val,$idx" %}
 4942   ins_encode %{
 4943     assert(UseSSE >= 4, "required");
 4944     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4945     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4946 
 4947     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4948   %}
 4949   ins_pipe( pipe_slow );
 4950 %}
 4951 
 4952 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4953   predicate(Matcher::vector_length(n) == 4);
 4954   match(Set dst (VectorInsert (Binary src val) idx));
 4955   effect(TEMP vtmp);
 4956   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4957   ins_encode %{
 4958     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4959     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4960 
 4961     uint x_idx = $idx$$constant & right_n_bits(1);
 4962     uint y_idx = ($idx$$constant >> 1) & 1;
 4964     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4965     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4966     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4967   %}
 4968   ins_pipe( pipe_slow );
 4969 %}
 4970 
 4971 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4972   predicate(Matcher::vector_length(n) == 8);
 4973   match(Set dst (VectorInsert (Binary src val) idx));
 4974   effect(TEMP vtmp);
 4975   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4976   ins_encode %{
 4977     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4978     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4979 
 4980     uint x_idx = $idx$$constant & right_n_bits(1);
 4981     uint y_idx = ($idx$$constant >> 1) & 3;
 4982     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4983     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4984     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4985   %}
 4986   ins_pipe( pipe_slow );
 4987 %}
 4988 #endif
 4989 
 4990 instruct insertF(vec dst, regF val, immU8 idx) %{
 4991   predicate(Matcher::vector_length(n) < 8);
 4992   match(Set dst (VectorInsert (Binary dst val) idx));
 4993   format %{ "vector_insert $dst,$val,$idx" %}
 4994   ins_encode %{
 4995     assert(UseSSE >= 4, "sanity");
 4996 
 4997     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4998     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4999 
 5000     uint x_idx = $idx$$constant & right_n_bits(2);
 5001     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5002   %}
 5003   ins_pipe( pipe_slow );
 5004 %}
 5005 
 5006 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 5007   predicate(Matcher::vector_length(n) >= 8);
 5008   match(Set dst (VectorInsert (Binary src val) idx));
 5009   effect(TEMP vtmp);
 5010   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5011   ins_encode %{
 5012     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5013     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5014 
 5015     int vlen = Matcher::vector_length(this);
 5016     uint x_idx = $idx$$constant & right_n_bits(2);
 5017     if (vlen == 8) {
 5018       uint y_idx = ($idx$$constant >> 2) & 1;
 5020       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5021       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5022       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5023     } else {
 5024       assert(vlen == 16, "sanity");
 5025       uint y_idx = ($idx$$constant >> 2) & 3;
 5026       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5027       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5028       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5029     }
 5030   %}
 5031   ins_pipe( pipe_slow );
 5032 %}
 5033 
 5034 #ifdef _LP64
 5035 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 5036   predicate(Matcher::vector_length(n) == 2);
 5037   match(Set dst (VectorInsert (Binary dst val) idx));
 5038   effect(TEMP tmp);
 5039   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 5040   ins_encode %{
 5041     assert(UseSSE >= 4, "sanity");
 5042     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5043     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5044 
 5045     __ movq($tmp$$Register, $val$$XMMRegister);
 5046     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 5047   %}
 5048   ins_pipe( pipe_slow );
 5049 %}
 5050 
 5051 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5052   predicate(Matcher::vector_length(n) == 4);
 5053   match(Set dst (VectorInsert (Binary src val) idx));
 5054   effect(TEMP vtmp, TEMP tmp);
 5055   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5056   ins_encode %{
 5057     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5058     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5059 
 5060     uint x_idx = $idx$$constant & right_n_bits(1);
 5061     uint y_idx = ($idx$$constant >> 1) & 1;
 5063     __ movq($tmp$$Register, $val$$XMMRegister);
 5064     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5065     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5066     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5067   %}
 5068   ins_pipe( pipe_slow );
 5069 %}
 5070 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 5072   predicate(Matcher::vector_length(n) == 8);
 5073   match(Set dst (VectorInsert (Binary src val) idx));
 5074   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5076   ins_encode %{
 5077     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5078     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5079 
 5080     uint x_idx = $idx$$constant & right_n_bits(1);
 5081     uint y_idx = ($idx$$constant >> 1) & 3;
 5082     __ movq($tmp$$Register, $val$$XMMRegister);
 5083     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5084     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5085     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5086   %}
 5087   ins_pipe( pipe_slow );
 5088 %}
 5089 #endif
 5090 
 5091 // ====================REDUCTION ARITHMETIC=======================================
 5092 
 5093 // =======================Int Reduction==========================================
 5094 
 5095 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5096   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 5097   match(Set dst (AddReductionVI src1 src2));
 5098   match(Set dst (MulReductionVI src1 src2));
 5099   match(Set dst (AndReductionV  src1 src2));
 5100   match(Set dst ( OrReductionV  src1 src2));
 5101   match(Set dst (XorReductionV  src1 src2));
 5102   match(Set dst (MinReductionV  src1 src2));
 5103   match(Set dst (MaxReductionV  src1 src2));
 5104   effect(TEMP vtmp1, TEMP vtmp2);
 5105   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5106   ins_encode %{
 5107     int opcode = this->ideal_Opcode();
 5108     int vlen = Matcher::vector_length(this, $src2);
 5109     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5110   %}
 5111   ins_pipe( pipe_slow );
 5112 %}
 5113 
 5114 // =======================Long Reduction==========================================
 5115 
 5116 #ifdef _LP64
 5117 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5118   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5119   match(Set dst (AddReductionVL src1 src2));
 5120   match(Set dst (MulReductionVL src1 src2));
 5121   match(Set dst (AndReductionV  src1 src2));
 5122   match(Set dst ( OrReductionV  src1 src2));
 5123   match(Set dst (XorReductionV  src1 src2));
 5124   match(Set dst (MinReductionV  src1 src2));
 5125   match(Set dst (MaxReductionV  src1 src2));
 5126   effect(TEMP vtmp1, TEMP vtmp2);
 5127   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5128   ins_encode %{
 5129     int opcode = this->ideal_Opcode();
 5130     int vlen = Matcher::vector_length(this, $src2);
 5131     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5132   %}
 5133   ins_pipe( pipe_slow );
 5134 %}
 5135 
 5136 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5137   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5138   match(Set dst (AddReductionVL src1 src2));
 5139   match(Set dst (MulReductionVL src1 src2));
 5140   match(Set dst (AndReductionV  src1 src2));
 5141   match(Set dst ( OrReductionV  src1 src2));
 5142   match(Set dst (XorReductionV  src1 src2));
 5143   match(Set dst (MinReductionV  src1 src2));
 5144   match(Set dst (MaxReductionV  src1 src2));
 5145   effect(TEMP vtmp1, TEMP vtmp2);
 5146   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5147   ins_encode %{
 5148     int opcode = this->ideal_Opcode();
 5149     int vlen = Matcher::vector_length(this, $src2);
 5150     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5151   %}
 5152   ins_pipe( pipe_slow );
 5153 %}
 5154 #endif // _LP64
 5155 
 5156 // =======================Float Reduction==========================================
 5157 
 5158 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5159   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5160   match(Set dst (AddReductionVF dst src));
 5161   match(Set dst (MulReductionVF dst src));
 5162   effect(TEMP dst, TEMP vtmp);
 5163   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5164   ins_encode %{
 5165     int opcode = this->ideal_Opcode();
 5166     int vlen = Matcher::vector_length(this, $src);
 5167     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5168   %}
 5169   ins_pipe( pipe_slow );
 5170 %}
 5171 
 5172 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5173   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5174   match(Set dst (AddReductionVF dst src));
 5175   match(Set dst (MulReductionVF dst src));
 5176   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5177   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5178   ins_encode %{
 5179     int opcode = this->ideal_Opcode();
 5180     int vlen = Matcher::vector_length(this, $src);
 5181     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5182   %}
 5183   ins_pipe( pipe_slow );
 5184 %}
 5185 
 5186 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5187   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5188   match(Set dst (AddReductionVF dst src));
 5189   match(Set dst (MulReductionVF dst src));
 5190   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5191   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5192   ins_encode %{
 5193     int opcode = this->ideal_Opcode();
 5194     int vlen = Matcher::vector_length(this, $src);
 5195     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5196   %}
 5197   ins_pipe( pipe_slow );
 5198 %}
 5199 
 5200 
 5201 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5202   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5203   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5204   // src1 contains reduction identity
 5205   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5206   match(Set dst (AddReductionVF src1 src2));
 5207   match(Set dst (MulReductionVF src1 src2));
 5208   effect(TEMP dst);
 5209   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5210   ins_encode %{
 5211     int opcode = this->ideal_Opcode();
 5212     int vlen = Matcher::vector_length(this, $src2);
 5213     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5214   %}
 5215   ins_pipe( pipe_slow );
 5216 %}
 5217 
 5218 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5219   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5220   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5221   // src1 contains reduction identity
 5222   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5223   match(Set dst (AddReductionVF src1 src2));
 5224   match(Set dst (MulReductionVF src1 src2));
 5225   effect(TEMP dst, TEMP vtmp);
 5226   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5227   ins_encode %{
 5228     int opcode = this->ideal_Opcode();
 5229     int vlen = Matcher::vector_length(this, $src2);
 5230     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5231   %}
 5232   ins_pipe( pipe_slow );
 5233 %}
 5234 
 5235 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5236   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5237   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5238   // src1 contains reduction identity
 5239   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5240   match(Set dst (AddReductionVF src1 src2));
 5241   match(Set dst (MulReductionVF src1 src2));
 5242   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5243   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5244   ins_encode %{
 5245     int opcode = this->ideal_Opcode();
 5246     int vlen = Matcher::vector_length(this, $src2);
 5247     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5248   %}
 5249   ins_pipe( pipe_slow );
 5250 %}
 5251 
 5252 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5253   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5254   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5255   // src1 contains reduction identity
 5256   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5257   match(Set dst (AddReductionVF src1 src2));
 5258   match(Set dst (MulReductionVF src1 src2));
 5259   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5260   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5261   ins_encode %{
 5262     int opcode = this->ideal_Opcode();
 5263     int vlen = Matcher::vector_length(this, $src2);
 5264     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5265   %}
 5266   ins_pipe( pipe_slow );
 5267 %}
 5268 
 5269 // =======================Double Reduction==========================================
 5270 
 5271 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5272   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5273   match(Set dst (AddReductionVD dst src));
 5274   match(Set dst (MulReductionVD dst src));
 5275   effect(TEMP dst, TEMP vtmp);
 5276   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5277   ins_encode %{
 5278     int opcode = this->ideal_Opcode();
 5279     int vlen = Matcher::vector_length(this, $src);
 5280     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5281 %}
 5282   ins_pipe( pipe_slow );
 5283 %}
 5284 
 5285 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5286   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5287   match(Set dst (AddReductionVD dst src));
 5288   match(Set dst (MulReductionVD dst src));
 5289   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5290   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5291   ins_encode %{
 5292     int opcode = this->ideal_Opcode();
 5293     int vlen = Matcher::vector_length(this, $src);
 5294     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5295   %}
 5296   ins_pipe( pipe_slow );
 5297 %}
 5298 
 5299 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5300   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5301   match(Set dst (AddReductionVD dst src));
 5302   match(Set dst (MulReductionVD dst src));
 5303   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5304   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5305   ins_encode %{
 5306     int opcode = this->ideal_Opcode();
 5307     int vlen = Matcher::vector_length(this, $src);
 5308     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5309   %}
 5310   ins_pipe( pipe_slow );
 5311 %}
 5312 
 5313 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5314   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5315   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5316   // src1 contains reduction identity
 5317   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5318   match(Set dst (AddReductionVD src1 src2));
 5319   match(Set dst (MulReductionVD src1 src2));
 5320   effect(TEMP dst);
 5321   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5322   ins_encode %{
 5323     int opcode = this->ideal_Opcode();
 5324     int vlen = Matcher::vector_length(this, $src2);
 5325     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5326 %}
 5327   ins_pipe( pipe_slow );
 5328 %}
 5329 
 5330 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5331   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5332   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5333   // src1 contains reduction identity
 5334   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5335   match(Set dst (AddReductionVD src1 src2));
 5336   match(Set dst (MulReductionVD src1 src2));
 5337   effect(TEMP dst, TEMP vtmp);
 5338   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5339   ins_encode %{
 5340     int opcode = this->ideal_Opcode();
 5341     int vlen = Matcher::vector_length(this, $src2);
 5342     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5343   %}
 5344   ins_pipe( pipe_slow );
 5345 %}
 5346 
 5347 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5348   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5349   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5350   // src1 contains reduction identity
 5351   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5352   match(Set dst (AddReductionVD src1 src2));
 5353   match(Set dst (MulReductionVD src1 src2));
 5354   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5355   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5356   ins_encode %{
 5357     int opcode = this->ideal_Opcode();
 5358     int vlen = Matcher::vector_length(this, $src2);
 5359     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5360   %}
 5361   ins_pipe( pipe_slow );
 5362 %}
 5363 
 5364 // =======================Byte Reduction==========================================
 5365 
 5366 #ifdef _LP64
 5367 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5368   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5369   match(Set dst (AddReductionVI src1 src2));
 5370   match(Set dst (AndReductionV  src1 src2));
 5371   match(Set dst ( OrReductionV  src1 src2));
 5372   match(Set dst (XorReductionV  src1 src2));
 5373   match(Set dst (MinReductionV  src1 src2));
 5374   match(Set dst (MaxReductionV  src1 src2));
 5375   effect(TEMP vtmp1, TEMP vtmp2);
 5376   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5377   ins_encode %{
 5378     int opcode = this->ideal_Opcode();
 5379     int vlen = Matcher::vector_length(this, $src2);
 5380     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5381   %}
 5382   ins_pipe( pipe_slow );
 5383 %}
 5384 
 5385 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5386   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5387   match(Set dst (AddReductionVI src1 src2));
 5388   match(Set dst (AndReductionV  src1 src2));
 5389   match(Set dst ( OrReductionV  src1 src2));
 5390   match(Set dst (XorReductionV  src1 src2));
 5391   match(Set dst (MinReductionV  src1 src2));
 5392   match(Set dst (MaxReductionV  src1 src2));
 5393   effect(TEMP vtmp1, TEMP vtmp2);
 5394   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5395   ins_encode %{
 5396     int opcode = this->ideal_Opcode();
 5397     int vlen = Matcher::vector_length(this, $src2);
 5398     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5399   %}
 5400   ins_pipe( pipe_slow );
 5401 %}
 5402 #endif
 5403 
 5404 // =======================Short Reduction==========================================
 5405 
 5406 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5407   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5408   match(Set dst (AddReductionVI src1 src2));
 5409   match(Set dst (MulReductionVI src1 src2));
 5410   match(Set dst (AndReductionV  src1 src2));
 5411   match(Set dst ( OrReductionV  src1 src2));
 5412   match(Set dst (XorReductionV  src1 src2));
 5413   match(Set dst (MinReductionV  src1 src2));
 5414   match(Set dst (MaxReductionV  src1 src2));
 5415   effect(TEMP vtmp1, TEMP vtmp2);
 5416   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5417   ins_encode %{
 5418     int opcode = this->ideal_Opcode();
 5419     int vlen = Matcher::vector_length(this, $src2);
 5420     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5421   %}
 5422   ins_pipe( pipe_slow );
 5423 %}
 5424 
 5425 // =======================Mul Reduction==========================================
 5426 
 5427 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5428   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5429             Matcher::vector_length(n->in(2)) <= 32); // src2
 5430   match(Set dst (MulReductionVI src1 src2));
 5431   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5432   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5433   ins_encode %{
 5434     int opcode = this->ideal_Opcode();
 5435     int vlen = Matcher::vector_length(this, $src2);
 5436     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5437   %}
 5438   ins_pipe( pipe_slow );
 5439 %}
 5440 
 5441 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5442   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5443             Matcher::vector_length(n->in(2)) == 64); // src2
 5444   match(Set dst (MulReductionVI src1 src2));
 5445   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5446   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5447   ins_encode %{
 5448     int opcode = this->ideal_Opcode();
 5449     int vlen = Matcher::vector_length(this, $src2);
 5450     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5451   %}
 5452   ins_pipe( pipe_slow );
 5453 %}
 5454 
 5455 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
 5457 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5458                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5459   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5460             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5461              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5462             Matcher::vector_length(n->in(2)) == 2);
 5463   match(Set dst (MinReductionV src1 src2));
 5464   match(Set dst (MaxReductionV src1 src2));
 5465   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5466   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5467   ins_encode %{
 5468     assert(UseAVX > 0, "sanity");
 5469 
 5470     int opcode = this->ideal_Opcode();
 5471     int vlen = Matcher::vector_length(this, $src2);
 5472     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5473                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5474   %}
 5475   ins_pipe( pipe_slow );
 5476 %}
 5477 
 5478 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5479                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5480   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5481             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5482              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5483             Matcher::vector_length(n->in(2)) >= 4);
 5484   match(Set dst (MinReductionV src1 src2));
 5485   match(Set dst (MaxReductionV src1 src2));
 5486   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5487   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5488   ins_encode %{
 5489     assert(UseAVX > 0, "sanity");
 5490 
 5491     int opcode = this->ideal_Opcode();
 5492     int vlen = Matcher::vector_length(this, $src2);
 5493     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5494                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5495   %}
 5496   ins_pipe( pipe_slow );
 5497 %}
 5498 
 5499 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5500                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5501   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5502             Matcher::vector_length(n->in(2)) == 2);
 5503   match(Set dst (MinReductionV dst src));
 5504   match(Set dst (MaxReductionV dst src));
 5505   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5506   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5507   ins_encode %{
 5508     assert(UseAVX > 0, "sanity");
 5509 
 5510     int opcode = this->ideal_Opcode();
 5511     int vlen = Matcher::vector_length(this, $src);
 5512     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5513                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5514   %}
 5515   ins_pipe( pipe_slow );
 5516 %}
 5517 
 5518 
 5519 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5520                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5521   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5522             Matcher::vector_length(n->in(2)) >= 4);
 5523   match(Set dst (MinReductionV dst src));
 5524   match(Set dst (MaxReductionV dst src));
 5525   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5526   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5527   ins_encode %{
 5528     assert(UseAVX > 0, "sanity");
 5529 
 5530     int opcode = this->ideal_Opcode();
 5531     int vlen = Matcher::vector_length(this, $src);
 5532     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5533                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5534   %}
 5535   ins_pipe( pipe_slow );
 5536 %}
 5537 
 5538 
//--------------------Min/Max Double Reduction --------------------
 5540 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5541                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5542                             rFlagsReg cr) %{
 5543   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5544             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5545              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5546             Matcher::vector_length(n->in(2)) == 2);
 5547   match(Set dst (MinReductionV src1 src2));
 5548   match(Set dst (MaxReductionV src1 src2));
 5549   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5550   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5551   ins_encode %{
 5552     assert(UseAVX > 0, "sanity");
 5553 
 5554     int opcode = this->ideal_Opcode();
 5555     int vlen = Matcher::vector_length(this, $src2);
 5556     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5557                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5558   %}
 5559   ins_pipe( pipe_slow );
 5560 %}
 5561 
 5562 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5563                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5564                            rFlagsReg cr) %{
 5565   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5566             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5567              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5568             Matcher::vector_length(n->in(2)) >= 4);
 5569   match(Set dst (MinReductionV src1 src2));
 5570   match(Set dst (MaxReductionV src1 src2));
 5571   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5572   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5573   ins_encode %{
 5574     assert(UseAVX > 0, "sanity");
 5575 
 5576     int opcode = this->ideal_Opcode();
 5577     int vlen = Matcher::vector_length(this, $src2);
 5578     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5579                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5580   %}
 5581   ins_pipe( pipe_slow );
 5582 %}
 5583 
 5584 
 5585 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5586                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5587                                rFlagsReg cr) %{
 5588   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5589             Matcher::vector_length(n->in(2)) == 2);
 5590   match(Set dst (MinReductionV dst src));
 5591   match(Set dst (MaxReductionV dst src));
 5592   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5593   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5594   ins_encode %{
 5595     assert(UseAVX > 0, "sanity");
 5596 
 5597     int opcode = this->ideal_Opcode();
 5598     int vlen = Matcher::vector_length(this, $src);
 5599     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5600                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5601   %}
 5602   ins_pipe( pipe_slow );
 5603 %}
 5604 
 5605 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5606                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5607                               rFlagsReg cr) %{
 5608   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5609             Matcher::vector_length(n->in(2)) >= 4);
 5610   match(Set dst (MinReductionV dst src));
 5611   match(Set dst (MaxReductionV dst src));
 5612   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5613   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5614   ins_encode %{
 5615     assert(UseAVX > 0, "sanity");
 5616 
 5617     int opcode = this->ideal_Opcode();
 5618     int vlen = Matcher::vector_length(this, $src);
 5619     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5620                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5621   %}
 5622   ins_pipe( pipe_slow );
 5623 %}
 5624 
 5625 // ====================VECTOR ARITHMETIC=======================================
 5626 
 5627 // --------------------------------- ADD --------------------------------------
 5628 
 5629 // Bytes vector add
 5630 instruct vaddB(vec dst, vec src) %{
 5631   predicate(UseAVX == 0);
 5632   match(Set dst (AddVB dst src));
 5633   format %{ "paddb   $dst,$src\t! add packedB" %}
 5634   ins_encode %{
 5635     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5636   %}
 5637   ins_pipe( pipe_slow );
 5638 %}
 5639 
 5640 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5641   predicate(UseAVX > 0);
 5642   match(Set dst (AddVB src1 src2));
 5643   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5644   ins_encode %{
 5645     int vlen_enc = vector_length_encoding(this);
 5646     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5647   %}
 5648   ins_pipe( pipe_slow );
 5649 %}
 5650 
 5651 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5652   predicate((UseAVX > 0) &&
 5653             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5654   match(Set dst (AddVB src (LoadVector mem)));
 5655   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5656   ins_encode %{
 5657     int vlen_enc = vector_length_encoding(this);
 5658     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5659   %}
 5660   ins_pipe( pipe_slow );
 5661 %}
 5662 
 5663 // Shorts/Chars vector add
 5664 instruct vaddS(vec dst, vec src) %{
 5665   predicate(UseAVX == 0);
 5666   match(Set dst (AddVS dst src));
 5667   format %{ "paddw   $dst,$src\t! add packedS" %}
 5668   ins_encode %{
 5669     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5670   %}
 5671   ins_pipe( pipe_slow );
 5672 %}
 5673 
 5674 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5675   predicate(UseAVX > 0);
 5676   match(Set dst (AddVS src1 src2));
 5677   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5678   ins_encode %{
 5679     int vlen_enc = vector_length_encoding(this);
 5680     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5681   %}
 5682   ins_pipe( pipe_slow );
 5683 %}
 5684 
 5685 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5686   predicate((UseAVX > 0) &&
 5687             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5688   match(Set dst (AddVS src (LoadVector mem)));
 5689   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5690   ins_encode %{
 5691     int vlen_enc = vector_length_encoding(this);
 5692     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5693   %}
 5694   ins_pipe( pipe_slow );
 5695 %}
 5696 
 5697 // Integers vector add
 5698 instruct vaddI(vec dst, vec src) %{
 5699   predicate(UseAVX == 0);
 5700   match(Set dst (AddVI dst src));
 5701   format %{ "paddd   $dst,$src\t! add packedI" %}
 5702   ins_encode %{
 5703     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5704   %}
 5705   ins_pipe( pipe_slow );
 5706 %}
 5707 
 5708 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5709   predicate(UseAVX > 0);
 5710   match(Set dst (AddVI src1 src2));
 5711   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5712   ins_encode %{
 5713     int vlen_enc = vector_length_encoding(this);
 5714     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5715   %}
 5716   ins_pipe( pipe_slow );
 5717 %}
 5718 
 5719 
 5720 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5721   predicate((UseAVX > 0) &&
 5722             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5723   match(Set dst (AddVI src (LoadVector mem)));
 5724   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5725   ins_encode %{
 5726     int vlen_enc = vector_length_encoding(this);
 5727     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5728   %}
 5729   ins_pipe( pipe_slow );
 5730 %}
 5731 
 5732 // Longs vector add
 5733 instruct vaddL(vec dst, vec src) %{
 5734   predicate(UseAVX == 0);
 5735   match(Set dst (AddVL dst src));
 5736   format %{ "paddq   $dst,$src\t! add packedL" %}
 5737   ins_encode %{
 5738     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5739   %}
 5740   ins_pipe( pipe_slow );
 5741 %}
 5742 
 5743 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5744   predicate(UseAVX > 0);
 5745   match(Set dst (AddVL src1 src2));
 5746   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5747   ins_encode %{
 5748     int vlen_enc = vector_length_encoding(this);
 5749     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5750   %}
 5751   ins_pipe( pipe_slow );
 5752 %}
 5753 
 5754 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5755   predicate((UseAVX > 0) &&
 5756             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5757   match(Set dst (AddVL src (LoadVector mem)));
 5758   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5759   ins_encode %{
 5760     int vlen_enc = vector_length_encoding(this);
 5761     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5762   %}
 5763   ins_pipe( pipe_slow );
 5764 %}
 5765 
 5766 // Floats vector add
 5767 instruct vaddF(vec dst, vec src) %{
 5768   predicate(UseAVX == 0);
 5769   match(Set dst (AddVF dst src));
 5770   format %{ "addps   $dst,$src\t! add packedF" %}
 5771   ins_encode %{
 5772     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5773   %}
 5774   ins_pipe( pipe_slow );
 5775 %}
 5776 
 5777 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5778   predicate(UseAVX > 0);
 5779   match(Set dst (AddVF src1 src2));
 5780   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5781   ins_encode %{
 5782     int vlen_enc = vector_length_encoding(this);
 5783     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5784   %}
 5785   ins_pipe( pipe_slow );
 5786 %}
 5787 
 5788 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5789   predicate((UseAVX > 0) &&
 5790             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5791   match(Set dst (AddVF src (LoadVector mem)));
 5792   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5793   ins_encode %{
 5794     int vlen_enc = vector_length_encoding(this);
 5795     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5796   %}
 5797   ins_pipe( pipe_slow );
 5798 %}
 5799 
 5800 // Doubles vector add
 5801 instruct vaddD(vec dst, vec src) %{
 5802   predicate(UseAVX == 0);
 5803   match(Set dst (AddVD dst src));
 5804   format %{ "addpd   $dst,$src\t! add packedD" %}
 5805   ins_encode %{
 5806     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5807   %}
 5808   ins_pipe( pipe_slow );
 5809 %}
 5810 
 5811 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5812   predicate(UseAVX > 0);
 5813   match(Set dst (AddVD src1 src2));
 5814   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5815   ins_encode %{
 5816     int vlen_enc = vector_length_encoding(this);
 5817     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5818   %}
 5819   ins_pipe( pipe_slow );
 5820 %}
 5821 
 5822 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5823   predicate((UseAVX > 0) &&
 5824             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5825   match(Set dst (AddVD src (LoadVector mem)));
 5826   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5827   ins_encode %{
 5828     int vlen_enc = vector_length_encoding(this);
 5829     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5830   %}
 5831   ins_pipe( pipe_slow );
 5832 %}
 5833 
 5834 // --------------------------------- SUB --------------------------------------
 5835 
 5836 // Bytes vector sub
 5837 instruct vsubB(vec dst, vec src) %{
 5838   predicate(UseAVX == 0);
 5839   match(Set dst (SubVB dst src));
 5840   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5841   ins_encode %{
 5842     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5843   %}
 5844   ins_pipe( pipe_slow );
 5845 %}
 5846 
 5847 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5848   predicate(UseAVX > 0);
 5849   match(Set dst (SubVB src1 src2));
 5850   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5851   ins_encode %{
 5852     int vlen_enc = vector_length_encoding(this);
 5853     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5854   %}
 5855   ins_pipe( pipe_slow );
 5856 %}
 5857 
 5858 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5859   predicate((UseAVX > 0) &&
 5860             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5861   match(Set dst (SubVB src (LoadVector mem)));
 5862   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5863   ins_encode %{
 5864     int vlen_enc = vector_length_encoding(this);
 5865     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5866   %}
 5867   ins_pipe( pipe_slow );
 5868 %}
 5869 
 5870 // Shorts/Chars vector sub
 5871 instruct vsubS(vec dst, vec src) %{
 5872   predicate(UseAVX == 0);
 5873   match(Set dst (SubVS dst src));
 5874   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5875   ins_encode %{
 5876     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5877   %}
 5878   ins_pipe( pipe_slow );
 5879 %}
 5880 
 5881 
 5882 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5883   predicate(UseAVX > 0);
 5884   match(Set dst (SubVS src1 src2));
 5885   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5886   ins_encode %{
 5887     int vlen_enc = vector_length_encoding(this);
 5888     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5889   %}
 5890   ins_pipe( pipe_slow );
 5891 %}
 5892 
 5893 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5894   predicate((UseAVX > 0) &&
 5895             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5896   match(Set dst (SubVS src (LoadVector mem)));
 5897   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5898   ins_encode %{
 5899     int vlen_enc = vector_length_encoding(this);
 5900     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5901   %}
 5902   ins_pipe( pipe_slow );
 5903 %}
 5904 
 5905 // Integers vector sub
 5906 instruct vsubI(vec dst, vec src) %{
 5907   predicate(UseAVX == 0);
 5908   match(Set dst (SubVI dst src));
 5909   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5910   ins_encode %{
 5911     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5912   %}
 5913   ins_pipe( pipe_slow );
 5914 %}
 5915 
 5916 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5917   predicate(UseAVX > 0);
 5918   match(Set dst (SubVI src1 src2));
 5919   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5920   ins_encode %{
 5921     int vlen_enc = vector_length_encoding(this);
 5922     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5923   %}
 5924   ins_pipe( pipe_slow );
 5925 %}
 5926 
 5927 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5928   predicate((UseAVX > 0) &&
 5929             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5930   match(Set dst (SubVI src (LoadVector mem)));
 5931   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5932   ins_encode %{
 5933     int vlen_enc = vector_length_encoding(this);
 5934     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5935   %}
 5936   ins_pipe( pipe_slow );
 5937 %}
 5938 
 5939 // Longs vector sub
 5940 instruct vsubL(vec dst, vec src) %{
 5941   predicate(UseAVX == 0);
 5942   match(Set dst (SubVL dst src));
 5943   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5944   ins_encode %{
 5945     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5946   %}
 5947   ins_pipe( pipe_slow );
 5948 %}
 5949 
 5950 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5951   predicate(UseAVX > 0);
 5952   match(Set dst (SubVL src1 src2));
 5953   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5954   ins_encode %{
 5955     int vlen_enc = vector_length_encoding(this);
 5956     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5957   %}
 5958   ins_pipe( pipe_slow );
 5959 %}
 5960 
 5961 
 5962 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5963   predicate((UseAVX > 0) &&
 5964             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5965   match(Set dst (SubVL src (LoadVector mem)));
 5966   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5967   ins_encode %{
 5968     int vlen_enc = vector_length_encoding(this);
 5969     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5970   %}
 5971   ins_pipe( pipe_slow );
 5972 %}
 5973 
 5974 // Floats vector sub
 5975 instruct vsubF(vec dst, vec src) %{
 5976   predicate(UseAVX == 0);
 5977   match(Set dst (SubVF dst src));
 5978   format %{ "subps   $dst,$src\t! sub packedF" %}
 5979   ins_encode %{
 5980     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5981   %}
 5982   ins_pipe( pipe_slow );
 5983 %}
 5984 
 5985 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5986   predicate(UseAVX > 0);
 5987   match(Set dst (SubVF src1 src2));
 5988   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5989   ins_encode %{
 5990     int vlen_enc = vector_length_encoding(this);
 5991     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5992   %}
 5993   ins_pipe( pipe_slow );
 5994 %}
 5995 
 5996 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5997   predicate((UseAVX > 0) &&
 5998             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5999   match(Set dst (SubVF src (LoadVector mem)));
 6000   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 6001   ins_encode %{
 6002     int vlen_enc = vector_length_encoding(this);
 6003     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6004   %}
 6005   ins_pipe( pipe_slow );
 6006 %}
 6007 
 6008 // Doubles vector sub
 6009 instruct vsubD(vec dst, vec src) %{
 6010   predicate(UseAVX == 0);
 6011   match(Set dst (SubVD dst src));
 6012   format %{ "subpd   $dst,$src\t! sub packedD" %}
 6013   ins_encode %{
 6014     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 6015   %}
 6016   ins_pipe( pipe_slow );
 6017 %}
 6018 
 6019 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 6020   predicate(UseAVX > 0);
 6021   match(Set dst (SubVD src1 src2));
 6022   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 6023   ins_encode %{
 6024     int vlen_enc = vector_length_encoding(this);
 6025     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6026   %}
 6027   ins_pipe( pipe_slow );
 6028 %}
 6029 
 6030 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6031   predicate((UseAVX > 0) &&
 6032             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6033   match(Set dst (SubVD src (LoadVector mem)));
 6034   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6035   ins_encode %{
 6036     int vlen_enc = vector_length_encoding(this);
 6037     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6038   %}
 6039   ins_pipe( pipe_slow );
 6040 %}
 6041 
 6042 // --------------------------------- MUL --------------------------------------
 6043 
 6044 // Byte vector mul
 6045 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6046   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6047   match(Set dst (MulVB src1 src2));
 6048   effect(TEMP dst, TEMP xtmp);
 6049   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6050   ins_encode %{
 6051     assert(UseSSE > 3, "required");
 6052     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6053     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6054     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6055     __ psllw($dst$$XMMRegister, 8);
 6056     __ psrlw($dst$$XMMRegister, 8);
 6057     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6058   %}
 6059   ins_pipe( pipe_slow );
 6060 %}
 6061 
 6062 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6063   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6064   match(Set dst (MulVB src1 src2));
 6065   effect(TEMP dst, TEMP xtmp);
 6066   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6067   ins_encode %{
 6068     assert(UseSSE > 3, "required");
 6069     // Odd-index elements
 6070     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6071     __ psrlw($dst$$XMMRegister, 8);
 6072     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6073     __ psrlw($xtmp$$XMMRegister, 8);
 6074     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6075     __ psllw($dst$$XMMRegister, 8);
 6076     // Even-index elements
 6077     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6078     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6079     __ psllw($xtmp$$XMMRegister, 8);
 6080     __ psrlw($xtmp$$XMMRegister, 8);
 6081     // Combine
 6082     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6083   %}
 6084   ins_pipe( pipe_slow );
 6085 %}
 6086 
 6087 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6088   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6089   match(Set dst (MulVB src1 src2));
 6090   effect(TEMP xtmp1, TEMP xtmp2);
 6091   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6092   ins_encode %{
 6093     int vlen_enc = vector_length_encoding(this);
 6094     // Odd-index elements
 6095     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6096     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6097     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6098     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6099     // Even-index elements
 6100     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6101     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6102     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6103     // Combine
 6104     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6105   %}
 6106   ins_pipe( pipe_slow );
 6107 %}
 6108 
 6109 // Shorts/Chars vector mul
 6110 instruct vmulS(vec dst, vec src) %{
 6111   predicate(UseAVX == 0);
 6112   match(Set dst (MulVS dst src));
 6113   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6114   ins_encode %{
 6115     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6116   %}
 6117   ins_pipe( pipe_slow );
 6118 %}
 6119 
 6120 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6121   predicate(UseAVX > 0);
 6122   match(Set dst (MulVS src1 src2));
 6123   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6124   ins_encode %{
 6125     int vlen_enc = vector_length_encoding(this);
 6126     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6127   %}
 6128   ins_pipe( pipe_slow );
 6129 %}
 6130 
 6131 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6132   predicate((UseAVX > 0) &&
 6133             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6134   match(Set dst (MulVS src (LoadVector mem)));
 6135   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6136   ins_encode %{
 6137     int vlen_enc = vector_length_encoding(this);
 6138     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6139   %}
 6140   ins_pipe( pipe_slow );
 6141 %}
 6142 
 6143 // Integers vector mul
 6144 instruct vmulI(vec dst, vec src) %{
 6145   predicate(UseAVX == 0);
 6146   match(Set dst (MulVI dst src));
 6147   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6148   ins_encode %{
 6149     assert(UseSSE > 3, "required");
 6150     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6151   %}
 6152   ins_pipe( pipe_slow );
 6153 %}
 6154 
 6155 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6156   predicate(UseAVX > 0);
 6157   match(Set dst (MulVI src1 src2));
 6158   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6159   ins_encode %{
 6160     int vlen_enc = vector_length_encoding(this);
 6161     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6162   %}
 6163   ins_pipe( pipe_slow );
 6164 %}
 6165 
 6166 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6167   predicate((UseAVX > 0) &&
 6168             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6169   match(Set dst (MulVI src (LoadVector mem)));
 6170   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6171   ins_encode %{
 6172     int vlen_enc = vector_length_encoding(this);
 6173     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6174   %}
 6175   ins_pipe( pipe_slow );
 6176 %}
 6177 
 6178 // Longs vector mul
 6179 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6180   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6181              VM_Version::supports_avx512dq()) ||
 6182             VM_Version::supports_avx512vldq());
 6183   match(Set dst (MulVL src1 src2));
 6184   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6185   ins_encode %{
 6186     assert(UseAVX > 2, "required");
 6187     int vlen_enc = vector_length_encoding(this);
 6188     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6189   %}
 6190   ins_pipe( pipe_slow );
 6191 %}
 6192 
 6193 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6194   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6195              VM_Version::supports_avx512dq()) ||
 6196             (Matcher::vector_length_in_bytes(n) > 8 &&
 6197              VM_Version::supports_avx512vldq()));
 6198   match(Set dst (MulVL src (LoadVector mem)));
 6199   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6200   ins_encode %{
 6201     assert(UseAVX > 2, "required");
 6202     int vlen_enc = vector_length_encoding(this);
 6203     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6204   %}
 6205   ins_pipe( pipe_slow );
 6206 %}
 6207 
 6208 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6209   predicate(UseAVX == 0);
 6210   match(Set dst (MulVL src1 src2));
 6211   effect(TEMP dst, TEMP xtmp);
 6212   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6213   ins_encode %{
 6214     assert(VM_Version::supports_sse4_1(), "required");
    // Get the lo-hi (cross) products; only their lower 32 bits are of concern
 6216     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6217     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6218     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6219     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6220     __ psllq($dst$$XMMRegister, 32);
 6221     // Get the lo-lo products
 6222     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6223     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6224     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6225   %}
 6226   ins_pipe( pipe_slow );
 6227 %}
 6228 
 6229 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6230   predicate(UseAVX > 0 &&
 6231             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6232               !VM_Version::supports_avx512dq()) ||
 6233              (Matcher::vector_length_in_bytes(n) < 64 &&
 6234               !VM_Version::supports_avx512vldq())));
 6235   match(Set dst (MulVL src1 src2));
 6236   effect(TEMP xtmp1, TEMP xtmp2);
 6237   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6238   ins_encode %{
 6239     int vlen_enc = vector_length_encoding(this);
    // Get the lo-hi (cross) products; only their lower 32 bits are of concern
 6241     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6242     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6243     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6244     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6245     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6246     // Get the lo-lo products
 6247     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6248     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6249   %}
 6250   ins_pipe( pipe_slow );
 6251 %}
 6252 
 6253 // Floats vector mul
 6254 instruct vmulF(vec dst, vec src) %{
 6255   predicate(UseAVX == 0);
 6256   match(Set dst (MulVF dst src));
 6257   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6258   ins_encode %{
 6259     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6260   %}
 6261   ins_pipe( pipe_slow );
 6262 %}
 6263 
 6264 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6265   predicate(UseAVX > 0);
 6266   match(Set dst (MulVF src1 src2));
 6267   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6268   ins_encode %{
 6269     int vlen_enc = vector_length_encoding(this);
 6270     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6271   %}
 6272   ins_pipe( pipe_slow );
 6273 %}
 6274 
 6275 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6276   predicate((UseAVX > 0) &&
 6277             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6278   match(Set dst (MulVF src (LoadVector mem)));
 6279   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6280   ins_encode %{
 6281     int vlen_enc = vector_length_encoding(this);
 6282     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6283   %}
 6284   ins_pipe( pipe_slow );
 6285 %}
 6286 
 6287 // Doubles vector mul
 6288 instruct vmulD(vec dst, vec src) %{
 6289   predicate(UseAVX == 0);
 6290   match(Set dst (MulVD dst src));
 6291   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6292   ins_encode %{
 6293     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6294   %}
 6295   ins_pipe( pipe_slow );
 6296 %}
 6297 
 6298 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6299   predicate(UseAVX > 0);
 6300   match(Set dst (MulVD src1 src2));
 6301   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6302   ins_encode %{
 6303     int vlen_enc = vector_length_encoding(this);
 6304     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6305   %}
 6306   ins_pipe( pipe_slow );
 6307 %}
 6308 
 6309 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6310   predicate((UseAVX > 0) &&
 6311             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6312   match(Set dst (MulVD src (LoadVector mem)));
 6313   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6314   ins_encode %{
 6315     int vlen_enc = vector_length_encoding(this);
 6316     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6317   %}
 6318   ins_pipe( pipe_slow );
 6319 %}
 6320 
 6321 // --------------------------------- DIV --------------------------------------
 6322 
 6323 // Floats vector div
 6324 instruct vdivF(vec dst, vec src) %{
 6325   predicate(UseAVX == 0);
 6326   match(Set dst (DivVF dst src));
 6327   format %{ "divps   $dst,$src\t! div packedF" %}
 6328   ins_encode %{
 6329     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6330   %}
 6331   ins_pipe( pipe_slow );
 6332 %}
 6333 
 6334 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6335   predicate(UseAVX > 0);
 6336   match(Set dst (DivVF src1 src2));
 6337   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6338   ins_encode %{
 6339     int vlen_enc = vector_length_encoding(this);
 6340     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6341   %}
 6342   ins_pipe( pipe_slow );
 6343 %}
 6344 
 6345 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6346   predicate((UseAVX > 0) &&
 6347             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6348   match(Set dst (DivVF src (LoadVector mem)));
 6349   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6350   ins_encode %{
 6351     int vlen_enc = vector_length_encoding(this);
 6352     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6353   %}
 6354   ins_pipe( pipe_slow );
 6355 %}
 6356 
 6357 // Doubles vector div
 6358 instruct vdivD(vec dst, vec src) %{
 6359   predicate(UseAVX == 0);
 6360   match(Set dst (DivVD dst src));
 6361   format %{ "divpd   $dst,$src\t! div packedD" %}
 6362   ins_encode %{
 6363     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6364   %}
 6365   ins_pipe( pipe_slow );
 6366 %}
 6367 
 6368 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6369   predicate(UseAVX > 0);
 6370   match(Set dst (DivVD src1 src2));
 6371   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6372   ins_encode %{
 6373     int vlen_enc = vector_length_encoding(this);
 6374     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6375   %}
 6376   ins_pipe( pipe_slow );
 6377 %}
 6378 
 6379 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6380   predicate((UseAVX > 0) &&
 6381             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6382   match(Set dst (DivVD src (LoadVector mem)));
 6383   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6384   ins_encode %{
 6385     int vlen_enc = vector_length_encoding(this);
 6386     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6387   %}
 6388   ins_pipe( pipe_slow );
 6389 %}
 6390 
 6391 // ------------------------------ MinMax ---------------------------------------
 6392 
 6393 // Byte, Short, Int vector Min/Max
 6394 instruct minmax_reg_sse(vec dst, vec src) %{
 6395   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6396             UseAVX == 0);
 6397   match(Set dst (MinV dst src));
 6398   match(Set dst (MaxV dst src));
 6399   format %{ "vector_minmax  $dst,$src\t!  " %}
 6400   ins_encode %{
 6401     assert(UseSSE >= 4, "required");
 6402 
 6403     int opcode = this->ideal_Opcode();
 6404     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6405     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6406   %}
 6407   ins_pipe( pipe_slow );
 6408 %}
 6409 
 6410 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6411   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6412             UseAVX > 0);
 6413   match(Set dst (MinV src1 src2));
 6414   match(Set dst (MaxV src1 src2));
 6415   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6416   ins_encode %{
 6417     int opcode = this->ideal_Opcode();
 6418     int vlen_enc = vector_length_encoding(this);
 6419     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6420 
 6421     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6422   %}
 6423   ins_pipe( pipe_slow );
 6424 %}
 6425 
 6426 // Long vector Min/Max
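// Packed 64-bit min/max (vpminsq/vpmaxsq) exist only as AVX-512 instructions,
// so the pre-EVEX rules below presumably lower to a compare-and-blend
// sequence in the macro assembler. The SSE rule pins xmm0 as a temp because
// the SSE4.1 blendvpd/pblendvb instructions implicitly read their mask
// from xmm0.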
 6427 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6428   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6429             UseAVX == 0);
 6430   match(Set dst (MinV dst src));
  match(Set dst (MaxV dst src));
 6432   effect(TEMP dst, TEMP tmp);
 6433   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6434   ins_encode %{
 6435     assert(UseSSE >= 4, "required");
 6436 
 6437     int opcode = this->ideal_Opcode();
 6438     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6439     assert(elem_bt == T_LONG, "sanity");
 6440 
 6441     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6442   %}
 6443   ins_pipe( pipe_slow );
 6444 %}
 6445 
 6446 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6447   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6448             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6449   match(Set dst (MinV src1 src2));
 6450   match(Set dst (MaxV src1 src2));
 6451   effect(TEMP dst);
 6452   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6453   ins_encode %{
 6454     int vlen_enc = vector_length_encoding(this);
 6455     int opcode = this->ideal_Opcode();
 6456     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6457     assert(elem_bt == T_LONG, "sanity");
 6458 
 6459     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6460   %}
 6461   ins_pipe( pipe_slow );
 6462 %}
 6463 
 6464 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6465   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6466             Matcher::vector_element_basic_type(n) == T_LONG);
 6467   match(Set dst (MinV src1 src2));
 6468   match(Set dst (MaxV src1 src2));
 6469   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6470   ins_encode %{
 6471     assert(UseAVX > 2, "required");
 6472 
 6473     int vlen_enc = vector_length_encoding(this);
 6474     int opcode = this->ideal_Opcode();
 6475     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6476     assert(elem_bt == T_LONG, "sanity");
 6477 
 6478     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6479   %}
 6480   ins_pipe( pipe_slow );
 6481 %}
 6482 
 6483 // Float/Double vector Min/Max
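// Java's Math.min/max semantics differ from raw minps/maxps: when exactly one
// input is NaN the x86 instructions return the second source operand, and
// they treat -0.0 and +0.0 as equal, while Java must propagate NaN and order
// -0.0 below +0.0. The extra temporaries (and the mask register in the EVEX
// rule) exist so the macro assembler can (presumably) blend by sign and patch
// up the NaN case around the min/max proper.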
 6484 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6485   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6486             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6487             UseAVX > 0);
 6488   match(Set dst (MinV a b));
 6489   match(Set dst (MaxV a b));
 6490   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6491   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6492   ins_encode %{
 6493     assert(UseAVX > 0, "required");
 6494 
 6495     int opcode = this->ideal_Opcode();
 6496     int vlen_enc = vector_length_encoding(this);
 6497     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6498 
 6499     __ vminmax_fp(opcode, elem_bt,
 6500                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6502   %}
 6503   ins_pipe( pipe_slow );
 6504 %}
 6505 
 6506 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6507   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6508             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6509   match(Set dst (MinV a b));
 6510   match(Set dst (MaxV a b));
 6511   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6512   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6513   ins_encode %{
 6514     assert(UseAVX > 2, "required");
 6515 
 6516     int opcode = this->ideal_Opcode();
 6517     int vlen_enc = vector_length_encoding(this);
 6518     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6519 
 6520     __ evminmax_fp(opcode, elem_bt,
 6521                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6523   %}
 6524   ins_pipe( pipe_slow );
 6525 %}
 6526 
 6527 // --------------------------------- Signum/CopySign ---------------------------
 6528 
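// Math.signum returns 0.0 for zero, NaN for NaN, and +/-1.0 otherwise; the
// $zero and $one operands carry those constants so signum_fp can (presumably)
// compare the input against zero and stamp the input's sign onto 1.0.
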
 6529 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6530   match(Set dst (SignumF dst (Binary zero one)));
 6531   effect(KILL cr);
 6532   format %{ "signumF $dst, $dst" %}
 6533   ins_encode %{
 6534     int opcode = this->ideal_Opcode();
 6535     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6536   %}
 6537   ins_pipe( pipe_slow );
 6538 %}
 6539 
 6540 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6541   match(Set dst (SignumD dst (Binary zero one)));
 6542   effect(KILL cr);
 6543   format %{ "signumD $dst, $dst" %}
 6544   ins_encode %{
 6545     int opcode = this->ideal_Opcode();
 6546     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6547   %}
 6548   ins_pipe( pipe_slow );
 6549 %}
 6550 
 6551 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6552   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6553   match(Set dst (SignumVF src (Binary zero one)));
 6554   match(Set dst (SignumVD src (Binary zero one)));
 6555   effect(TEMP dst, TEMP xtmp1);
 6556   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6557   ins_encode %{
 6558     int opcode = this->ideal_Opcode();
 6559     int vec_enc = vector_length_encoding(this);
 6560     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6561                          $xtmp1$$XMMRegister, vec_enc);
 6562   %}
 6563   ins_pipe( pipe_slow );
 6564 %}
 6565 
 6566 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6567   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6568   match(Set dst (SignumVF src (Binary zero one)));
 6569   match(Set dst (SignumVD src (Binary zero one)));
 6570   effect(TEMP dst, TEMP ktmp1);
 6571   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6572   ins_encode %{
 6573     int opcode = this->ideal_Opcode();
 6574     int vec_enc = vector_length_encoding(this);
 6575     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6576                           $ktmp1$$KRegister, vec_enc);
 6577   %}
 6578   ins_pipe( pipe_slow );
 6579 %}
 6580 
 6581 // ---------------------------------------
// For copySign use 0xE4 as the truth-table immediate (imm8) for vpternlog
// Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
// C (xmm2) is set to 0x7FFFFFFF (every bit except the sign bit)
// Wherever xmm2 is 0, we want to pick from B (the sign operand)
// Wherever xmm2 is 1, we want to pick from A (the magnitude operand, dst)
 6587 //
 6588 // A B C Result
 6589 // 0 0 0 0
 6590 // 0 0 1 0
 6591 // 0 1 0 1
 6592 // 0 1 1 0
 6593 // 1 0 0 0
 6594 // 1 0 1 1
 6595 // 1 1 0 1
 6596 // 1 1 1 1
 6597 //
// Result going from high bit to low bit is 0b11100100 = 0xE4
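//
// Equivalently, 0xE4 encodes the bitwise select (C ? A : B), i.e.
// result = (A & C) | (B & ~C): magnitude bits come from A, the sign bit from B.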
 6599 // ---------------------------------------
 6600 
 6601 #ifdef _LP64
 6602 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6603   match(Set dst (CopySignF dst src));
 6604   effect(TEMP tmp1, TEMP tmp2);
 6605   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6606   ins_encode %{
 6607     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6608     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6609     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6610   %}
 6611   ins_pipe( pipe_slow );
 6612 %}
 6613 
 6614 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6615   match(Set dst (CopySignD dst (Binary src zero)));
 6616   ins_cost(100);
 6617   effect(TEMP tmp1, TEMP tmp2);
 6618   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6619   ins_encode %{
 6620     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6621     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6622     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6623   %}
 6624   ins_pipe( pipe_slow );
 6625 %}
 6626 
 6627 #endif // _LP64
 6628 
 6629 //----------------------------- CompressBits/ExpandBits ------------------------
 6630 
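// CompressBits maps to BMI2 pext (gather the src bits selected by mask into
// the low-order result bits); ExpandBits maps to pdep (scatter the low-order
// src bits into the positions selected by mask). A small worked example:
//
//   pext(src=0b1110, mask=0b1010) == 0b11    // src bits 1 and 3, packed low
//   pdep(src=0b0011, mask=0b1010) == 0b1010  // low two src bits placed at 1,3
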
 6631 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6632   predicate(n->bottom_type()->isa_int());
 6633   match(Set dst (CompressBits src mask));
 6634   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6635   ins_encode %{
 6636     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6637   %}
 6638   ins_pipe( pipe_slow );
 6639 %}
 6640 
 6641 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6642   predicate(n->bottom_type()->isa_int());
 6643   match(Set dst (ExpandBits src mask));
 6644   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6645   ins_encode %{
 6646     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6647   %}
 6648   ins_pipe( pipe_slow );
 6649 %}
 6650 
 6651 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6652   predicate(n->bottom_type()->isa_int());
 6653   match(Set dst (CompressBits src (LoadI mask)));
 6654   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6655   ins_encode %{
 6656     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6657   %}
 6658   ins_pipe( pipe_slow );
 6659 %}
 6660 
 6661 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6662   predicate(n->bottom_type()->isa_int());
 6663   match(Set dst (ExpandBits src (LoadI mask)));
 6664   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6665   ins_encode %{
 6666     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6667   %}
 6668   ins_pipe( pipe_slow );
 6669 %}
 6670 
 6671 // --------------------------------- Sqrt --------------------------------------
 6672 
 6673 instruct vsqrtF_reg(vec dst, vec src) %{
 6674   match(Set dst (SqrtVF src));
 6675   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6676   ins_encode %{
 6677     assert(UseAVX > 0, "required");
 6678     int vlen_enc = vector_length_encoding(this);
 6679     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6680   %}
 6681   ins_pipe( pipe_slow );
 6682 %}
 6683 
 6684 instruct vsqrtF_mem(vec dst, memory mem) %{
 6685   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6686   match(Set dst (SqrtVF (LoadVector mem)));
 6687   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6688   ins_encode %{
 6689     assert(UseAVX > 0, "required");
 6690     int vlen_enc = vector_length_encoding(this);
 6691     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6692   %}
 6693   ins_pipe( pipe_slow );
 6694 %}
 6695 
 6696 // Floating point vector sqrt
 6697 instruct vsqrtD_reg(vec dst, vec src) %{
 6698   match(Set dst (SqrtVD src));
 6699   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6700   ins_encode %{
 6701     assert(UseAVX > 0, "required");
 6702     int vlen_enc = vector_length_encoding(this);
 6703     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6704   %}
 6705   ins_pipe( pipe_slow );
 6706 %}
 6707 
 6708 instruct vsqrtD_mem(vec dst, memory mem) %{
 6709   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6710   match(Set dst (SqrtVD (LoadVector mem)));
 6711   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6712   ins_encode %{
 6713     assert(UseAVX > 0, "required");
 6714     int vlen_enc = vector_length_encoding(this);
 6715     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6716   %}
 6717   ins_pipe( pipe_slow );
 6718 %}
 6719 
 6720 // ------------------------------ Shift ---------------------------------------
 6721 
 6722 // Left and right shift count vectors are the same on x86
 6723 // (only lowest bits of xmm reg are used for count).
 6724 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6725   match(Set dst (LShiftCntV cnt));
 6726   match(Set dst (RShiftCntV cnt));
 6727   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6728   ins_encode %{
 6729     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6730   %}
 6731   ins_pipe( pipe_slow );
 6732 %}
 6733 
 6734 // Byte vector shift
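// x86 has no packed 8-bit shift instructions, so byte shifts are synthesized:
// sign-/zero-extend bytes to 16-bit words, shift the words, mask each word
// back to its low byte, and repack with packuswb. A scalar sketch of one lane
// (illustrative only; 'sign' mirrors the opcode test in the rules below):
//
//   int16_t w   = sign ? (int16_t)(int8_t)b : (int16_t)(uint8_t)b;  // vextendbw
//   int16_t s   = shift_op(w, cnt);                                 // vshiftw
//   uint8_t res = (uint8_t)(s & 0xFF);                              // mask + pack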
 6735 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6736   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6737   match(Set dst ( LShiftVB src shift));
 6738   match(Set dst ( RShiftVB src shift));
 6739   match(Set dst (URShiftVB src shift));
 6740   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6741   format %{"vector_byte_shift $dst,$src,$shift" %}
 6742   ins_encode %{
 6743     assert(UseSSE > 3, "required");
 6744     int opcode = this->ideal_Opcode();
 6745     bool sign = (opcode != Op_URShiftVB);
 6746     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6747     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6748     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6749     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6750     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6751   %}
 6752   ins_pipe( pipe_slow );
 6753 %}
 6754 
 6755 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6756   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6757             UseAVX <= 1);
 6758   match(Set dst ( LShiftVB src shift));
 6759   match(Set dst ( RShiftVB src shift));
 6760   match(Set dst (URShiftVB src shift));
 6761   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6762   format %{"vector_byte_shift $dst,$src,$shift" %}
 6763   ins_encode %{
 6764     assert(UseSSE > 3, "required");
 6765     int opcode = this->ideal_Opcode();
 6766     bool sign = (opcode != Op_URShiftVB);
 6767     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6768     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6769     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6770     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6771     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6772     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6773     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6774     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6775     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6776   %}
 6777   ins_pipe( pipe_slow );
 6778 %}
 6779 
 6780 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6781   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6782             UseAVX > 1);
 6783   match(Set dst ( LShiftVB src shift));
 6784   match(Set dst ( RShiftVB src shift));
 6785   match(Set dst (URShiftVB src shift));
 6786   effect(TEMP dst, TEMP tmp);
 6787   format %{"vector_byte_shift $dst,$src,$shift" %}
 6788   ins_encode %{
 6789     int opcode = this->ideal_Opcode();
 6790     bool sign = (opcode != Op_URShiftVB);
 6791     int vlen_enc = Assembler::AVX_256bit;
 6792     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6793     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6794     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6795     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6796     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6797   %}
 6798   ins_pipe( pipe_slow );
 6799 %}
 6800 
 6801 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6802   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6803   match(Set dst ( LShiftVB src shift));
 6804   match(Set dst ( RShiftVB src shift));
 6805   match(Set dst (URShiftVB src shift));
 6806   effect(TEMP dst, TEMP tmp);
 6807   format %{"vector_byte_shift $dst,$src,$shift" %}
 6808   ins_encode %{
 6809     assert(UseAVX > 1, "required");
 6810     int opcode = this->ideal_Opcode();
 6811     bool sign = (opcode != Op_URShiftVB);
 6812     int vlen_enc = Assembler::AVX_256bit;
 6813     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6814     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6815     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6816     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6817     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6818     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6819     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6820     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6821     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6822   %}
 6823   ins_pipe( pipe_slow );
 6824 %}
 6825 
 6826 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6827   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6828   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
 6830   match(Set dst (URShiftVB src shift));
 6831   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6832   format %{"vector_byte_shift $dst,$src,$shift" %}
 6833   ins_encode %{
 6834     assert(UseAVX > 2, "required");
 6835     int opcode = this->ideal_Opcode();
 6836     bool sign = (opcode != Op_URShiftVB);
 6837     int vlen_enc = Assembler::AVX_512bit;
 6838     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6839     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6840     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6841     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6842     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6843     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6844     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6845     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6846     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6847     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6848     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6849     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6850   %}
 6851   ins_pipe( pipe_slow );
 6852 %}
 6853 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data, because Java converts the short value to an int with
// sign extension before shifting. But char vectors are fine since chars are
// unsigned values.
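// For example, with short s = -4 (0xFFFC), Java evaluates (short)(s >>> 1) by
// widening s to 0xFFFFFFFC, shifting to 0x7FFFFFFE, and narrowing to 0xFFFE
// (-2), whereas a 16-bit psrlw on 0xFFFC would produce 0x7FFE.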
 6858 // Shorts/Chars vector left shift
 6859 instruct vshiftS(vec dst, vec src, vec shift) %{
 6860   predicate(!n->as_ShiftV()->is_var_shift());
 6861   match(Set dst ( LShiftVS src shift));
 6862   match(Set dst ( RShiftVS src shift));
 6863   match(Set dst (URShiftVS src shift));
 6864   effect(TEMP dst, USE src, USE shift);
 6865   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6866   ins_encode %{
 6867     int opcode = this->ideal_Opcode();
 6868     if (UseAVX > 0) {
 6869       int vlen_enc = vector_length_encoding(this);
 6870       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6871     } else {
 6872       int vlen = Matcher::vector_length(this);
 6873       if (vlen == 2) {
 6874         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6875         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6876       } else if (vlen == 4) {
 6877         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6878         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6879       } else {
        assert(vlen == 8, "sanity");
 6881         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6882         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6883       }
 6884     }
 6885   %}
 6886   ins_pipe( pipe_slow );
 6887 %}
 6888 
 6889 // Integers vector left shift
 6890 instruct vshiftI(vec dst, vec src, vec shift) %{
 6891   predicate(!n->as_ShiftV()->is_var_shift());
 6892   match(Set dst ( LShiftVI src shift));
 6893   match(Set dst ( RShiftVI src shift));
 6894   match(Set dst (URShiftVI src shift));
 6895   effect(TEMP dst, USE src, USE shift);
 6896   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6897   ins_encode %{
 6898     int opcode = this->ideal_Opcode();
 6899     if (UseAVX > 0) {
 6900       int vlen_enc = vector_length_encoding(this);
 6901       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6902     } else {
 6903       int vlen = Matcher::vector_length(this);
 6904       if (vlen == 2) {
 6905         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6906         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6907       } else {
 6908         assert(vlen == 4, "sanity");
 6909         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6910         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6911       }
 6912     }
 6913   %}
 6914   ins_pipe( pipe_slow );
 6915 %}
 6916 
 6917 // Integers vector left constant shift
 6918 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6919   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6920   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6921   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6922   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6923   ins_encode %{
 6924     int opcode = this->ideal_Opcode();
 6925     if (UseAVX > 0) {
 6926       int vector_len = vector_length_encoding(this);
 6927       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6928     } else {
 6929       int vlen = Matcher::vector_length(this);
 6930       if (vlen == 2) {
 6931         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6932         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6933       } else {
 6934         assert(vlen == 4, "sanity");
 6935         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6936         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6937       }
 6938     }
 6939   %}
 6940   ins_pipe( pipe_slow );
 6941 %}
 6942 
 6943 // Longs vector shift
 6944 instruct vshiftL(vec dst, vec src, vec shift) %{
 6945   predicate(!n->as_ShiftV()->is_var_shift());
 6946   match(Set dst ( LShiftVL src shift));
 6947   match(Set dst (URShiftVL src shift));
 6948   effect(TEMP dst, USE src, USE shift);
 6949   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6950   ins_encode %{
 6951     int opcode = this->ideal_Opcode();
 6952     if (UseAVX > 0) {
 6953       int vlen_enc = vector_length_encoding(this);
 6954       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6955     } else {
      assert(Matcher::vector_length(this) == 2, "sanity");
 6957       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6958       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6959     }
 6960   %}
 6961   ins_pipe( pipe_slow );
 6962 %}
 6963 
 6964 // Longs vector constant shift
 6965 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6966   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6967   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6968   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6969   ins_encode %{
 6970     int opcode = this->ideal_Opcode();
 6971     if (UseAVX > 0) {
 6972       int vector_len = vector_length_encoding(this);
 6973       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6974     } else {
      assert(Matcher::vector_length(this) == 2, "sanity");
 6976       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6977       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6978     }
 6979   %}
 6980   ins_pipe( pipe_slow );
 6981 %}
 6982 
 6983 // -------------------ArithmeticRightShift -----------------------------------
 6984 // Long vector arithmetic right shift
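// Before AVX-512 there is no vpsraq, so the arithmetic shift is built from
// logical shifts via the sign-mask identity (x >> s) == ((x >>> s) ^ m) - m,
// where m = 0x8000000000000000 >>> s. Worked with 8-bit values for brevity:
// x = 0xF0 (-16), s = 2: x >>> 2 = 0x3C, m = 0x20, and
// (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = 0xFC, i.e. -4 == -16 >> 2.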
 6985 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6986   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6987   match(Set dst (RShiftVL src shift));
 6988   effect(TEMP dst, TEMP tmp);
 6989   format %{ "vshiftq $dst,$src,$shift" %}
 6990   ins_encode %{
 6991     uint vlen = Matcher::vector_length(this);
 6992     if (vlen == 2) {
 6993       assert(UseSSE >= 2, "required");
 6994       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6995       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6996       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6997       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6998       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6999       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7000     } else {
 7001       assert(vlen == 4, "sanity");
 7002       assert(UseAVX > 1, "required");
 7003       int vlen_enc = Assembler::AVX_256bit;
 7004       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7005       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7006       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7007       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7008       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7009     }
 7010   %}
 7011   ins_pipe( pipe_slow );
 7012 %}
 7013 
 7014 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7015   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7016   match(Set dst (RShiftVL src shift));
 7017   format %{ "vshiftq $dst,$src,$shift" %}
 7018   ins_encode %{
 7019     int vlen_enc = vector_length_encoding(this);
 7020     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7021   %}
 7022   ins_pipe( pipe_slow );
 7023 %}
 7024 
 7025 // ------------------- Variable Shift -----------------------------
 7026 // Byte variable shift
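// AVX2 provides per-element variable shifts only for 32/64-bit lanes
// (vpsllvd/vpsrlvd/vpsravd and their q forms); AVX512BW adds 16-bit variants.
// Without AVX512BW, byte and short variable shifts therefore widen each
// element into a 32-bit lane, apply the dword variable shift, and narrow the
// result back.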
 7027 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7028   predicate(Matcher::vector_length(n) <= 8 &&
 7029             n->as_ShiftV()->is_var_shift() &&
 7030             !VM_Version::supports_avx512bw());
 7031   match(Set dst ( LShiftVB src shift));
 7032   match(Set dst ( RShiftVB src shift));
 7033   match(Set dst (URShiftVB src shift));
 7034   effect(TEMP dst, TEMP vtmp);
 7035   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7036   ins_encode %{
 7037     assert(UseAVX >= 2, "required");
 7038 
 7039     int opcode = this->ideal_Opcode();
 7040     int vlen_enc = Assembler::AVX_128bit;
 7041     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7042     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7043   %}
 7044   ins_pipe( pipe_slow );
 7045 %}
 7046 
 7047 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7048   predicate(Matcher::vector_length(n) == 16 &&
 7049             n->as_ShiftV()->is_var_shift() &&
 7050             !VM_Version::supports_avx512bw());
 7051   match(Set dst ( LShiftVB src shift));
 7052   match(Set dst ( RShiftVB src shift));
 7053   match(Set dst (URShiftVB src shift));
 7054   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7055   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7056   ins_encode %{
 7057     assert(UseAVX >= 2, "required");
 7058 
 7059     int opcode = this->ideal_Opcode();
 7060     int vlen_enc = Assembler::AVX_128bit;
 7061     // Shift lower half and get word result in dst
 7062     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7063 
 7064     // Shift upper half and get word result in vtmp1
 7065     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7066     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7067     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7068 
 7069     // Merge and down convert the two word results to byte in dst
 7070     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7071   %}
 7072   ins_pipe( pipe_slow );
 7073 %}
 7074 
 7075 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7076   predicate(Matcher::vector_length(n) == 32 &&
 7077             n->as_ShiftV()->is_var_shift() &&
 7078             !VM_Version::supports_avx512bw());
 7079   match(Set dst ( LShiftVB src shift));
 7080   match(Set dst ( RShiftVB src shift));
 7081   match(Set dst (URShiftVB src shift));
 7082   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7083   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7084   ins_encode %{
 7085     assert(UseAVX >= 2, "required");
 7086 
 7087     int opcode = this->ideal_Opcode();
 7088     int vlen_enc = Assembler::AVX_128bit;
 7089     // Process lower 128 bits and get result in dst
 7090     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7091     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7092     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7093     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7094     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7095 
 7096     // Process higher 128 bits and get result in vtmp3
 7097     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7098     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7099     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7100     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7101     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7102     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7103     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7104 
 7105     // Merge the two results in dst
 7106     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7107   %}
 7108   ins_pipe( pipe_slow );
 7109 %}
 7110 
 7111 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7112   predicate(Matcher::vector_length(n) <= 32 &&
 7113             n->as_ShiftV()->is_var_shift() &&
 7114             VM_Version::supports_avx512bw());
 7115   match(Set dst ( LShiftVB src shift));
 7116   match(Set dst ( RShiftVB src shift));
 7117   match(Set dst (URShiftVB src shift));
 7118   effect(TEMP dst, TEMP vtmp);
 7119   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7120   ins_encode %{
 7121     assert(UseAVX > 2, "required");
 7122 
 7123     int opcode = this->ideal_Opcode();
 7124     int vlen_enc = vector_length_encoding(this);
 7125     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7126   %}
 7127   ins_pipe( pipe_slow );
 7128 %}
 7129 
 7130 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7131   predicate(Matcher::vector_length(n) == 64 &&
 7132             n->as_ShiftV()->is_var_shift() &&
 7133             VM_Version::supports_avx512bw());
 7134   match(Set dst ( LShiftVB src shift));
 7135   match(Set dst ( RShiftVB src shift));
 7136   match(Set dst (URShiftVB src shift));
 7137   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7138   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7139   ins_encode %{
 7140     assert(UseAVX > 2, "required");
 7141 
 7142     int opcode = this->ideal_Opcode();
 7143     int vlen_enc = Assembler::AVX_256bit;
 7144     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7145     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7146     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7147     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7148     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7149   %}
 7150   ins_pipe( pipe_slow );
 7151 %}
 7152 
 7153 // Short variable shift
 7154 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7155   predicate(Matcher::vector_length(n) <= 8 &&
 7156             n->as_ShiftV()->is_var_shift() &&
 7157             !VM_Version::supports_avx512bw());
 7158   match(Set dst ( LShiftVS src shift));
 7159   match(Set dst ( RShiftVS src shift));
 7160   match(Set dst (URShiftVS src shift));
 7161   effect(TEMP dst, TEMP vtmp);
 7162   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7163   ins_encode %{
 7164     assert(UseAVX >= 2, "required");
 7165 
 7166     int opcode = this->ideal_Opcode();
 7167     bool sign = (opcode != Op_URShiftVS);
 7168     int vlen_enc = Assembler::AVX_256bit;
 7169     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7170     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7171     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7172     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7173     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7174     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7175   %}
 7176   ins_pipe( pipe_slow );
 7177 %}
 7178 
 7179 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7180   predicate(Matcher::vector_length(n) == 16 &&
 7181             n->as_ShiftV()->is_var_shift() &&
 7182             !VM_Version::supports_avx512bw());
 7183   match(Set dst ( LShiftVS src shift));
 7184   match(Set dst ( RShiftVS src shift));
 7185   match(Set dst (URShiftVS src shift));
 7186   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7187   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7188   ins_encode %{
 7189     assert(UseAVX >= 2, "required");
 7190 
 7191     int opcode = this->ideal_Opcode();
 7192     bool sign = (opcode != Op_URShiftVS);
 7193     int vlen_enc = Assembler::AVX_256bit;
 7194     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7195     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7196     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7197     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7198     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7199 
 7200     // Shift upper half, with result in dst using vtmp1 as TEMP
 7201     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7202     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7203     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7204     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7205     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7206     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7207 
 7208     // Merge lower and upper half result into dst
 7209     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7210     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7211   %}
 7212   ins_pipe( pipe_slow );
 7213 %}
 7214 
 7215 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7216   predicate(n->as_ShiftV()->is_var_shift() &&
 7217             VM_Version::supports_avx512bw());
 7218   match(Set dst ( LShiftVS src shift));
 7219   match(Set dst ( RShiftVS src shift));
 7220   match(Set dst (URShiftVS src shift));
 7221   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7222   ins_encode %{
 7223     assert(UseAVX > 2, "required");
 7224 
 7225     int opcode = this->ideal_Opcode();
 7226     int vlen_enc = vector_length_encoding(this);
 7227     if (!VM_Version::supports_avx512vl()) {
 7228       vlen_enc = Assembler::AVX_512bit;
 7229     }
 7230     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7231   %}
 7232   ins_pipe( pipe_slow );
 7233 %}
 7234 
// Integer variable shift
 7236 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7237   predicate(n->as_ShiftV()->is_var_shift());
 7238   match(Set dst ( LShiftVI src shift));
 7239   match(Set dst ( RShiftVI src shift));
 7240   match(Set dst (URShiftVI src shift));
 7241   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7242   ins_encode %{
 7243     assert(UseAVX >= 2, "required");
 7244 
 7245     int opcode = this->ideal_Opcode();
 7246     int vlen_enc = vector_length_encoding(this);
 7247     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7248   %}
 7249   ins_pipe( pipe_slow );
 7250 %}
 7251 
// Long variable shift
 7253 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7254   predicate(n->as_ShiftV()->is_var_shift());
 7255   match(Set dst ( LShiftVL src shift));
 7256   match(Set dst (URShiftVL src shift));
 7257   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7258   ins_encode %{
 7259     assert(UseAVX >= 2, "required");
 7260 
 7261     int opcode = this->ideal_Opcode();
 7262     int vlen_enc = vector_length_encoding(this);
 7263     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7264   %}
 7265   ins_pipe( pipe_slow );
 7266 %}
 7267 
// Long variable arithmetic right shift
 7269 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7270   predicate(Matcher::vector_length(n) <= 4 &&
 7271             n->as_ShiftV()->is_var_shift() &&
 7272             UseAVX == 2);
 7273   match(Set dst (RShiftVL src shift));
 7274   effect(TEMP dst, TEMP vtmp);
 7275   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7276   ins_encode %{
 7277     int opcode = this->ideal_Opcode();
 7278     int vlen_enc = vector_length_encoding(this);
 7279     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7280                  $vtmp$$XMMRegister);
 7281   %}
 7282   ins_pipe( pipe_slow );
 7283 %}
 7284 
 7285 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7286   predicate(n->as_ShiftV()->is_var_shift() &&
 7287             UseAVX > 2);
 7288   match(Set dst (RShiftVL src shift));
 7289   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 7290   ins_encode %{
 7291     int opcode = this->ideal_Opcode();
 7292     int vlen_enc = vector_length_encoding(this);
 7293     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7294   %}
 7295   ins_pipe( pipe_slow );
 7296 %}
 7297 
 7298 // --------------------------------- AND --------------------------------------
 7299 
 7300 instruct vand(vec dst, vec src) %{
 7301   predicate(UseAVX == 0);
 7302   match(Set dst (AndV dst src));
 7303   format %{ "pand    $dst,$src\t! and vectors" %}
 7304   ins_encode %{
 7305     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7306   %}
 7307   ins_pipe( pipe_slow );
 7308 %}
 7309 
 7310 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7311   predicate(UseAVX > 0);
 7312   match(Set dst (AndV src1 src2));
 7313   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7314   ins_encode %{
 7315     int vlen_enc = vector_length_encoding(this);
 7316     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7317   %}
 7318   ins_pipe( pipe_slow );
 7319 %}
 7320 
 7321 instruct vand_mem(vec dst, vec src, memory mem) %{
 7322   predicate((UseAVX > 0) &&
 7323             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7324   match(Set dst (AndV src (LoadVector mem)));
 7325   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7326   ins_encode %{
 7327     int vlen_enc = vector_length_encoding(this);
 7328     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7329   %}
 7330   ins_pipe( pipe_slow );
 7331 %}
 7332 
 7333 // --------------------------------- OR ---------------------------------------
 7334 
 7335 instruct vor(vec dst, vec src) %{
 7336   predicate(UseAVX == 0);
 7337   match(Set dst (OrV dst src));
 7338   format %{ "por     $dst,$src\t! or vectors" %}
 7339   ins_encode %{
 7340     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7341   %}
 7342   ins_pipe( pipe_slow );
 7343 %}
 7344 
 7345 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7346   predicate(UseAVX > 0);
 7347   match(Set dst (OrV src1 src2));
 7348   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7349   ins_encode %{
 7350     int vlen_enc = vector_length_encoding(this);
 7351     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7352   %}
 7353   ins_pipe( pipe_slow );
 7354 %}
 7355 
 7356 instruct vor_mem(vec dst, vec src, memory mem) %{
 7357   predicate((UseAVX > 0) &&
 7358             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7359   match(Set dst (OrV src (LoadVector mem)));
 7360   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7361   ins_encode %{
 7362     int vlen_enc = vector_length_encoding(this);
 7363     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7364   %}
 7365   ins_pipe( pipe_slow );
 7366 %}
 7367 
 7368 // --------------------------------- XOR --------------------------------------
 7369 
 7370 instruct vxor(vec dst, vec src) %{
 7371   predicate(UseAVX == 0);
 7372   match(Set dst (XorV dst src));
 7373   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7374   ins_encode %{
 7375     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7376   %}
 7377   ins_pipe( pipe_slow );
 7378 %}
 7379 
 7380 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7381   predicate(UseAVX > 0);
 7382   match(Set dst (XorV src1 src2));
 7383   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7384   ins_encode %{
 7385     int vlen_enc = vector_length_encoding(this);
 7386     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7387   %}
 7388   ins_pipe( pipe_slow );
 7389 %}
 7390 
 7391 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7392   predicate((UseAVX > 0) &&
 7393             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7394   match(Set dst (XorV src (LoadVector mem)));
 7395   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7396   ins_encode %{
 7397     int vlen_enc = vector_length_encoding(this);
 7398     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7399   %}
 7400   ins_pipe( pipe_slow );
 7401 %}
 7402 
 7403 // --------------------------------- VectorCast --------------------------------------
 7404 
 7405 instruct vcastBtoX(vec dst, vec src) %{
 7406   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7407   match(Set dst (VectorCastB2X src));
 7408   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7409   ins_encode %{
 7410     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7411     int vlen_enc = vector_length_encoding(this);
 7412     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7413   %}
 7414   ins_pipe( pipe_slow );
 7415 %}
 7416 
 7417 instruct vcastBtoD(legVec dst, legVec src) %{
 7418   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7419   match(Set dst (VectorCastB2X src));
 7420   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7421   ins_encode %{
 7422     int vlen_enc = vector_length_encoding(this);
 7423     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7424   %}
 7425   ins_pipe( pipe_slow );
 7426 %}
 7427 
 7428 instruct castStoX(vec dst, vec src) %{
 7429   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7430             Matcher::vector_length(n->in(1)) <= 8 && // src
 7431             Matcher::vector_element_basic_type(n) == T_BYTE);
 7432   match(Set dst (VectorCastS2X src));
 7433   format %{ "vector_cast_s2x $dst,$src" %}
 7434   ins_encode %{
 7435     assert(UseAVX > 0, "required");
 7436 
 7437     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7438     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7439   %}
 7440   ins_pipe( pipe_slow );
 7441 %}
 7442 
 7443 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7444   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7445             Matcher::vector_length(n->in(1)) == 16 && // src
 7446             Matcher::vector_element_basic_type(n) == T_BYTE);
 7447   effect(TEMP dst, TEMP vtmp);
 7448   match(Set dst (VectorCastS2X src));
 7449   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7450   ins_encode %{
 7451     assert(UseAVX > 0, "required");
 7452 
 7453     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7454     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7455     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7456     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7457   %}
 7458   ins_pipe( pipe_slow );
 7459 %}
 7460 
 7461 instruct vcastStoX_evex(vec dst, vec src) %{
 7462   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7463             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7464   match(Set dst (VectorCastS2X src));
 7465   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7466   ins_encode %{
 7467     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7468     int src_vlen_enc = vector_length_encoding(this, $src);
 7469     int vlen_enc = vector_length_encoding(this);
 7470     switch (to_elem_bt) {
 7471       case T_BYTE:
 7472         if (!VM_Version::supports_avx512vl()) {
 7473           vlen_enc = Assembler::AVX_512bit;
 7474         }
 7475         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7476         break;
 7477       case T_INT:
 7478         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7479         break;
 7480       case T_FLOAT:
 7481         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7482         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7483         break;
 7484       case T_LONG:
 7485         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7486         break;
 7487       case T_DOUBLE: {
 7488         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7489         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7490         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7491         break;
 7492       }
 7493       default:
 7494         ShouldNotReachHere();
 7495     }
 7496   %}
 7497   ins_pipe( pipe_slow );
 7498 %}
 7499 
 7500 instruct castItoX(vec dst, vec src) %{
 7501   predicate(UseAVX <= 2 &&
 7502             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7503             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7504   match(Set dst (VectorCastI2X src));
 7505   format %{ "vector_cast_i2x $dst,$src" %}
 7506   ins_encode %{
 7507     assert(UseAVX > 0, "required");
 7508 
 7509     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7510     int vlen_enc = vector_length_encoding(this, $src);
 7511 
 7512     if (to_elem_bt == T_BYTE) {
 7513       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7514       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7515       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7516     } else {
 7517       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7518       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7519       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7520     }
 7521   %}
 7522   ins_pipe( pipe_slow );
 7523 %}
 7524 
 7525 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7526   predicate(UseAVX <= 2 &&
 7527             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7528             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7529   match(Set dst (VectorCastI2X src));
 7530   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7531   effect(TEMP dst, TEMP vtmp);
 7532   ins_encode %{
 7533     assert(UseAVX > 0, "required");
 7534 
 7535     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7536     int vlen_enc = vector_length_encoding(this, $src);
 7537 
 7538     if (to_elem_bt == T_BYTE) {
 7539       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7540       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7541       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7542       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7543     } else {
 7544       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7545       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7546       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7547       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7548     }
 7549   %}
 7550   ins_pipe( pipe_slow );
 7551 %}
 7552 
 7553 instruct vcastItoX_evex(vec dst, vec src) %{
 7554   predicate(UseAVX > 2 ||
 7555             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7556   match(Set dst (VectorCastI2X src));
 7557   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7558   ins_encode %{
 7559     assert(UseAVX > 0, "required");
 7560 
 7561     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7562     int src_vlen_enc = vector_length_encoding(this, $src);
 7563     int dst_vlen_enc = vector_length_encoding(this);
 7564     switch (dst_elem_bt) {
 7565       case T_BYTE:
 7566         if (!VM_Version::supports_avx512vl()) {
 7567           src_vlen_enc = Assembler::AVX_512bit;
 7568         }
 7569         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7570         break;
 7571       case T_SHORT:
 7572         if (!VM_Version::supports_avx512vl()) {
 7573           src_vlen_enc = Assembler::AVX_512bit;
 7574         }
 7575         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7576         break;
 7577       case T_FLOAT:
 7578         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7579         break;
 7580       case T_LONG:
 7581         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7582         break;
 7583       case T_DOUBLE:
 7584         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7585         break;
 7586       default:
 7587         ShouldNotReachHere();
 7588     }
 7589   %}
 7590   ins_pipe( pipe_slow );
 7591 %}
 7592 
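// Long-to-byte/short narrowing without AVX512: a shuffle gathers the low dword of
// each quadword, the mask clears the upper bits, and the result is packed down to
// shorts (and once more to bytes for T_BYTE).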
 7593 instruct vcastLtoBS(vec dst, vec src) %{
 7594   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7595             UseAVX <= 2);
 7596   match(Set dst (VectorCastL2X src));
 7597   format %{ "vector_cast_l2x  $dst,$src" %}
 7598   ins_encode %{
 7599     assert(UseAVX > 0, "required");
 7600 
 7601     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7602     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7603     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7604                                                       : ExternalAddress(vector_int_to_short_mask());
 7605     if (vlen <= 16) {
 7606       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7607       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7608       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7609     } else {
 7610       assert(vlen <= 32, "required");
 7611       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7612       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7613       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7614       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7615     }
 7616     if (to_elem_bt == T_BYTE) {
 7617       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7618     }
 7619   %}
 7620   ins_pipe( pipe_slow );
 7621 %}
 7622 
 7623 instruct vcastLtoX_evex(vec dst, vec src) %{
 7624   predicate(UseAVX > 2 ||
 7625             (Matcher::vector_element_basic_type(n) == T_INT ||
 7626              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7627              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7628   match(Set dst (VectorCastL2X src));
 7629   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7630   ins_encode %{
 7631     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7632     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7633     int vlen_enc = vector_length_encoding(this, $src);
 7634     switch (to_elem_bt) {
 7635       case T_BYTE:
 7636         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7637           vlen_enc = Assembler::AVX_512bit;
 7638         }
 7639         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7640         break;
 7641       case T_SHORT:
 7642         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7643           vlen_enc = Assembler::AVX_512bit;
 7644         }
 7645         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7646         break;
 7647       case T_INT:
 7648         if (vlen == 8) {
 7649           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7650             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7651           }
 7652         } else if (vlen == 16) {
 7653           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7654         } else if (vlen == 32) {
 7655           if (UseAVX > 2) {
 7656             if (!VM_Version::supports_avx512vl()) {
 7657               vlen_enc = Assembler::AVX_512bit;
 7658             }
 7659             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7660           } else {
 7661             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7662             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7663           }
 7664         } else { // vlen == 64
 7665           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7666         }
 7667         break;
 7668       case T_FLOAT:
 7669         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7670         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7671         break;
 7672       case T_DOUBLE:
 7673         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7674         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7675         break;
 7676 
 7677       default: assert(false, "%s", type2name(to_elem_bt));
 7678     }
 7679   %}
 7680   ins_pipe( pipe_slow );
 7681 %}
 7682 
 7683 instruct vcastFtoD_reg(vec dst, vec src) %{
 7684   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7685   match(Set dst (VectorCastF2X src));
 7686   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7687   ins_encode %{
 7688     int vlen_enc = vector_length_encoding(this);
 7689     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7690   %}
 7691   ins_pipe( pipe_slow );
 7692 %}
 7693 
 7694 
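// Java float-to-integral casts saturate: NaN becomes 0 and out-of-range values clamp
// to MIN_VALUE/MAX_VALUE, whereas the x86 cvtt* conversions return the "integer
// indefinite" value instead. The vector_castF2X_* helpers repair those lanes, which
// is why these rules need several temporaries and kill the flags.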
 7695 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7696   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7697             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7698   match(Set dst (VectorCastF2X src));
 7699   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7700   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7701   ins_encode %{
 7702     int vlen_enc = vector_length_encoding(this, $src);
 7703     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register when loading
    // addresses wider than 32 bits for register-indirect addressing: stub constants
    // live in the code cache, and ReservedCodeCacheSize is currently capped at 2G.
    // Targets are free to raise that limit, but a code cache larger than 2G is
    // unrealistic in practice. On the upside, staying within the cap saves a
    // temporary register, which in the limiting case can prevent spilling in
    // blocks with high register pressure.
 7711     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7712                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7713                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7714   %}
 7715   ins_pipe( pipe_slow );
 7716 %}
 7717 
 7718 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7719   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7720             is_integral_type(Matcher::vector_element_basic_type(n)));
 7721   match(Set dst (VectorCastF2X src));
 7722   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7723   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7724   ins_encode %{
 7725     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7726     if (to_elem_bt == T_LONG) {
 7727       int vlen_enc = vector_length_encoding(this);
 7728       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7729                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7730                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7731     } else {
 7732       int vlen_enc = vector_length_encoding(this, $src);
 7733       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7734                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7735                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7736     }
 7737   %}
 7738   ins_pipe( pipe_slow );
 7739 %}
 7740 
 7741 instruct vcastDtoF_reg(vec dst, vec src) %{
 7742   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7743   match(Set dst (VectorCastD2X src));
 7744   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7745   ins_encode %{
 7746     int vlen_enc = vector_length_encoding(this, $src);
 7747     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7748   %}
 7749   ins_pipe( pipe_slow );
 7750 %}
 7751 
 7752 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7753   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7754             is_integral_type(Matcher::vector_element_basic_type(n)));
 7755   match(Set dst (VectorCastD2X src));
 7756   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7757   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7758   ins_encode %{
 7759     int vlen_enc = vector_length_encoding(this, $src);
 7760     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7761     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7762                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7763                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7764   %}
 7765   ins_pipe( pipe_slow );
 7766 %}
 7767 
 7768 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7769   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7770             is_integral_type(Matcher::vector_element_basic_type(n)));
 7771   match(Set dst (VectorCastD2X src));
 7772   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7773   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7774   ins_encode %{
 7775     int vlen_enc = vector_length_encoding(this, $src);
 7776     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7777     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7778                               ExternalAddress(vector_float_signflip());
 7779     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7780                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7781   %}
 7782   ins_pipe( pipe_slow );
 7783 %}
 7784 
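// Unsigned (zero-extending) widening casts. A single rule covers all source types;
// vector_unsigned_cast dispatches on the from/to element types.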
 7785 instruct vucast(vec dst, vec src) %{
 7786   match(Set dst (VectorUCastB2X src));
 7787   match(Set dst (VectorUCastS2X src));
 7788   match(Set dst (VectorUCastI2X src));
 7789   format %{ "vector_ucast $dst,$src\t!" %}
 7790   ins_encode %{
 7791     assert(UseAVX > 0, "required");
 7792 
 7793     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7794     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7795     int vlen_enc = vector_length_encoding(this);
 7796     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7797   %}
 7798   ins_pipe( pipe_slow );
 7799 %}
 7800 
 7801 #ifdef _LP64
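// RoundVF/RoundVD implement Math.round, i.e. floor(x + 0.5). The helpers convert
// under a custom MXCSR: 0x3F80 keeps all exceptions masked and sets the rounding
// control to round-down; the 0x3FBF variant used with EnableX86ECoreOpts also
// pre-sets the exception flag bits.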
 7802 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7803   predicate(!VM_Version::supports_avx512vl() &&
 7804             Matcher::vector_length_in_bytes(n) < 64 &&
 7805             Matcher::vector_element_basic_type(n) == T_INT);
 7806   match(Set dst (RoundVF src));
 7807   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7808   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7809   ins_encode %{
 7810     int vlen_enc = vector_length_encoding(this);
 7811     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7812     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7813                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7814                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7815   %}
 7816   ins_pipe( pipe_slow );
 7817 %}
 7818 
 7819 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7820   predicate((VM_Version::supports_avx512vl() ||
 7821              Matcher::vector_length_in_bytes(n) == 64) &&
 7822              Matcher::vector_element_basic_type(n) == T_INT);
 7823   match(Set dst (RoundVF src));
 7824   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7825   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7826   ins_encode %{
 7827     int vlen_enc = vector_length_encoding(this);
 7828     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7829     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7830                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7831                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7832   %}
 7833   ins_pipe( pipe_slow );
 7834 %}
 7835 
 7836 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7837   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7838   match(Set dst (RoundVD src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7840   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7841   ins_encode %{
 7842     int vlen_enc = vector_length_encoding(this);
 7843     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7844     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7845                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7846                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7847   %}
 7848   ins_pipe( pipe_slow );
 7849 %}
 7850 
 7851 #endif // _LP64
 7852 
 7853 // --------------------------------- VectorMaskCmp --------------------------------------
 7854 
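// The result representation depends on the target: without AVX512 the destination is
// a vector with every bit set in true lanes; on AVX512 the compare produces a kReg,
// which is either materialized back into a vector (via vector_all_bits_set) when the
// node's type is not a vectmask, or kept in the mask register when it is.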
 7855 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7856   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7857             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7858             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7859             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7860   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7861   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7862   ins_encode %{
 7863     int vlen_enc = vector_length_encoding(this, $src1);
 7864     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7865     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7866       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7867     } else {
 7868       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7869     }
 7870   %}
 7871   ins_pipe( pipe_slow );
 7872 %}
 7873 
 7874 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7875   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7876             n->bottom_type()->isa_vectmask() == nullptr &&
 7877             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7878   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7879   effect(TEMP ktmp);
 7880   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7881   ins_encode %{
 7882     int vlen_enc = Assembler::AVX_512bit;
 7883     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7884     KRegister mask = k0; // The comparison itself is not being masked.
 7885     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7886       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7887       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7888     } else {
 7889       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7890       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7891     }
 7892   %}
 7893   ins_pipe( pipe_slow );
 7894 %}
 7895 
 7896 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7897   predicate(n->bottom_type()->isa_vectmask() &&
 7898             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7899   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7900   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7901   ins_encode %{
 7902     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7903     int vlen_enc = vector_length_encoding(this, $src1);
 7904     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7905     KRegister mask = k0; // The comparison itself is not being masked.
 7906     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7907       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7908     } else {
 7909       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7910     }
 7911   %}
 7912   ins_pipe( pipe_slow );
 7913 %}
 7914 
 7915 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7916   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7917             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7918             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7919             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7920             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7921             (n->in(2)->get_int() == BoolTest::eq ||
 7922              n->in(2)->get_int() == BoolTest::lt ||
 7923              n->in(2)->get_int() == BoolTest::gt)); // cond
 7924   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7925   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7926   ins_encode %{
 7927     int vlen_enc = vector_length_encoding(this, $src1);
 7928     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7929     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7930     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7931   %}
 7932   ins_pipe( pipe_slow );
 7933 %}
 7934 
 7935 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7936   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7937             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7938             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7939             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7940             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7941             (n->in(2)->get_int() == BoolTest::ne ||
 7942              n->in(2)->get_int() == BoolTest::le ||
 7943              n->in(2)->get_int() == BoolTest::ge)); // cond
 7944   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7945   effect(TEMP dst, TEMP xtmp);
 7946   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7947   ins_encode %{
 7948     int vlen_enc = vector_length_encoding(this, $src1);
 7949     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7950     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7951     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7952   %}
 7953   ins_pipe( pipe_slow );
 7954 %}
 7955 
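// Unsigned integer comparison: flip the sign bit of both operands (XOR with the
// per-element high-bit constant) and compare signed. Biasing both sides preserves
// the ordering, so the signed predicate yields the unsigned result.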
 7956 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7957   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7958             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7959             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7960             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7961             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7962   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7963   effect(TEMP dst, TEMP xtmp);
 7964   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7965   ins_encode %{
 7966     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7967     int vlen_enc = vector_length_encoding(this, $src1);
 7968     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7969     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7970 
 7971     if (vlen_enc == Assembler::AVX_128bit) {
 7972       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7973     } else {
 7974       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7975     }
 7976     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7977     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7978     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7979   %}
 7980   ins_pipe( pipe_slow );
 7981 %}
 7982 
 7983 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7984   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7985              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7986              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7987   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7988   effect(TEMP ktmp);
 7989   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7990   ins_encode %{
 7991     assert(UseAVX > 2, "required");
 7992 
 7993     int vlen_enc = vector_length_encoding(this, $src1);
 7994     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7995     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7996     KRegister mask = k0; // The comparison itself is not being masked.
 7997     bool merge = false;
 7998     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7999 
 8000     switch (src1_elem_bt) {
 8001       case T_INT: {
 8002         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8003         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8004         break;
 8005       }
 8006       case T_LONG: {
 8007         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8008         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8009         break;
 8010       }
 8011       default: assert(false, "%s", type2name(src1_elem_bt));
 8012     }
 8013   %}
 8014   ins_pipe( pipe_slow );
 8015 %}
 8016 
 8017 
 8018 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8019   predicate(n->bottom_type()->isa_vectmask() &&
 8020             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8021   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8022   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 8023   ins_encode %{
 8024     assert(UseAVX > 2, "required");
 8025     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8026 
 8027     int vlen_enc = vector_length_encoding(this, $src1);
 8028     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8029     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8030     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8031 
    // Comparison result goes straight into the destination mask register.
 8033     switch (src1_elem_bt) {
 8034       case T_BYTE: {
 8035         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8036         break;
 8037       }
 8038       case T_SHORT: {
 8039         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8040         break;
 8041       }
 8042       case T_INT: {
 8043         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8044         break;
 8045       }
 8046       case T_LONG: {
 8047         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8048         break;
 8049       }
 8050       default: assert(false, "%s", type2name(src1_elem_bt));
 8051     }
 8052   %}
 8053   ins_pipe( pipe_slow );
 8054 %}
 8055 
 8056 // Extract
 8057 
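// For vectors up to 128 bits the element is read directly with get_elem; for wider
// vectors get_lane first copies the containing 128-bit lane into vtmp, then get_elem
// picks the element out of that lane.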
 8058 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8059   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8060   match(Set dst (ExtractI src idx));
 8061   match(Set dst (ExtractS src idx));
 8062 #ifdef _LP64
 8063   match(Set dst (ExtractB src idx));
 8064 #endif
 8065   format %{ "extractI $dst,$src,$idx\t!" %}
 8066   ins_encode %{
 8067     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8068 
 8069     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8070     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8071   %}
 8072   ins_pipe( pipe_slow );
 8073 %}
 8074 
 8075 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8076   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8077             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8078   match(Set dst (ExtractI src idx));
 8079   match(Set dst (ExtractS src idx));
 8080 #ifdef _LP64
 8081   match(Set dst (ExtractB src idx));
 8082 #endif
 8083   effect(TEMP vtmp);
 8084   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8085   ins_encode %{
 8086     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8087 
 8088     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8089     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8090     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8091   %}
 8092   ins_pipe( pipe_slow );
 8093 %}
 8094 
 8095 #ifdef _LP64
 8096 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8097   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8098   match(Set dst (ExtractL src idx));
 8099   format %{ "extractL $dst,$src,$idx\t!" %}
 8100   ins_encode %{
 8101     assert(UseSSE >= 4, "required");
 8102     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8103 
 8104     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8105   %}
 8106   ins_pipe( pipe_slow );
 8107 %}
 8108 
 8109 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8110   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8111             Matcher::vector_length(n->in(1)) == 8);  // src
 8112   match(Set dst (ExtractL src idx));
 8113   effect(TEMP vtmp);
 8114   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8115   ins_encode %{
 8116     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8117 
 8118     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8119     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8120   %}
 8121   ins_pipe( pipe_slow );
 8122 %}
 8123 #endif
 8124 
 8125 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8126   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8127   match(Set dst (ExtractF src idx));
 8128   effect(TEMP dst, TEMP vtmp);
 8129   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8130   ins_encode %{
 8131     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8132 
 8133     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8134   %}
 8135   ins_pipe( pipe_slow );
 8136 %}
 8137 
 8138 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8139   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8140             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8141   match(Set dst (ExtractF src idx));
 8142   effect(TEMP vtmp);
 8143   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8144   ins_encode %{
 8145     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8146 
 8147     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8148     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8149   %}
 8150   ins_pipe( pipe_slow );
 8151 %}
 8152 
 8153 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8154   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8155   match(Set dst (ExtractD src idx));
 8156   format %{ "extractD $dst,$src,$idx\t!" %}
 8157   ins_encode %{
 8158     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8159 
 8160     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8161   %}
 8162   ins_pipe( pipe_slow );
 8163 %}
 8164 
 8165 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8166   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8167             Matcher::vector_length(n->in(1)) == 8);  // src
 8168   match(Set dst (ExtractD src idx));
 8169   effect(TEMP vtmp);
 8170   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8171   ins_encode %{
 8172     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8173 
 8174     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8175     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8176   %}
 8177   ins_pipe( pipe_slow );
 8178 %}
 8179 
 8180 // --------------------------------- Vector Blend --------------------------------------
 8181 
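// VectorBlend: dst gets src2 where the mask lane is true and src1 where it is false.
// The SSE form relies on pblendvb's implicit xmm0 mask operand; with
// EnableX86ECoreOpts the variable blend is decomposed into vpandn/vpand/vpor instead.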
 8182 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8183   predicate(UseAVX == 0);
 8184   match(Set dst (VectorBlend (Binary dst src) mask));
 8185   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8186   effect(TEMP tmp);
 8187   ins_encode %{
 8188     assert(UseSSE >= 4, "required");
 8189 
 8190     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8191       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8192     }
 8193     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8194   %}
 8195   ins_pipe( pipe_slow );
 8196 %}
 8197 
 8198 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8199   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8200             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8201             Matcher::vector_length_in_bytes(n) <= 32 &&
 8202             is_integral_type(Matcher::vector_element_basic_type(n)));
 8203   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8204   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8205   ins_encode %{
 8206     int vlen_enc = vector_length_encoding(this);
 8207     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8208   %}
 8209   ins_pipe( pipe_slow );
 8210 %}
 8211 
 8212 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8213   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8214             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8215             Matcher::vector_length_in_bytes(n) <= 32 &&
 8216             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8217   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8218   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8219   ins_encode %{
 8220     int vlen_enc = vector_length_encoding(this);
 8221     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8222   %}
 8223   ins_pipe( pipe_slow );
 8224 %}
 8225 
 8226 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8227   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8228             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8229             Matcher::vector_length_in_bytes(n) <= 32);
 8230   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8231   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8232   effect(TEMP vtmp, TEMP dst);
 8233   ins_encode %{
 8234     int vlen_enc = vector_length_encoding(this);
 8235     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8236     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8237     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8238   %}
 8239   ins_pipe( pipe_slow );
 8240 %}
 8241 
 8242 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8243   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8244             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8245   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8246   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8247   effect(TEMP ktmp);
 8248   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8251     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8252     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8253   %}
 8254   ins_pipe( pipe_slow );
 8255 %}
 8256 
 8257 
 8258 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8259   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8260             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8261              VM_Version::supports_avx512bw()));
 8262   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8263   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8264   ins_encode %{
 8265     int vlen_enc = vector_length_encoding(this);
 8266     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8267     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8268   %}
 8269   ins_pipe( pipe_slow );
 8270 %}
 8271 
 8272 // --------------------------------- ABS --------------------------------------
 8273 // a = |a|
 8274 instruct vabsB_reg(vec dst, vec src) %{
 8275   match(Set dst (AbsVB  src));
 8276   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8277   ins_encode %{
 8278     uint vlen = Matcher::vector_length(this);
 8279     if (vlen <= 16) {
 8280       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8281     } else {
 8282       int vlen_enc = vector_length_encoding(this);
 8283       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8284     }
 8285   %}
 8286   ins_pipe( pipe_slow );
 8287 %}
 8288 
 8289 instruct vabsS_reg(vec dst, vec src) %{
 8290   match(Set dst (AbsVS  src));
 8291   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8292   ins_encode %{
 8293     uint vlen = Matcher::vector_length(this);
 8294     if (vlen <= 8) {
 8295       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8296     } else {
 8297       int vlen_enc = vector_length_encoding(this);
 8298       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8299     }
 8300   %}
 8301   ins_pipe( pipe_slow );
 8302 %}
 8303 
 8304 instruct vabsI_reg(vec dst, vec src) %{
 8305   match(Set dst (AbsVI  src));
 8306   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8307   ins_encode %{
 8308     uint vlen = Matcher::vector_length(this);
 8309     if (vlen <= 4) {
 8310       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8311     } else {
 8312       int vlen_enc = vector_length_encoding(this);
 8313       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8314     }
 8315   %}
 8316   ins_pipe( pipe_slow );
 8317 %}
 8318 
 8319 instruct vabsL_reg(vec dst, vec src) %{
 8320   match(Set dst (AbsVL  src));
 8321   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8322   ins_encode %{
 8323     assert(UseAVX > 2, "required");
 8324     int vlen_enc = vector_length_encoding(this);
 8325     if (!VM_Version::supports_avx512vl()) {
 8326       vlen_enc = Assembler::AVX_512bit;
 8327     }
 8328     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8329   %}
 8330   ins_pipe( pipe_slow );
 8331 %}
 8332 
 8333 // --------------------------------- ABSNEG --------------------------------------
 8334 
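// Abs and Neg share one rule per FP type: vabsneg* selects the operation from
// ideal_Opcode() and applies a packed sign-bit mask constant (clearing the sign bit
// for Abs, flipping it for Neg); this is the "[mask]" operand in the format strings.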
 8335 instruct vabsnegF(vec dst, vec src) %{
 8336   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8337   match(Set dst (AbsVF src));
 8338   match(Set dst (NegVF src));
 8339   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8340   ins_cost(150);
 8341   ins_encode %{
 8342     int opcode = this->ideal_Opcode();
 8343     int vlen = Matcher::vector_length(this);
 8344     if (vlen == 2) {
 8345       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8346     } else {
 8347       assert(vlen == 8 || vlen == 16, "required");
 8348       int vlen_enc = vector_length_encoding(this);
 8349       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8350     }
 8351   %}
 8352   ins_pipe( pipe_slow );
 8353 %}
 8354 
 8355 instruct vabsneg4F(vec dst) %{
 8356   predicate(Matcher::vector_length(n) == 4);
 8357   match(Set dst (AbsVF dst));
 8358   match(Set dst (NegVF dst));
 8359   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8360   ins_cost(150);
 8361   ins_encode %{
 8362     int opcode = this->ideal_Opcode();
 8363     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8364   %}
 8365   ins_pipe( pipe_slow );
 8366 %}
 8367 
 8368 instruct vabsnegD(vec dst, vec src) %{
 8369   match(Set dst (AbsVD  src));
 8370   match(Set dst (NegVD  src));
 8371   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8372   ins_encode %{
 8373     int opcode = this->ideal_Opcode();
 8374     uint vlen = Matcher::vector_length(this);
 8375     if (vlen == 2) {
 8376       assert(UseSSE >= 2, "required");
 8377       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8378     } else {
 8379       int vlen_enc = vector_length_encoding(this);
 8380       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8381     }
 8382   %}
 8383   ins_pipe( pipe_slow );
 8384 %}
 8385 
 8386 //------------------------------------- VectorTest --------------------------------------------
 8387 
 8388 #ifdef _LP64
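// VectorTest only produces flags. Vector operands go through vectortest (ptest);
// opmask operands use kortest, except that masks shorter than 8 lanes (or 8 lanes
// without AVX512DQ) are moved to a GPR first so the unused high bits can be cleared.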
 8389 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8390   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8391   match(Set cr (VectorTest src1 src2));
 8392   effect(TEMP vtmp);
 8393   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8394   ins_encode %{
 8395     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8396     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8397     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8398   %}
 8399   ins_pipe( pipe_slow );
 8400 %}
 8401 
 8402 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8403   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8404   match(Set cr (VectorTest src1 src2));
 8405   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8406   ins_encode %{
 8407     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8408     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8409     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8410   %}
 8411   ins_pipe( pipe_slow );
 8412 %}
 8413 
 8414 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8415   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8416              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8417             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8418   match(Set cr (VectorTest src1 src2));
 8419   effect(TEMP tmp);
 8420   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8421   ins_encode %{
 8422     uint masklen = Matcher::vector_length(this, $src1);
 8423     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8424     __ andl($tmp$$Register, (1 << masklen) - 1);
 8425     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8426   %}
 8427   ins_pipe( pipe_slow );
 8428 %}
 8429 
 8430 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8431   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8432              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8433             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8434   match(Set cr (VectorTest src1 src2));
 8435   effect(TEMP tmp);
 8436   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8437   ins_encode %{
 8438     uint masklen = Matcher::vector_length(this, $src1);
 8439     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8440     __ andl($tmp$$Register, (1 << masklen) - 1);
 8441   %}
 8442   ins_pipe( pipe_slow );
 8443 %}
 8444 
 8445 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8446   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8447             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8448   match(Set cr (VectorTest src1 src2));
 8449   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8450   ins_encode %{
 8451     uint masklen = Matcher::vector_length(this, $src1);
 8452     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8453   %}
 8454   ins_pipe( pipe_slow );
 8455 %}
 8456 #endif
 8457 
 8458 //------------------------------------- LoadMask --------------------------------------------
 8459 
 8460 instruct loadMask(legVec dst, legVec src) %{
 8461   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8462   match(Set dst (VectorLoadMask src));
 8463   effect(TEMP dst);
 8464   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8465   ins_encode %{
 8466     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8467     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8468     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8469   %}
 8470   ins_pipe( pipe_slow );
 8471 %}
 8472 
 8473 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8474   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8475   match(Set dst (VectorLoadMask src));
 8476   effect(TEMP xtmp);
 8477   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8478   ins_encode %{
 8479     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8480                         true, Assembler::AVX_512bit);
 8481   %}
 8482   ins_pipe( pipe_slow );
 8483 %}
 8484 
 8485 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8486   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8487   match(Set dst (VectorLoadMask src));
 8488   effect(TEMP xtmp);
 8489   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8490   ins_encode %{
 8491     int vlen_enc = vector_length_encoding(in(1));
 8492     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8493                         false, vlen_enc);
 8494   %}
 8495   ins_pipe( pipe_slow );
 8496 %}
 8497 
 8498 //------------------------------------- StoreMask --------------------------------------------
 8499 
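// VectorStoreMask narrows a vector mask (lanes of 0 or -1) to a byte vector of
// 0/1 values: lanes are packed down to byte width, then pabsb/vpabsb maps -1 to 1.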
 8500 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8501   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8502   match(Set dst (VectorStoreMask src size));
 8503   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8504   ins_encode %{
 8505     int vlen = Matcher::vector_length(this);
 8506     if (vlen <= 16 && UseAVX <= 2) {
 8507       assert(UseSSE >= 3, "required");
 8508       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8509     } else {
 8510       assert(UseAVX > 0, "required");
 8511       int src_vlen_enc = vector_length_encoding(this, $src);
 8512       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8513     }
 8514   %}
 8515   ins_pipe( pipe_slow );
 8516 %}
 8517 
 8518 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8519   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8520   match(Set dst (VectorStoreMask src size));
 8521   effect(TEMP_DEF dst, TEMP xtmp);
 8522   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8523   ins_encode %{
 8524     int vlen_enc = Assembler::AVX_128bit;
 8525     int vlen = Matcher::vector_length(this);
 8526     if (vlen <= 8) {
 8527       assert(UseSSE >= 3, "required");
 8528       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8529       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8530       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8531     } else {
 8532       assert(UseAVX > 0, "required");
 8533       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8534       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8535       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8536     }
 8537   %}
 8538   ins_pipe( pipe_slow );
 8539 %}
 8540 
 8541 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8542   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8543   match(Set dst (VectorStoreMask src size));
 8544   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8545   effect(TEMP_DEF dst, TEMP xtmp);
 8546   ins_encode %{
 8547     int vlen_enc = Assembler::AVX_128bit;
 8548     int vlen = Matcher::vector_length(this);
 8549     if (vlen <= 4) {
 8550       assert(UseSSE >= 3, "required");
 8551       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8552       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8553       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8554       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8555     } else {
 8556       assert(UseAVX > 0, "required");
 8557       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8558       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8559       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8560       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8561       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8562     }
 8563   %}
 8564   ins_pipe( pipe_slow );
 8565 %}
 8566 
 8567 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8568   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8569   match(Set dst (VectorStoreMask src size));
 8570   effect(TEMP_DEF dst, TEMP xtmp);
 8571   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8572   ins_encode %{
 8573     assert(UseSSE >= 3, "required");
 8574     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8575     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8576     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8577     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8578     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8579   %}
 8580   ins_pipe( pipe_slow );
 8581 %}
 8582 
 8583 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8584   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8585   match(Set dst (VectorStoreMask src size));
 8586   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8587   effect(TEMP_DEF dst, TEMP vtmp);
 8588   ins_encode %{
 8589     int vlen_enc = Assembler::AVX_128bit;
 8590     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8591     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8592     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8593     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8594     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8595     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8596     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8597   %}
 8598   ins_pipe( pipe_slow );
 8599 %}
 8600 
 8601 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8602   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8603   match(Set dst (VectorStoreMask src size));
 8604   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8605   ins_encode %{
 8606     int src_vlen_enc = vector_length_encoding(this, $src);
 8607     int dst_vlen_enc = vector_length_encoding(this);
 8608     if (!VM_Version::supports_avx512vl()) {
 8609       src_vlen_enc = Assembler::AVX_512bit;
 8610     }
 8611     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8612     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8613   %}
 8614   ins_pipe( pipe_slow );
 8615 %}
 8616 
 8617 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8618   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8619   match(Set dst (VectorStoreMask src size));
 8620   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8621   ins_encode %{
 8622     int src_vlen_enc = vector_length_encoding(this, $src);
 8623     int dst_vlen_enc = vector_length_encoding(this);
 8624     if (!VM_Version::supports_avx512vl()) {
 8625       src_vlen_enc = Assembler::AVX_512bit;
 8626     }
 8627     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8628     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8629   %}
 8630   ins_pipe( pipe_slow );
 8631 %}
 8632 
 8633 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8634   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8635   match(Set dst (VectorStoreMask mask size));
 8636   effect(TEMP_DEF dst);
 8637   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8638   ins_encode %{
 8639     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8640     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8641                  false, Assembler::AVX_512bit, noreg);
 8642     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8643   %}
 8644   ins_pipe( pipe_slow );
 8645 %}
 8646 
 8647 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8648   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8649   match(Set dst (VectorStoreMask mask size));
 8650   effect(TEMP_DEF dst);
 8651   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8652   ins_encode %{
 8653     int dst_vlen_enc = vector_length_encoding(this);
 8654     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8655     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8656   %}
 8657   ins_pipe( pipe_slow );
 8658 %}
 8659 
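// VectorMaskCast only reinterprets the register contents. The two empty rules
// below cover kReg masks and vector masks whose byte length is unchanged;
// only a cast that changes the vector byte length (vmaskcast_avx) emits code.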
 8660 instruct vmaskcast_evex(kReg dst) %{
 8661   match(Set dst (VectorMaskCast dst));
 8662   ins_cost(0);
 8663   format %{ "vector_mask_cast $dst" %}
 8664   ins_encode %{
 8665     // empty
 8666   %}
 8667   ins_pipe(empty);
 8668 %}
 8669 
 8670 instruct vmaskcast(vec dst) %{
 8671   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8672   match(Set dst (VectorMaskCast dst));
 8673   ins_cost(0);
 8674   format %{ "vector_mask_cast $dst" %}
 8675   ins_encode %{
 8676     // empty
 8677   %}
 8678   ins_pipe(empty);
 8679 %}
 8680 
 8681 instruct vmaskcast_avx(vec dst, vec src) %{
 8682   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8683   match(Set dst (VectorMaskCast src));
 8684   format %{ "vector_mask_cast $dst, $src" %}
 8685   ins_encode %{
 8686     int vlen = Matcher::vector_length(this);
 8687     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8688     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8689     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8690   %}
 8691   ins_pipe(pipe_slow);
 8692 %}
 8693 
 8694 //-------------------------------- Load Iota Indices ----------------------------------
 8695 
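// VectorLoadConst materializes the iota sequence 0, 1, 2, ..., n-1 from a
// per-type constant table; it is the base vector for shuffle preparation and
// index population below.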
 8696 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8697   match(Set dst (VectorLoadConst src));
 8698   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8699   ins_encode %{
 8700      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8701      BasicType bt = Matcher::vector_element_basic_type(this);
 8702      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8703   %}
 8704   ins_pipe( pipe_slow );
 8705 %}
 8706 
 8707 #ifdef _LP64
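// PopulateIndex computes dst[i] = src1 + i * src2. Only a stride of one is
// matched (immI_1, re-checked by the assert), so the expansion reduces to
// broadcast(src1) + iota.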
 8708 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8709   match(Set dst (PopulateIndex src1 src2));
 8710   effect(TEMP dst, TEMP vtmp);
 8711   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8712   ins_encode %{
 8713      assert($src2$$constant == 1, "required");
 8714      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8715      int vlen_enc = vector_length_encoding(this);
 8716      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8717      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8718      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8719      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8720   %}
 8721   ins_pipe( pipe_slow );
 8722 %}
 8723 
 8724 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8725   match(Set dst (PopulateIndex src1 src2));
 8726   effect(TEMP dst, TEMP vtmp);
 8727   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8728   ins_encode %{
 8729      assert($src2$$constant == 1, "required");
 8730      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8731      int vlen_enc = vector_length_encoding(this);
 8732      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8733      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8734      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8735      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8736   %}
 8737   ins_pipe( pipe_slow );
 8738 %}
 8739 #endif
 8740 //-------------------------------- Rearrange ----------------------------------
 8741 
 8742 // LoadShuffle/Rearrange for Byte
 8743 
 8744 instruct loadShuffleB(vec dst) %{
 8745   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8746   match(Set dst (VectorLoadShuffle dst));
 8747   format %{ "vector_load_shuffle $dst, $dst" %}
 8748   ins_encode %{
 8749     // empty
 8750   %}
 8751   ins_pipe( pipe_slow );
 8752 %}
 8753 
 8754 instruct rearrangeB(vec dst, vec shuffle) %{
 8755   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8756             Matcher::vector_length(n) < 32);
 8757   match(Set dst (VectorRearrange dst shuffle));
 8758   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8759   ins_encode %{
 8760     assert(UseSSE >= 4, "required");
 8761     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8762   %}
 8763   ins_pipe( pipe_slow );
 8764 %}
 8765 
 8766 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8767   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8768             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8769   match(Set dst (VectorRearrange src shuffle));
 8770   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8771   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8772   ins_encode %{
 8773     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries coming from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries coming from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of every shuffle entry that refers to the other lane
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend: vpblendvb selects from vtmp1 wherever the sign bit of the mask byte is set
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8784   %}
 8785   ins_pipe( pipe_slow );
 8786 %}
 8787 
 8789 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8790   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8791             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8792   match(Set dst (VectorRearrange src shuffle));
 8793   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8794   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8795   ins_encode %{
 8796     int vlen_enc = vector_length_encoding(this);
 8797     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8798                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8799                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8800   %}
 8801   ins_pipe( pipe_slow );
 8802 %}
 8803 
 8804 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8805   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8806             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8807   match(Set dst (VectorRearrange src shuffle));
 8808   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8809   ins_encode %{
 8810     int vlen_enc = vector_length_encoding(this);
 8811     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8812   %}
 8813   ins_pipe( pipe_slow );
 8814 %}
 8815 
 8816 // LoadShuffle/Rearrange for Short
 8817 
 8818 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8819   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8820             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8821   match(Set dst (VectorLoadShuffle src));
 8822   effect(TEMP dst, TEMP vtmp);
 8823   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8824   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask, since only a
    // byte shuffle instruction is available on these platforms.
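    // e.g. a short shuffle index of 3 becomes the byte-index pair (6, 7),
    // selecting both bytes of short element 3.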
 8827     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8828     if (UseAVX == 0) {
 8829       assert(vlen_in_bytes <= 16, "required");
 8830       // Multiply each shuffle by two to get byte index
 8831       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8832       __ psllw($vtmp$$XMMRegister, 1);
 8833 
 8834       // Duplicate to create 2 copies of byte index
 8835       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8836       __ psllw($dst$$XMMRegister, 8);
 8837       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8838 
 8839       // Add one to get alternate byte index
 8840       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8841       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8842     } else {
 8843       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8844       int vlen_enc = vector_length_encoding(this);
 8845       // Multiply each shuffle by two to get byte index
 8846       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8847       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8848 
 8849       // Duplicate to create 2 copies of byte index
 8850       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8851       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8852 
 8853       // Add one to get alternate byte index
 8854       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8855     }
 8856   %}
 8857   ins_pipe( pipe_slow );
 8858 %}
 8859 
 8860 instruct rearrangeS(vec dst, vec shuffle) %{
 8861   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8862             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8863   match(Set dst (VectorRearrange dst shuffle));
 8864   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8865   ins_encode %{
 8866     assert(UseSSE >= 4, "required");
 8867     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8868   %}
 8869   ins_pipe( pipe_slow );
 8870 %}
 8871 
 8872 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8873   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8874             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8875   match(Set dst (VectorRearrange src shuffle));
 8876   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8877   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8878   ins_encode %{
 8879     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries coming from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries coming from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of every shuffle entry that refers to the other lane
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend: vpblendvb selects from vtmp1 wherever the sign bit of the mask byte is set
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8890   %}
 8891   ins_pipe( pipe_slow );
 8892 %}
 8893 
 8894 instruct loadShuffleS_evex(vec dst, vec src) %{
 8895   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8896             VM_Version::supports_avx512bw());
 8897   match(Set dst (VectorLoadShuffle src));
 8898   format %{ "vector_load_shuffle $dst, $src" %}
 8899   ins_encode %{
 8900     int vlen_enc = vector_length_encoding(this);
 8901     if (!VM_Version::supports_avx512vl()) {
 8902       vlen_enc = Assembler::AVX_512bit;
 8903     }
 8904     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8905   %}
 8906   ins_pipe( pipe_slow );
 8907 %}
 8908 
 8909 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8910   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8911             VM_Version::supports_avx512bw());
 8912   match(Set dst (VectorRearrange src shuffle));
 8913   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8914   ins_encode %{
 8915     int vlen_enc = vector_length_encoding(this);
 8916     if (!VM_Version::supports_avx512vl()) {
 8917       vlen_enc = Assembler::AVX_512bit;
 8918     }
 8919     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8920   %}
 8921   ins_pipe( pipe_slow );
 8922 %}
 8923 
 8924 // LoadShuffle/Rearrange for Integer and Float
 8925 
 8926 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8927   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8928             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8929   match(Set dst (VectorLoadShuffle src));
 8930   effect(TEMP dst, TEMP vtmp);
 8931   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8932   ins_encode %{
 8933     assert(UseSSE >= 4, "required");
 8934 
    // Create a byte shuffle mask from the int shuffle mask, since only a
    // byte shuffle instruction is available on these platforms.
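    // e.g. an int shuffle index of 2 becomes the byte indices (8, 9, 10, 11).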
 8937 
 8938     // Duplicate and multiply each shuffle by 4
 8939     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8940     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8941     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8942     __ psllw($vtmp$$XMMRegister, 2);
 8943 
 8944     // Duplicate again to create 4 copies of byte index
 8945     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8946     __ psllw($dst$$XMMRegister, 8);
 8947     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8948 
 8949     // Add 3,2,1,0 to get alternate byte index
 8950     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8951     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8952   %}
 8953   ins_pipe( pipe_slow );
 8954 %}
 8955 
 8956 instruct rearrangeI(vec dst, vec shuffle) %{
 8957   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8958             UseAVX == 0);
 8959   match(Set dst (VectorRearrange dst shuffle));
 8960   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8961   ins_encode %{
 8962     assert(UseSSE >= 4, "required");
 8963     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8964   %}
 8965   ins_pipe( pipe_slow );
 8966 %}
 8967 
 8968 instruct loadShuffleI_avx(vec dst, vec src) %{
 8969   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8970             UseAVX > 0);
 8971   match(Set dst (VectorLoadShuffle src));
 8972   format %{ "vector_load_shuffle $dst, $src" %}
 8973   ins_encode %{
 8974     int vlen_enc = vector_length_encoding(this);
 8975     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8976   %}
 8977   ins_pipe( pipe_slow );
 8978 %}
 8979 
 8980 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8981   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8982             UseAVX > 0);
 8983   match(Set dst (VectorRearrange src shuffle));
 8984   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8985   ins_encode %{
 8986     int vlen_enc = vector_length_encoding(this);
 8987     BasicType bt = Matcher::vector_element_basic_type(this);
 8988     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8989   %}
 8990   ins_pipe( pipe_slow );
 8991 %}
 8992 
 8993 // LoadShuffle/Rearrange for Long and Double
 8994 
 8995 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8996   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8997             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8998   match(Set dst (VectorLoadShuffle src));
 8999   effect(TEMP dst, TEMP vtmp);
 9000   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9001   ins_encode %{
 9002     assert(UseAVX >= 2, "required");
 9003 
 9004     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask, since only
    // a double word shuffle instruction is available on these platforms.
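    // e.g. a long shuffle index of 1 becomes the double word index pair (2, 3),
    // letting vpermd move both halves of long element 1.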
 9007 
 9008     // Multiply each shuffle by two to get double word index
 9009     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9010     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 9011 
 9012     // Duplicate each double word shuffle
 9013     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9014     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9015 
 9016     // Add one to get alternate double word index
 9017     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9018   %}
 9019   ins_pipe( pipe_slow );
 9020 %}
 9021 
 9022 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9023   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9024             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9025   match(Set dst (VectorRearrange src shuffle));
 9026   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9027   ins_encode %{
 9028     assert(UseAVX >= 2, "required");
 9029 
 9030     int vlen_enc = vector_length_encoding(this);
 9031     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9032   %}
 9033   ins_pipe( pipe_slow );
 9034 %}
 9035 
 9036 instruct loadShuffleL_evex(vec dst, vec src) %{
 9037   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9038             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9039   match(Set dst (VectorLoadShuffle src));
 9040   format %{ "vector_load_shuffle $dst, $src" %}
 9041   ins_encode %{
 9042     assert(UseAVX > 2, "required");
 9043 
 9044     int vlen_enc = vector_length_encoding(this);
 9045     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9046   %}
 9047   ins_pipe( pipe_slow );
 9048 %}
 9049 
 9050 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9051   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9052             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9053   match(Set dst (VectorRearrange src shuffle));
 9054   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9055   ins_encode %{
 9056     assert(UseAVX > 2, "required");
 9057 
 9058     int vlen_enc = vector_length_encoding(this);
 9059     if (vlen_enc == Assembler::AVX_128bit) {
 9060       vlen_enc = Assembler::AVX_256bit;
 9061     }
 9062     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9063   %}
 9064   ins_pipe( pipe_slow );
 9065 %}
 9066 
 9067 // --------------------------------- FMA --------------------------------------
 9068 // a * b + c
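// The vfma* rules map to the x86 FMA3 instructions, which evaluate a * b + c
// with a single rounding (as Math.fma requires); they are only generated when
// UseFMA is enabled, which the asserts below re-check.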
 9069 
 9070 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9071   match(Set c (FmaVF  c (Binary a b)));
 9072   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9073   ins_cost(150);
 9074   ins_encode %{
 9075     assert(UseFMA, "not enabled");
 9076     int vlen_enc = vector_length_encoding(this);
 9077     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9078   %}
 9079   ins_pipe( pipe_slow );
 9080 %}
 9081 
 9082 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9083   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9084   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9085   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9086   ins_cost(150);
 9087   ins_encode %{
 9088     assert(UseFMA, "not enabled");
 9089     int vlen_enc = vector_length_encoding(this);
 9090     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9091   %}
 9092   ins_pipe( pipe_slow );
 9093 %}
 9094 
 9095 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9096   match(Set c (FmaVD  c (Binary a b)));
 9097   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9098   ins_cost(150);
 9099   ins_encode %{
 9100     assert(UseFMA, "not enabled");
 9101     int vlen_enc = vector_length_encoding(this);
 9102     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9103   %}
 9104   ins_pipe( pipe_slow );
 9105 %}
 9106 
 9107 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9108   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9109   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9110   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9111   ins_cost(150);
 9112   ins_encode %{
 9113     assert(UseFMA, "not enabled");
 9114     int vlen_enc = vector_length_encoding(this);
 9115     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9116   %}
 9117   ins_pipe( pipe_slow );
 9118 %}
 9119 
 9120 // --------------------------------- Vector Multiply Add --------------------------------------
 9121 
 9122 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9123   predicate(UseAVX == 0);
 9124   match(Set dst (MulAddVS2VI dst src1));
 9125   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9126   ins_encode %{
 9127     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9128   %}
 9129   ins_pipe( pipe_slow );
 9130 %}
 9131 
 9132 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9133   predicate(UseAVX > 0);
 9134   match(Set dst (MulAddVS2VI src1 src2));
 9135   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9136   ins_encode %{
 9137     int vlen_enc = vector_length_encoding(this);
 9138     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9139   %}
 9140   ins_pipe( pipe_slow );
 9141 %}
 9142 
 9143 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9144 
 9145 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9146   predicate(VM_Version::supports_avx512_vnni());
 9147   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9148   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9149   ins_encode %{
 9150     assert(UseAVX > 2, "required");
 9151     int vlen_enc = vector_length_encoding(this);
 9152     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9153   %}
 9154   ins_pipe( pipe_slow );
 9155   ins_cost(10);
 9156 %}
 9157 
 9158 // --------------------------------- PopCount --------------------------------------
 9159 
 9160 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9161   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9162   match(Set dst (PopCountVI src));
 9163   match(Set dst (PopCountVL src));
 9164   format %{ "vector_popcount_integral $dst, $src" %}
 9165   ins_encode %{
 9166     int opcode = this->ideal_Opcode();
 9167     int vlen_enc = vector_length_encoding(this, $src);
 9168     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9169     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9170   %}
 9171   ins_pipe( pipe_slow );
 9172 %}
 9173 
 9174 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9175   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9176   match(Set dst (PopCountVI src mask));
 9177   match(Set dst (PopCountVL src mask));
 9178   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9179   ins_encode %{
 9180     int vlen_enc = vector_length_encoding(this, $src);
 9181     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9182     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9183     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9184   %}
 9185   ins_pipe( pipe_slow );
 9186 %}
 9187 
 9188 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9189   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9190   match(Set dst (PopCountVI src));
 9191   match(Set dst (PopCountVL src));
 9192   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9193   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9194   ins_encode %{
 9195     int opcode = this->ideal_Opcode();
 9196     int vlen_enc = vector_length_encoding(this, $src);
 9197     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9198     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9199                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9200   %}
 9201   ins_pipe( pipe_slow );
 9202 %}
 9203 
 9204 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9205 
 9206 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9207   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9208                                               Matcher::vector_length_in_bytes(n->in(1))));
 9209   match(Set dst (CountTrailingZerosV src));
 9210   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9211   ins_cost(400);
 9212   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9213   ins_encode %{
 9214     int vlen_enc = vector_length_encoding(this, $src);
 9215     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9216     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9217                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9218   %}
 9219   ins_pipe( pipe_slow );
 9220 %}
 9221 
 9222 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9223   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9224             VM_Version::supports_avx512cd() &&
 9225             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9226   match(Set dst (CountTrailingZerosV src));
 9227   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9228   ins_cost(400);
 9229   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9230   ins_encode %{
 9231     int vlen_enc = vector_length_encoding(this, $src);
 9232     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9233     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9234                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9235   %}
 9236   ins_pipe( pipe_slow );
 9237 %}
 9238 
 9239 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9240   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9241   match(Set dst (CountTrailingZerosV src));
 9242   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9243   ins_cost(400);
 9244   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9245   ins_encode %{
 9246     int vlen_enc = vector_length_encoding(this, $src);
 9247     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9248     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9249                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9250                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9251   %}
 9252   ins_pipe( pipe_slow );
 9253 %}
 9254 
 9255 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9256   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9257   match(Set dst (CountTrailingZerosV src));
 9258   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9259   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9260   ins_encode %{
 9261     int vlen_enc = vector_length_encoding(this, $src);
 9262     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9263     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9264                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9265   %}
 9266   ins_pipe( pipe_slow );
 9267 %}
 9268 
 9270 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9271 
 9272 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9273   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9274   effect(TEMP dst);
 9275   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9276   ins_encode %{
 9277     int vector_len = vector_length_encoding(this);
 9278     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9279   %}
 9280   ins_pipe( pipe_slow );
 9281 %}
 9282 
 9283 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9284   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9285   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9286   effect(TEMP dst);
 9287   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9288   ins_encode %{
 9289     int vector_len = vector_length_encoding(this);
 9290     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9291   %}
 9292   ins_pipe( pipe_slow );
 9293 %}
 9294 
 9295 // --------------------------------- Rotation Operations ----------------------------------
 9296 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9297   match(Set dst (RotateLeftV src shift));
 9298   match(Set dst (RotateRightV src shift));
 9299   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9300   ins_encode %{
 9301     int opcode      = this->ideal_Opcode();
 9302     int vector_len  = vector_length_encoding(this);
 9303     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9304     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9305   %}
 9306   ins_pipe( pipe_slow );
 9307 %}
 9308 
 9309 instruct vprorate(vec dst, vec src, vec shift) %{
 9310   match(Set dst (RotateLeftV src shift));
 9311   match(Set dst (RotateRightV src shift));
 9312   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9313   ins_encode %{
 9314     int opcode      = this->ideal_Opcode();
 9315     int vector_len  = vector_length_encoding(this);
 9316     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9317     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9318   %}
 9319   ins_pipe( pipe_slow );
 9320 %}
 9321 
 9322 // ---------------------------------- Masked Operations ------------------------------------
 9323 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9324   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9325   match(Set dst (LoadVectorMasked mem mask));
 9326   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9327   ins_encode %{
 9328     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9329     int vlen_enc = vector_length_encoding(this);
 9330     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9331   %}
 9332   ins_pipe( pipe_slow );
 9333 %}
 9334 
 9336 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9337   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9338   match(Set dst (LoadVectorMasked mem mask));
 9339   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9340   ins_encode %{
 9341     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9342     int vector_len = vector_length_encoding(this);
 9343     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9344   %}
 9345   ins_pipe( pipe_slow );
 9346 %}
 9347 
 9348 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9349   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9350   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9351   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9352   ins_encode %{
 9353     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9354     int vlen_enc = vector_length_encoding(src_node);
 9355     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9356     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9357   %}
 9358   ins_pipe( pipe_slow );
 9359 %}
 9360 
 9361 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9362   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9363   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9364   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9365   ins_encode %{
 9366     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9367     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9368     int vlen_enc = vector_length_encoding(src_node);
 9369     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9370   %}
 9371   ins_pipe( pipe_slow );
 9372 %}
 9373 
 9374 #ifdef _LP64
 9375 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9376   match(Set addr (VerifyVectorAlignment addr mask));
 9377   effect(KILL cr);
 9378   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9379   ins_encode %{
 9380     Label Lskip;
 9381     // check if masked bits of addr are zero
 9382     __ testq($addr$$Register, $mask$$constant);
 9383     __ jccb(Assembler::equal, Lskip);
 9384     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9385     __ bind(Lskip);
 9386   %}
 9387   ins_pipe(pipe_slow);
 9388 %}
 9389 
 9390 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9391   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9392   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9393   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9394   ins_encode %{
 9395     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9396     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9397 
 9398     Label DONE;
 9399     int vlen_enc = vector_length_encoding(this, $src1);
 9400     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9401 
    // ktmp2 = lanes that are inactive in $mask
    __ knotql($ktmp2$$KRegister, $mask$$KRegister);
    // Preload the "all active lanes equal" result
    __ mov64($dst$$Register, -1L);
    // ktmp1 = active lanes where src1 and src2 compare equal
    __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
    // kortest sets CF when the OR of its operands is all ones, i.e. every
    // active lane compared equal
    __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
    __ jccb(Assembler::carrySet, DONE);
    // Otherwise return the index of the first mismatching lane
    __ kmovql($dst$$Register, $ktmp1$$KRegister);
    __ notq($dst$$Register);
    __ tzcntq($dst$$Register, $dst$$Register);
    __ bind(DONE);
 9411   %}
 9412   ins_pipe( pipe_slow );
 9413 %}
 9414 
 9416 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9417   match(Set dst (VectorMaskGen len));
 9418   effect(TEMP temp, KILL cr);
 9419   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9420   ins_encode %{
 9421     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9422   %}
 9423   ins_pipe( pipe_slow );
 9424 %}
 9425 
 9426 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9427   match(Set dst (VectorMaskGen len));
 9428   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9429   effect(TEMP temp);
 9430   ins_encode %{
    // Materialize a constant with the low $len bits set and move it into the opmask
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
    __ kmovql($dst$$KRegister, $temp$$Register);
 9433   %}
 9434   ins_pipe( pipe_slow );
 9435 %}
 9436 
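// Mask query operations: toLong, trueCount, firstTrue and lastTrue. The *_avx
// variants match the (VectorStoreMask mask size) subtree directly, so the
// intermediate boolean vector is consumed without being materialized by a
// separate rule.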
 9437 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9438   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9439   match(Set dst (VectorMaskToLong mask));
 9440   effect(TEMP dst, KILL cr);
 9441   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9442   ins_encode %{
 9443     int opcode = this->ideal_Opcode();
 9444     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9445     int mask_len = Matcher::vector_length(this, $mask);
 9446     int mask_size = mask_len * type2aelembytes(mbt);
 9447     int vlen_enc = vector_length_encoding(this, $mask);
 9448     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9449                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9450   %}
 9451   ins_pipe( pipe_slow );
 9452 %}
 9453 
 9454 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9455   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9456   match(Set dst (VectorMaskToLong mask));
 9457   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9458   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9459   ins_encode %{
 9460     int opcode = this->ideal_Opcode();
 9461     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9462     int mask_len = Matcher::vector_length(this, $mask);
 9463     int vlen_enc = vector_length_encoding(this, $mask);
 9464     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9465                              $dst$$Register, mask_len, mbt, vlen_enc);
 9466   %}
 9467   ins_pipe( pipe_slow );
 9468 %}
 9469 
 9470 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9471   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9472   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9473   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9474   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9475   ins_encode %{
 9476     int opcode = this->ideal_Opcode();
 9477     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9478     int mask_len = Matcher::vector_length(this, $mask);
 9479     int vlen_enc = vector_length_encoding(this, $mask);
 9480     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9481                              $dst$$Register, mask_len, mbt, vlen_enc);
 9482   %}
 9483   ins_pipe( pipe_slow );
 9484 %}
 9485 
 9486 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9487   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9488   match(Set dst (VectorMaskTrueCount mask));
 9489   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9490   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9491   ins_encode %{
 9492     int opcode = this->ideal_Opcode();
 9493     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9494     int mask_len = Matcher::vector_length(this, $mask);
 9495     int mask_size = mask_len * type2aelembytes(mbt);
 9496     int vlen_enc = vector_length_encoding(this, $mask);
 9497     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9498                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9499   %}
 9500   ins_pipe( pipe_slow );
 9501 %}
 9502 
 9503 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9504   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9505   match(Set dst (VectorMaskTrueCount mask));
 9506   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9507   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9508   ins_encode %{
 9509     int opcode = this->ideal_Opcode();
 9510     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9511     int mask_len = Matcher::vector_length(this, $mask);
 9512     int vlen_enc = vector_length_encoding(this, $mask);
 9513     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9514                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9515   %}
 9516   ins_pipe( pipe_slow );
 9517 %}
 9518 
 9519 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9520   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9521   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9522   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9523   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9524   ins_encode %{
 9525     int opcode = this->ideal_Opcode();
 9526     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9527     int mask_len = Matcher::vector_length(this, $mask);
 9528     int vlen_enc = vector_length_encoding(this, $mask);
 9529     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9530                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9531   %}
 9532   ins_pipe( pipe_slow );
 9533 %}
 9534 
 9535 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9536   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9537   match(Set dst (VectorMaskFirstTrue mask));
 9538   match(Set dst (VectorMaskLastTrue mask));
 9539   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9540   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9541   ins_encode %{
 9542     int opcode = this->ideal_Opcode();
 9543     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9544     int mask_len = Matcher::vector_length(this, $mask);
 9545     int mask_size = mask_len * type2aelembytes(mbt);
 9546     int vlen_enc = vector_length_encoding(this, $mask);
 9547     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9548                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9549   %}
 9550   ins_pipe( pipe_slow );
 9551 %}
 9552 
 9553 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9554   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9555   match(Set dst (VectorMaskFirstTrue mask));
 9556   match(Set dst (VectorMaskLastTrue mask));
 9557   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9558   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9559   ins_encode %{
 9560     int opcode = this->ideal_Opcode();
 9561     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9562     int mask_len = Matcher::vector_length(this, $mask);
 9563     int vlen_enc = vector_length_encoding(this, $mask);
 9564     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9565                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9566   %}
 9567   ins_pipe( pipe_slow );
 9568 %}
 9569 
 9570 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9571   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9572   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9573   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9574   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9575   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9576   ins_encode %{
 9577     int opcode = this->ideal_Opcode();
 9578     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9579     int mask_len = Matcher::vector_length(this, $mask);
 9580     int vlen_enc = vector_length_encoding(this, $mask);
 9581     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9582                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9583   %}
 9584   ins_pipe( pipe_slow );
 9585 %}
 9586 
 9587 // --------------------------------- Compress/Expand Operations ---------------------------
 9588 #ifdef _LP64
 9589 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9590   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9591   match(Set dst (CompressV src mask));
 9592   match(Set dst (ExpandV src mask));
 9593   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9594   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9595   ins_encode %{
 9596     int opcode = this->ideal_Opcode();
 9597     int vlen_enc = vector_length_encoding(this);
 9598     BasicType bt  = Matcher::vector_element_basic_type(this);
 9599     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9600                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9601   %}
 9602   ins_pipe( pipe_slow );
 9603 %}
 9604 #endif
 9605 
 9606 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9607   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9608   match(Set dst (CompressV src mask));
 9609   match(Set dst (ExpandV src mask));
 9610   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9611   ins_encode %{
 9612     int opcode = this->ideal_Opcode();
 9613     int vector_len = vector_length_encoding(this);
 9614     BasicType bt  = Matcher::vector_element_basic_type(this);
 9615     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9616   %}
 9617   ins_pipe( pipe_slow );
 9618 %}
 9619 
 9620 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9621   match(Set dst (CompressM mask));
 9622   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9623   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9624   ins_encode %{
 9625     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9626     int mask_len = Matcher::vector_length(this);
 9627     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9628   %}
 9629   ins_pipe( pipe_slow );
 9630 %}
 9631 
 9632 #endif // _LP64
 9633 
 9634 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9635 
 9636 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9637   predicate(!VM_Version::supports_gfni());
 9638   match(Set dst (ReverseV src));
 9639   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9640   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9641   ins_encode %{
 9642     int vec_enc = vector_length_encoding(this);
 9643     BasicType bt = Matcher::vector_element_basic_type(this);
 9644     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9645                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9646   %}
 9647   ins_pipe( pipe_slow );
 9648 %}
 9649 
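// With GFNI, reversing the bit order inside each byte is a single affine
// transform: 0x8040201008040201 is the anti-diagonal 8x8 bit matrix for
// vgf2p8affineqb. Wider element types then only need an additional byte
// reversal within each element.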
 9650 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9651   predicate(VM_Version::supports_gfni());
 9652   match(Set dst (ReverseV src));
 9653   effect(TEMP dst, TEMP xtmp);
 9654   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9655   ins_encode %{
 9656     int vec_enc = vector_length_encoding(this);
 9657     BasicType bt  = Matcher::vector_element_basic_type(this);
 9658     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9659     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9660                                $xtmp$$XMMRegister);
 9661   %}
 9662   ins_pipe( pipe_slow );
 9663 %}
 9664 
 9665 instruct vreverse_byte_reg(vec dst, vec src) %{
 9666   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9667   match(Set dst (ReverseBytesV src));
 9668   effect(TEMP dst);
 9669   format %{ "vector_reverse_byte $dst, $src" %}
 9670   ins_encode %{
 9671     int vec_enc = vector_length_encoding(this);
 9672     BasicType bt = Matcher::vector_element_basic_type(this);
 9673     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9674   %}
 9675   ins_pipe( pipe_slow );
 9676 %}
 9677 
 9678 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9679   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9680   match(Set dst (ReverseBytesV src));
 9681   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9682   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9683   ins_encode %{
 9684     int vec_enc = vector_length_encoding(this);
 9685     BasicType bt = Matcher::vector_element_basic_type(this);
 9686     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9687                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9688   %}
 9689   ins_pipe( pipe_slow );
 9690 %}
 9691 
 9692 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9693 
 9694 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9695   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9696                                               Matcher::vector_length_in_bytes(n->in(1))));
 9697   match(Set dst (CountLeadingZerosV src));
 9698   format %{ "vector_count_leading_zeros $dst, $src" %}
 9699   ins_encode %{
 9700      int vlen_enc = vector_length_encoding(this, $src);
 9701      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9702      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9703                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9704   %}
 9705   ins_pipe( pipe_slow );
 9706 %}
 9707 
 9708 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9709   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9710                                               Matcher::vector_length_in_bytes(n->in(1))));
 9711   match(Set dst (CountLeadingZerosV src mask));
 9712   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9713   ins_encode %{
 9714     int vlen_enc = vector_length_encoding(this, $src);
 9715     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9716     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9717     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9718                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9719   %}
 9720   ins_pipe( pipe_slow );
 9721 %}
 9722 
 9723 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9724   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9725             VM_Version::supports_avx512cd() &&
 9726             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9727   match(Set dst (CountLeadingZerosV src));
 9728   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9729   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9730   ins_encode %{
 9731     int vlen_enc = vector_length_encoding(this, $src);
 9732     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9733     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9734                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9735   %}
 9736   ins_pipe( pipe_slow );
 9737 %}
 9738 
 9739 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9740   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9741   match(Set dst (CountLeadingZerosV src));
 9742   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9743   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9744   ins_encode %{
 9745     int vlen_enc = vector_length_encoding(this, $src);
 9746     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9747     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9748                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9749                                        $rtmp$$Register, true, vlen_enc);
 9750   %}
 9751   ins_pipe( pipe_slow );
 9752 %}
 9753 
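// Fallback for targets without AVX512VL: the leading zero count is synthesized
// from plain AVX instructions.  Non-int element types also need a GPR scratch.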
 9754 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9755   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9756             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9757   match(Set dst (CountLeadingZerosV src));
 9758   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9759   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9760   ins_encode %{
 9761     int vlen_enc = vector_length_encoding(this, $src);
 9762     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9763     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9764                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9765   %}
 9766   ins_pipe( pipe_slow );
 9767 %}
 9768 
 9769 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9770   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9771             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9772   match(Set dst (CountLeadingZerosV src));
 9773   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9774   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9775   ins_encode %{
 9776     int vlen_enc = vector_length_encoding(this, $src);
 9777     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9778     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9779                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9780   %}
 9781   ins_pipe( pipe_slow );
 9782 %}
 9783 
 9784 // ---------------------------------- Vector Masked Operations ------------------------------------
 9785 
 9786 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9787   match(Set dst (AddVB (Binary dst src2) mask));
 9788   match(Set dst (AddVS (Binary dst src2) mask));
 9789   match(Set dst (AddVI (Binary dst src2) mask));
 9790   match(Set dst (AddVL (Binary dst src2) mask));
 9791   match(Set dst (AddVF (Binary dst src2) mask));
 9792   match(Set dst (AddVD (Binary dst src2) mask));
 9793   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9794   ins_encode %{
 9795     int vlen_enc = vector_length_encoding(this);
 9796     BasicType bt = Matcher::vector_element_basic_type(this);
 9797     int opc = this->ideal_Opcode();
 9798     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9799                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9800   %}
 9801   ins_pipe( pipe_slow );
 9802 %}
 9803 
 9804 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9805   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9806   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9807   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9808   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9809   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9810   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9811   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9812   ins_encode %{
 9813     int vlen_enc = vector_length_encoding(this);
 9814     BasicType bt = Matcher::vector_element_basic_type(this);
 9815     int opc = this->ideal_Opcode();
 9816     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9817                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9818   %}
 9819   ins_pipe( pipe_slow );
 9820 %}
 9821 
 9822 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9823   match(Set dst (XorV (Binary dst src2) mask));
 9824   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9825   ins_encode %{
 9826     int vlen_enc = vector_length_encoding(this);
 9827     BasicType bt = Matcher::vector_element_basic_type(this);
 9828     int opc = this->ideal_Opcode();
 9829     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9830                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9831   %}
 9832   ins_pipe( pipe_slow );
 9833 %}
 9834 
 9835 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9836   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9837   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9838   ins_encode %{
 9839     int vlen_enc = vector_length_encoding(this);
 9840     BasicType bt = Matcher::vector_element_basic_type(this);
 9841     int opc = this->ideal_Opcode();
 9842     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9843                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9844   %}
 9845   ins_pipe( pipe_slow );
 9846 %}
 9847 
 9848 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9849   match(Set dst (OrV (Binary dst src2) mask));
 9850   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9851   ins_encode %{
 9852     int vlen_enc = vector_length_encoding(this);
 9853     BasicType bt = Matcher::vector_element_basic_type(this);
 9854     int opc = this->ideal_Opcode();
 9855     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9856                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9857   %}
 9858   ins_pipe( pipe_slow );
 9859 %}
 9860 
 9861 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9862   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9863   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9864   ins_encode %{
 9865     int vlen_enc = vector_length_encoding(this);
 9866     BasicType bt = Matcher::vector_element_basic_type(this);
 9867     int opc = this->ideal_Opcode();
 9868     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9869                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9870   %}
 9871   ins_pipe( pipe_slow );
 9872 %}
 9873 
 9874 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9875   match(Set dst (AndV (Binary dst src2) mask));
 9876   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9877   ins_encode %{
 9878     int vlen_enc = vector_length_encoding(this);
 9879     BasicType bt = Matcher::vector_element_basic_type(this);
 9880     int opc = this->ideal_Opcode();
 9881     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9882                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9883   %}
 9884   ins_pipe( pipe_slow );
 9885 %}
 9886 
 9887 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9888   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9889   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9890   ins_encode %{
 9891     int vlen_enc = vector_length_encoding(this);
 9892     BasicType bt = Matcher::vector_element_basic_type(this);
 9893     int opc = this->ideal_Opcode();
 9894     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9895                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9896   %}
 9897   ins_pipe( pipe_slow );
 9898 %}
 9899 
 9900 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9901   match(Set dst (SubVB (Binary dst src2) mask));
 9902   match(Set dst (SubVS (Binary dst src2) mask));
 9903   match(Set dst (SubVI (Binary dst src2) mask));
 9904   match(Set dst (SubVL (Binary dst src2) mask));
 9905   match(Set dst (SubVF (Binary dst src2) mask));
 9906   match(Set dst (SubVD (Binary dst src2) mask));
 9907   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9908   ins_encode %{
 9909     int vlen_enc = vector_length_encoding(this);
 9910     BasicType bt = Matcher::vector_element_basic_type(this);
 9911     int opc = this->ideal_Opcode();
 9912     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9913                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9914   %}
 9915   ins_pipe( pipe_slow );
 9916 %}
 9917 
 9918 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9919   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9920   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9921   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9922   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9923   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9924   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9925   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9926   ins_encode %{
 9927     int vlen_enc = vector_length_encoding(this);
 9928     BasicType bt = Matcher::vector_element_basic_type(this);
 9929     int opc = this->ideal_Opcode();
 9930     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9931                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9932   %}
 9933   ins_pipe( pipe_slow );
 9934 %}
 9935 
 9936 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9937   match(Set dst (MulVS (Binary dst src2) mask));
 9938   match(Set dst (MulVI (Binary dst src2) mask));
 9939   match(Set dst (MulVL (Binary dst src2) mask));
 9940   match(Set dst (MulVF (Binary dst src2) mask));
 9941   match(Set dst (MulVD (Binary dst src2) mask));
 9942   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9943   ins_encode %{
 9944     int vlen_enc = vector_length_encoding(this);
 9945     BasicType bt = Matcher::vector_element_basic_type(this);
 9946     int opc = this->ideal_Opcode();
 9947     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9948                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9949   %}
 9950   ins_pipe( pipe_slow );
 9951 %}
 9952 
 9953 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9954   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9955   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9956   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9957   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9958   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9959   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9960   ins_encode %{
 9961     int vlen_enc = vector_length_encoding(this);
 9962     BasicType bt = Matcher::vector_element_basic_type(this);
 9963     int opc = this->ideal_Opcode();
 9964     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9965                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9966   %}
 9967   ins_pipe( pipe_slow );
 9968 %}
 9969 
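// Sqrt is unary, so the destination register serves as both source operands of
// the masked operation.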
 9970 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9971   match(Set dst (SqrtVF dst mask));
 9972   match(Set dst (SqrtVD dst mask));
 9973   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9974   ins_encode %{
 9975     int vlen_enc = vector_length_encoding(this);
 9976     BasicType bt = Matcher::vector_element_basic_type(this);
 9977     int opc = this->ideal_Opcode();
 9978     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9979                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9980   %}
 9981   ins_pipe( pipe_slow );
 9982 %}
 9983 
 9984 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9985   match(Set dst (DivVF (Binary dst src2) mask));
 9986   match(Set dst (DivVD (Binary dst src2) mask));
 9987   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9988   ins_encode %{
 9989     int vlen_enc = vector_length_encoding(this);
 9990     BasicType bt = Matcher::vector_element_basic_type(this);
 9991     int opc = this->ideal_Opcode();
 9992     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9993                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9994   %}
 9995   ins_pipe( pipe_slow );
 9996 %}
 9997 
 9998 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9999   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
10000   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
10001   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10002   ins_encode %{
10003     int vlen_enc = vector_length_encoding(this);
10004     BasicType bt = Matcher::vector_element_basic_type(this);
10005     int opc = this->ideal_Opcode();
10006     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10007                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10008   %}
10009   ins_pipe( pipe_slow );
10010 %}
10011 
10012 
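// A single rule covers both rotate directions; the node's ideal opcode selects
// the left or right EVEX rotate, with the count given either as an immediate or
// as a vector.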
10013 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10014   match(Set dst (RotateLeftV (Binary dst shift) mask));
10015   match(Set dst (RotateRightV (Binary dst shift) mask));
10016   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10017   ins_encode %{
10018     int vlen_enc = vector_length_encoding(this);
10019     BasicType bt = Matcher::vector_element_basic_type(this);
10020     int opc = this->ideal_Opcode();
10021     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10022                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10023   %}
10024   ins_pipe( pipe_slow );
10025 %}
10026 
10027 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10028   match(Set dst (RotateLeftV (Binary dst src2) mask));
10029   match(Set dst (RotateRightV (Binary dst src2) mask));
10030   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10031   ins_encode %{
10032     int vlen_enc = vector_length_encoding(this);
10033     BasicType bt = Matcher::vector_element_basic_type(this);
10034     int opc = this->ideal_Opcode();
10035     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10036                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10037   %}
10038   ins_pipe( pipe_slow );
10039 %}
10040 
10041 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10042   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10043   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10044   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10045   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10046   ins_encode %{
10047     int vlen_enc = vector_length_encoding(this);
10048     BasicType bt = Matcher::vector_element_basic_type(this);
10049     int opc = this->ideal_Opcode();
10050     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10051                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10052   %}
10053   ins_pipe( pipe_slow );
10054 %}
10055 
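// Masked shifts come in two flavors, distinguished by is_var_shift(): a scalar
// count broadcast to all lanes versus a per-lane variable count (vpsllv and
// friends).  The trailing bool passed to evmasked_op() selects the variable
// form.  The same split applies to the right and unsigned right shifts below.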
10056 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10057   predicate(!n->as_ShiftV()->is_var_shift());
10058   match(Set dst (LShiftVS (Binary dst src2) mask));
10059   match(Set dst (LShiftVI (Binary dst src2) mask));
10060   match(Set dst (LShiftVL (Binary dst src2) mask));
10061   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10062   ins_encode %{
10063     int vlen_enc = vector_length_encoding(this);
10064     BasicType bt = Matcher::vector_element_basic_type(this);
10065     int opc = this->ideal_Opcode();
10066     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10067                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10068   %}
10069   ins_pipe( pipe_slow );
10070 %}
10071 
10072 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10073   predicate(n->as_ShiftV()->is_var_shift());
10074   match(Set dst (LShiftVS (Binary dst src2) mask));
10075   match(Set dst (LShiftVI (Binary dst src2) mask));
10076   match(Set dst (LShiftVL (Binary dst src2) mask));
10077   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10078   ins_encode %{
10079     int vlen_enc = vector_length_encoding(this);
10080     BasicType bt = Matcher::vector_element_basic_type(this);
10081     int opc = this->ideal_Opcode();
10082     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10083                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10084   %}
10085   ins_pipe( pipe_slow );
10086 %}
10087 
10088 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10089   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10090   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10091   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10092   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10093   ins_encode %{
10094     int vlen_enc = vector_length_encoding(this);
10095     BasicType bt = Matcher::vector_element_basic_type(this);
10096     int opc = this->ideal_Opcode();
10097     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10098                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10099   %}
10100   ins_pipe( pipe_slow );
10101 %}
10102 
10103 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10104   predicate(!n->as_ShiftV()->is_var_shift());
10105   match(Set dst (RShiftVS (Binary dst src2) mask));
10106   match(Set dst (RShiftVI (Binary dst src2) mask));
10107   match(Set dst (RShiftVL (Binary dst src2) mask));
10108   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10109   ins_encode %{
10110     int vlen_enc = vector_length_encoding(this);
10111     BasicType bt = Matcher::vector_element_basic_type(this);
10112     int opc = this->ideal_Opcode();
10113     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10114                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10115   %}
10116   ins_pipe( pipe_slow );
10117 %}
10118 
10119 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10120   predicate(n->as_ShiftV()->is_var_shift());
10121   match(Set dst (RShiftVS (Binary dst src2) mask));
10122   match(Set dst (RShiftVI (Binary dst src2) mask));
10123   match(Set dst (RShiftVL (Binary dst src2) mask));
10124   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10125   ins_encode %{
10126     int vlen_enc = vector_length_encoding(this);
10127     BasicType bt = Matcher::vector_element_basic_type(this);
10128     int opc = this->ideal_Opcode();
10129     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10130                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10131   %}
10132   ins_pipe( pipe_slow );
10133 %}
10134 
10135 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10136   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10137   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10138   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10139   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10140   ins_encode %{
10141     int vlen_enc = vector_length_encoding(this);
10142     BasicType bt = Matcher::vector_element_basic_type(this);
10143     int opc = this->ideal_Opcode();
10144     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10145                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10146   %}
10147   ins_pipe( pipe_slow );
10148 %}
10149 
10150 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10151   predicate(!n->as_ShiftV()->is_var_shift());
10152   match(Set dst (URShiftVS (Binary dst src2) mask));
10153   match(Set dst (URShiftVI (Binary dst src2) mask));
10154   match(Set dst (URShiftVL (Binary dst src2) mask));
10155   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10156   ins_encode %{
10157     int vlen_enc = vector_length_encoding(this);
10158     BasicType bt = Matcher::vector_element_basic_type(this);
10159     int opc = this->ideal_Opcode();
10160     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10161                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10162   %}
10163   ins_pipe( pipe_slow );
10164 %}
10165 
10166 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10167   predicate(n->as_ShiftV()->is_var_shift());
10168   match(Set dst (URShiftVS (Binary dst src2) mask));
10169   match(Set dst (URShiftVI (Binary dst src2) mask));
10170   match(Set dst (URShiftVL (Binary dst src2) mask));
10171   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10172   ins_encode %{
10173     int vlen_enc = vector_length_encoding(this);
10174     BasicType bt = Matcher::vector_element_basic_type(this);
10175     int opc = this->ideal_Opcode();
10176     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10177                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10178   %}
10179   ins_pipe( pipe_slow );
10180 %}
10181 
10182 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10183   match(Set dst (MaxV (Binary dst src2) mask));
10184   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10185   ins_encode %{
10186     int vlen_enc = vector_length_encoding(this);
10187     BasicType bt = Matcher::vector_element_basic_type(this);
10188     int opc = this->ideal_Opcode();
10189     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10190                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10191   %}
10192   ins_pipe( pipe_slow );
10193 %}
10194 
10195 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10196   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10197   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10198   ins_encode %{
10199     int vlen_enc = vector_length_encoding(this);
10200     BasicType bt = Matcher::vector_element_basic_type(this);
10201     int opc = this->ideal_Opcode();
10202     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10203                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10204   %}
10205   ins_pipe( pipe_slow );
10206 %}
10207 
10208 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10209   match(Set dst (MinV (Binary dst src2) mask));
10210   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10211   ins_encode %{
10212     int vlen_enc = vector_length_encoding(this);
10213     BasicType bt = Matcher::vector_element_basic_type(this);
10214     int opc = this->ideal_Opcode();
10215     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10216                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10217   %}
10218   ins_pipe( pipe_slow );
10219 %}
10220 
10221 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10222   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10223   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10224   ins_encode %{
10225     int vlen_enc = vector_length_encoding(this);
10226     BasicType bt = Matcher::vector_element_basic_type(this);
10227     int opc = this->ideal_Opcode();
10228     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10229                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10230   %}
10231   ins_pipe( pipe_slow );
10232 %}
10233 
10234 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10235   match(Set dst (VectorRearrange (Binary dst src2) mask));
10236   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10237   ins_encode %{
10238     int vlen_enc = vector_length_encoding(this);
10239     BasicType bt = Matcher::vector_element_basic_type(this);
10240     int opc = this->ideal_Opcode();
10241     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10242                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10243   %}
10244   ins_pipe( pipe_slow );
10245 %}
10246 
10247 instruct vabs_masked(vec dst, kReg mask) %{
10248   match(Set dst (AbsVB dst mask));
10249   match(Set dst (AbsVS dst mask));
10250   match(Set dst (AbsVI dst mask));
10251   match(Set dst (AbsVL dst mask));
10252   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10253   ins_encode %{
10254     int vlen_enc = vector_length_encoding(this);
10255     BasicType bt = Matcher::vector_element_basic_type(this);
10256     int opc = this->ideal_Opcode();
10257     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10258                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10259   %}
10260   ins_pipe( pipe_slow );
10261 %}
10262 
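// Merge-masked fused multiply-add; only generated when UseFMA is enabled.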
10263 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10264   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10265   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10266   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10267   ins_encode %{
    assert(UseFMA, "needs FMA instruction support");
10269     int vlen_enc = vector_length_encoding(this);
10270     BasicType bt = Matcher::vector_element_basic_type(this);
10271     int opc = this->ideal_Opcode();
10272     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10273                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10274   %}
10275   ins_pipe( pipe_slow );
10276 %}
10277 
10278 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10279   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10280   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10281   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10282   ins_encode %{
    assert(UseFMA, "needs FMA instruction support");
10284     int vlen_enc = vector_length_encoding(this);
10285     BasicType bt = Matcher::vector_element_basic_type(this);
10286     int opc = this->ideal_Opcode();
10287     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10288                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10289   %}
10290   ins_pipe( pipe_slow );
10291 %}
10292 
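// Predicated compare producing an opmask: lanes cleared in the incoming mask
// come out clear in $dst.  Integer types derive their signedness from the
// booltest predicate; float/double use the FP comparison predicates instead.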
10293 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10294   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10295   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10296   ins_encode %{
10297     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10298     int vlen_enc = vector_length_encoding(this, $src1);
10299     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10300 
    // Emit the predicated comparison, dispatching on the element type of src1.
10302     switch (src1_elem_bt) {
10303       case T_BYTE: {
10304         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10305         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10306         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10307         break;
10308       }
10309       case T_SHORT: {
10310         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10311         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10312         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10313         break;
10314       }
10315       case T_INT: {
10316         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10317         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10318         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10319         break;
10320       }
10321       case T_LONG: {
10322         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10323         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10324         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10325         break;
10326       }
10327       case T_FLOAT: {
10328         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10329         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10330         break;
10331       }
10332       case T_DOUBLE: {
10333         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10334         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10335         break;
10336       }
10337       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10338     }
10339   %}
10340   ins_pipe( pipe_slow );
10341 %}
10342 
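// MaskAll replicates a scalar boolean (0 or -1) into every lane of an opmask.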
10343 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10344   predicate(Matcher::vector_length(n) <= 32);
10345   match(Set dst (MaskAll src));
10346   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10347   ins_encode %{
10348     int mask_len = Matcher::vector_length(this);
10349     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10350   %}
10351   ins_pipe( pipe_slow );
10352 %}
10353 
10354 #ifdef _LP64
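// Mask negation is matched as src XOR MaskAll(-1) and lowered to knot.  For
// fewer than eight lanes the helper uses the temporaries so that only the live
// mask bits are complemented.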
10355 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10356   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10357   match(Set dst (XorVMask src (MaskAll cnt)));
10358   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10359   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10360   ins_encode %{
10361     uint masklen = Matcher::vector_length(this);
10362     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10363   %}
10364   ins_pipe( pipe_slow );
10365 %}
10366 
10367 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10368   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10369             (Matcher::vector_length(n) == 16) ||
10370             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10371   match(Set dst (XorVMask src (MaskAll cnt)));
10372   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10373   ins_encode %{
10374     uint masklen = Matcher::vector_length(this);
10375     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10376   %}
10377   ins_pipe( pipe_slow );
10378 %}
10379 
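// VectorLongToMask turns a long bit mask into a vector mask.  Without true
// predicate registers (isa_vectmask() == nullptr) the bits are expanded into
// vector lanes; on EVEX targets with predicate masks it is a single kmov (see
// long_to_mask_evex below).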
10380 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10381   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10382   match(Set dst (VectorLongToMask src));
10383   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10384   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10385   ins_encode %{
10386     int mask_len = Matcher::vector_length(this);
10387     int vec_enc  = vector_length_encoding(mask_len);
10388     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10389                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10390   %}
10391   ins_pipe( pipe_slow );
10392 %}
10393 
10394 
10395 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10396   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10397   match(Set dst (VectorLongToMask src));
10398   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10399   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10400   ins_encode %{
10401     int mask_len = Matcher::vector_length(this);
10402     assert(mask_len <= 32, "invalid mask length");
10403     int vec_enc  = vector_length_encoding(mask_len);
10404     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10405                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10406   %}
10407   ins_pipe( pipe_slow );
10408 %}
10409 
10410 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10411   predicate(n->bottom_type()->isa_vectmask());
10412   match(Set dst (VectorLongToMask src));
10413   format %{ "long_to_mask_evex $dst, $src\t!" %}
10414   ins_encode %{
10415     __ kmov($dst$$KRegister, $src$$Register);
10416   %}
10417   ins_pipe( pipe_slow );
10418 %}
10419 #endif
10420 
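// Logical ops on opmask registers.  Without AVX512DQ the 8-bit kreg forms are
// unavailable, so mask lengths below 16 are widened to use the 16-bit forms.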
10421 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10422   match(Set dst (AndVMask src1 src2));
10423   match(Set dst (OrVMask src1 src2));
10424   match(Set dst (XorVMask src1 src2));
10425   effect(TEMP kscratch);
10426   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10427   ins_encode %{
10428     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10429     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10430     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10431     uint masklen = Matcher::vector_length(this);
10432     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10433     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10434   %}
10435   ins_pipe( pipe_slow );
10436 %}
10437 
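// Masked three-input macro logic: vpternlog with an 8-bit truth-table immediate
// ($func), e.g. 0x96 selects a three-way XOR.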
10438 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10439   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10440   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10441   ins_encode %{
10442     int vlen_enc = vector_length_encoding(this);
10443     BasicType bt = Matcher::vector_element_basic_type(this);
10444     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10445                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10446   %}
10447   ins_pipe( pipe_slow );
10448 %}
10449 
10450 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10451   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10452   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10453   ins_encode %{
10454     int vlen_enc = vector_length_encoding(this);
10455     BasicType bt = Matcher::vector_element_basic_type(this);
10456     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10457                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10458   %}
10459   ins_pipe( pipe_slow );
10460 %}
10461 
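// CastVV is a compile-time type cast only: the rules emit no code (size(0)) and
// exist to keep the value in the register class matching its type.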
10462 instruct castMM(kReg dst)
10463 %{
10464   match(Set dst (CastVV dst));
10465 
10466   size(0);
10467   format %{ "# castVV of $dst" %}
10468   ins_encode(/* empty encoding */);
10469   ins_cost(0);
10470   ins_pipe(empty);
10471 %}
10472 
10473 instruct castVV(vec dst)
10474 %{
10475   match(Set dst (CastVV dst));
10476 
10477   size(0);
10478   format %{ "# castVV of $dst" %}
10479   ins_encode(/* empty encoding */);
10480   ins_cost(0);
10481   ins_pipe(empty);
10482 %}
10483 
10484 instruct castVVLeg(legVec dst)
10485 %{
10486   match(Set dst (CastVV dst));
10487 
10488   size(0);
10489   format %{ "# castVV of $dst" %}
10490   ins_encode(/* empty encoding */);
10491   ins_cost(0);
10492   ins_pipe(empty);
10493 %}
10494 
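// vfpclass with imm8 0x18 (bits 3 and 4) tests for +infinity and -infinity, so
// $dst receives 1 exactly for infinite inputs.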
10495 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10496 %{
10497   match(Set dst (IsInfiniteF src));
10498   effect(TEMP ktmp, KILL cr);
10499   format %{ "float_class_check $dst, $src" %}
10500   ins_encode %{
10501     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10502     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10503   %}
10504   ins_pipe(pipe_slow);
10505 %}
10506 
10507 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10508 %{
10509   match(Set dst (IsInfiniteD src));
10510   effect(TEMP ktmp, KILL cr);
10511   format %{ "double_class_check $dst, $src" %}
10512   ins_encode %{
10513     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10514     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10515   %}
10516   ins_pipe(pipe_slow);
10517 %}
10518 
10519 
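// SelectFromTwoVector maps to the two-table permute (vpermi2 family), which
// overwrites the index operand with the result; hence $index is also the
// destination.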
10520 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10521 %{
10522   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10523   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10524   ins_encode %{
10525     int vlen_enc = vector_length_encoding(this);
10526     BasicType bt = Matcher::vector_element_basic_type(this);
10527     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10528   %}
10529   ins_pipe(pipe_slow);
10530 %}