//
// Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
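//
// For example, the first definition below,
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// declares the low 32-bit word of xmm0 as caller-saved (SOC) under both
// conventions, spilled as a float (Op_RegF), with encoding 0.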

// XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

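// Each reg_class_dynamic below selects between the EVEX and legacy (pre-EVEX)
// classes defined above, using the given predicate to decide at runtime which
// register class the allocator actually sees.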
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
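// Class containing only the low 128 bits of XMM0, for operands that must be
// bound to XMM0 (e.g. instructions that use XMM0 implicitly).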
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(C2_MacroAssembler *masm);
 1191   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // three 5 byte instructions plus one move for unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
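// Map a vector size in bytes to the assembler's AVX vector-length encoding.
// Vectors smaller than 16 bytes still use the 128-bit encoding.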
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
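// Platform-dependent node flags, extending the generic flag bits defined in Node.
// They mark nodes affected by the Intel JCC erratum and record which condition-code
// flags (carry, parity, zero, overflow, sign) an instruction sets or clears.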
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   address base = __ start_a_stub(size_exception_handler());
 1314   if (base == nullptr) {
 1315     ciEnv::current()->record_failure("CodeCache is full");
 1316     return 0;  // CodeBuffer::expand failed
 1317   }
 1318   int offset = __ offset();
 1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1321   __ end_a_stub();
 1322   return offset;
 1323 }
 1324 
 1325 // Emit deopt handler code.
 1326 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1327 
 1328   // Note that the code buffer's insts_mark is always relative to insts.
 1329   // That's why we must use the macroassembler to generate a handler.
 1330   address base = __ start_a_stub(size_deopt_handler());
 1331   if (base == nullptr) {
 1332     ciEnv::current()->record_failure("CodeCache is full");
 1333     return 0;  // CodeBuffer::expand failed
 1334   }
 1335   int offset = __ offset();
 1336 
 1337 #ifdef _LP64
 1338   address the_pc = (address) __ pc();
 1339   Label next;
 1340   // push a "the_pc" on the stack without destroying any registers
 1341   // as they all may be live.
 1342 
 1343   // push address of "next"
 1344   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1345   __ bind(next);
 1346   // adjust it so it matches "the_pc"
 1347   __ subptr(Address(rsp, 0), __ offset() - offset);
 1348 #else
 1349   InternalAddress here(__ pc());
 1350   __ pushptr(here.addr(), noreg);
 1351 #endif
 1352 
 1353   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1354   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1355   __ end_a_stub();
 1356   return offset;
 1357 }
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
 1374   // Float masks come from different places depending on platform.
 1375 #ifdef _LP64
 1376   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1377   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1378   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1379   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1380 #else
 1381   static address float_signmask()  { return (address)float_signmask_pool; }
 1382   static address float_signflip()  { return (address)float_signflip_pool; }
 1383   static address double_signmask() { return (address)double_signmask_pool; }
 1384   static address double_signflip() { return (address)double_signflip_pool; }
 1385 #endif
 1386   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1387   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1388   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1389   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1390   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1391   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1392   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1393   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1394   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1395   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1396   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1397   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1398   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1399   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1400   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
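// Reject ideal opcodes whose match rules cannot be used with the current CPU features
// or VM flags (e.g. insufficient SSE/AVX level); all other match rules are supported
// by default.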
 1403 bool Matcher::match_rule_supported(int opcode) {
 1404   if (!has_match_rule(opcode)) {
 1405     return false; // no match rule present
 1406   }
 1407   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1408   switch (opcode) {
 1409     case Op_AbsVL:
 1410     case Op_StoreVectorScatter:
 1411       if (UseAVX < 3) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountI:
 1416     case Op_PopCountL:
 1417       if (!UsePopCountInstruction) {
 1418         return false;
 1419       }
 1420       break;
 1421     case Op_PopCountVI:
 1422       if (UseAVX < 2) {
 1423         return false;
 1424       }
 1425       break;
 1426     case Op_CompressV:
 1427     case Op_ExpandV:
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
 1444       if (VM_Version::supports_avx512dq() == false) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
 1525       if (VM_Version::supports_on_spin_wait() == false) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
 1567       if (VM_Version::supports_avx() == false) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572     case Op_LoadVectorGatherMasked:
 1573       if (UseAVX < 2) {
 1574         return false;
 1575       }
 1576       break;
 1577     case Op_FmaF:
 1578     case Op_FmaD:
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
 1593       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
 1613       if (UseAVX < 3 || !is_LP64)  {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_SqrtF:
 1664       if (UseSSE < 1) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtD:
 1669 #ifdef _LP64
 1670       if (UseSSE < 2) {
 1671         return false;
 1672       }
 1673 #else
 1674       // x86_32.ad has a special match rule for SqrtD.
 1675       // Together with common x86 rules, this handles all UseSSE cases.
 1676 #endif
 1677       break;
 1678     case Op_ConvF2HF:
 1679     case Op_ConvHF2F:
 1680       if (!VM_Version::supports_float16()) {
 1681         return false;
 1682       }
 1683       break;
 1684     case Op_VectorCastF2HF:
 1685     case Op_VectorCastHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1687         return false;
 1688       }
 1689       break;
 1690   }
 1691   return true;  // Match rules are supported by default.
 1692 }
 1693 
 1694 //------------------------------------------------------------------------
 1695 
 1696 static inline bool is_pop_count_instr_target(BasicType bt) {
 1697   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1698          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1699 }
 1700 
 1701 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1702   return match_rule_supported_vector(opcode, vlen, bt);
 1703 }
 1704 
// Identify extra cases where we might want to provide match rules for vector nodes and
// other intrinsics, guarded by vector length (vlen) and element type (bt).
 1707 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1708   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1709   if (!match_rule_supported(opcode)) {
 1710     return false;
 1711   }
 1712   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1713   //   * SSE2 supports 128bit vectors for all types;
 1714   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1715   //   * AVX2 supports 256bit vectors for all types;
 1716   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1717   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1718   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1719   // And MaxVectorSize is taken into account as well.
 1720   if (!vector_size_supported(bt, vlen)) {
 1721     return false;
 1722   }
 1723   // Special cases which require vector length follow:
 1724   //   * implementation limitations
 1725   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1726   //   * 128bit vroundpd instruction is present only in AVX1
 1727   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
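  // For example, an 8-element T_INT vector gives 8 * 4 * 8 = 256 bits (a YMM-sized vector).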
 1728   switch (opcode) {
 1729     case Op_AbsVF:
 1730     case Op_NegVF:
 1731       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1732         return false; // 512bit vandps and vxorps are not available
 1733       }
 1734       break;
 1735     case Op_AbsVD:
 1736     case Op_NegVD:
 1737       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1738         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1739       }
 1740       break;
 1741     case Op_RotateRightV:
 1742     case Op_RotateLeftV:
 1743       if (bt != T_INT && bt != T_LONG) {
 1744         return false;
 1745       } // fallthrough
 1746     case Op_MacroLogicV:
 1747       if (!VM_Version::supports_evex() ||
 1748           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1749         return false;
 1750       }
 1751       break;
 1752     case Op_ClearArray:
 1753     case Op_VectorMaskGen:
 1754     case Op_VectorCmpMasked:
 1755       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1756         return false;
 1757       }
 1758       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1759         return false;
 1760       }
 1761       break;
 1762     case Op_LoadVectorMasked:
 1763     case Op_StoreVectorMasked:
 1764       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1765         return false;
 1766       }
 1767       break;
 1768     case Op_UMinV:
 1769     case Op_UMaxV:
 1770       if (UseAVX == 0) {
 1771         return false;
 1772       }
 1773       break;
 1774     case Op_MaxV:
 1775     case Op_MinV:
 1776       if (UseSSE < 4 && is_integral_type(bt)) {
 1777         return false;
 1778       }
 1779       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1780           // Float/Double intrinsics are enabled for AVX family currently.
 1781           if (UseAVX == 0) {
 1782             return false;
 1783           }
 1784           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1785             return false;
 1786           }
 1787       }
 1788       break;
 1789     case Op_CallLeafVector:
 1790       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1791         return false;
 1792       }
 1793       break;
 1794     case Op_AddReductionVI:
 1795       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1796         return false;
 1797       }
 1798       // fallthrough
 1799     case Op_AndReductionV:
 1800     case Op_OrReductionV:
 1801     case Op_XorReductionV:
 1802       if (is_subword_type(bt) && (UseSSE < 4)) {
 1803         return false;
 1804       }
 1805 #ifndef _LP64
 1806       if (bt == T_BYTE || bt == T_LONG) {
 1807         return false;
 1808       }
 1809 #endif
 1810       break;
 1811 #ifndef _LP64
 1812     case Op_VectorInsert:
 1813       if (bt == T_LONG || bt == T_DOUBLE) {
 1814         return false;
 1815       }
 1816       break;
 1817 #endif
 1818     case Op_MinReductionV:
 1819     case Op_MaxReductionV:
 1820       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1821         return false;
 1822       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1823         return false;
 1824       }
 1825       // Float/Double intrinsics enabled for AVX family.
 1826       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1827         return false;
 1828       }
 1829       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1830         return false;
 1831       }
 1832 #ifndef _LP64
 1833       if (bt == T_BYTE || bt == T_LONG) {
 1834         return false;
 1835       }
 1836 #endif
 1837       break;
 1838     case Op_VectorTest:
 1839       if (UseSSE < 4) {
 1840         return false; // Implementation limitation
 1841       } else if (size_in_bits < 32) {
 1842         return false; // Implementation limitation
 1843       }
 1844       break;
 1845     case Op_VectorLoadShuffle:
 1846     case Op_VectorRearrange:
      if (vlen == 2) {
 1848         return false; // Implementation limitation due to how shuffle is loaded
 1849       } else if (size_in_bits == 256 && UseAVX < 2) {
 1850         return false; // Implementation limitation
 1851       }
 1852       break;
 1853     case Op_VectorLoadMask:
 1854     case Op_VectorMaskCast:
 1855       if (size_in_bits == 256 && UseAVX < 2) {
 1856         return false; // Implementation limitation
 1857       }
 1858       // fallthrough
 1859     case Op_VectorStoreMask:
 1860       if (vlen == 2) {
 1861         return false; // Implementation limitation
 1862       }
 1863       break;
 1864     case Op_PopulateIndex:
 1865       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1866         return false;
 1867       }
 1868       break;
 1869     case Op_VectorCastB2X:
 1870     case Op_VectorCastS2X:
 1871     case Op_VectorCastI2X:
 1872       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1873         return false;
 1874       }
 1875       break;
 1876     case Op_VectorCastL2X:
 1877       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1878         return false;
 1879       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1880         return false;
 1881       }
 1882       break;
 1883     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types happen after an
        // intermediate conversion to integer, and the special handling code needs the AVX2
        // vpcmpeqd instruction for 256 bit vectors.
 1887         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
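        // For example, an 8-element float source vector gives 4 * 8 * 8 = 256 bits.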
 1888         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1889           return false;
 1890         }
 1891       }
 1892       // fallthrough
 1893     case Op_VectorCastD2X:
 1894       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1895         return false;
 1896       }
 1897       break;
 1898     case Op_VectorCastF2HF:
 1899     case Op_VectorCastHF2F:
 1900       if (!VM_Version::supports_f16c() &&
 1901          ((!VM_Version::supports_evex() ||
 1902          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1903         return false;
 1904       }
 1905       break;
 1906     case Op_RoundVD:
 1907       if (!VM_Version::supports_avx512dq()) {
 1908         return false;
 1909       }
 1910       break;
 1911     case Op_MulReductionVI:
 1912       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1913         return false;
 1914       }
 1915       break;
 1916     case Op_LoadVectorGatherMasked:
 1917       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1918         return false;
 1919       }
 1920       if (is_subword_type(bt) &&
 1921          (!is_LP64                                                ||
 1922          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1923          (size_in_bits < 64)                                      ||
 1924          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1925         return false;
 1926       }
 1927       break;
 1928     case Op_StoreVectorScatterMasked:
 1929     case Op_StoreVectorScatter:
 1930       if (is_subword_type(bt)) {
 1931         return false;
 1932       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1933         return false;
 1934       }
 1935       // fallthrough
 1936     case Op_LoadVectorGather:
 1937       if (!is_subword_type(bt) && size_in_bits == 64) {
 1938         return false;
 1939       }
 1940       if (is_subword_type(bt) && size_in_bits < 64) {
 1941         return false;
 1942       }
 1943       break;
 1944     case Op_SaturatingAddV:
 1945     case Op_SaturatingSubV:
 1946       if (UseAVX < 1) {
 1947         return false; // Implementation limitation
 1948       }
 1949       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1950         return false;
 1951       }
 1952       break;
 1953     case Op_SelectFromTwoVector:
 1954        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1955          return false;
 1956        }
 1957        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1958          return false;
 1959        }
 1960        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1961          return false;
 1962        }
 1963        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1964          return false;
 1965        }
 1966        break;
 1967     case Op_MaskAll:
 1968       if (!VM_Version::supports_evex()) {
 1969         return false;
 1970       }
 1971       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1972         return false;
 1973       }
 1974       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1975         return false;
 1976       }
 1977       break;
 1978     case Op_VectorMaskCmp:
 1979       if (vlen < 2 || size_in_bits < 32) {
 1980         return false;
 1981       }
 1982       break;
 1983     case Op_CompressM:
 1984       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1985         return false;
 1986       }
 1987       break;
 1988     case Op_CompressV:
 1989     case Op_ExpandV:
 1990       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1991         return false;
 1992       }
 1993       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1994         return false;
 1995       }
      if (size_in_bits < 128) {
        return false;
      }
      break;
 1999     case Op_VectorLongToMask:
 2000       if (UseAVX < 1 || !is_LP64) {
 2001         return false;
 2002       }
 2003       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 2004         return false;
 2005       }
 2006       break;
 2007     case Op_SignumVD:
 2008     case Op_SignumVF:
 2009       if (UseAVX < 1) {
 2010         return false;
 2011       }
 2012       break;
 2013     case Op_PopCountVI:
 2014     case Op_PopCountVL: {
 2015         if (!is_pop_count_instr_target(bt) &&
 2016             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2017           return false;
 2018         }
 2019       }
 2020       break;
 2021     case Op_ReverseV:
 2022     case Op_ReverseBytesV:
 2023       if (UseAVX < 2) {
 2024         return false;
 2025       }
 2026       break;
 2027     case Op_CountTrailingZerosV:
 2028     case Op_CountLeadingZerosV:
 2029       if (UseAVX < 2) {
 2030         return false;
 2031       }
 2032       break;
 2033   }
  return true;  // By default, match rules are supported.
 2035 }
 2036 
 2037 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-generated match_rule_supported routine only checks for the existence of a
  // pattern based on the IR opcode. Most unary/binary/ternary masked operations share the
  // IR node of their non-masked counterpart, with the mask edge being the differentiator.
  // This routine therefore checks strictly for the existence of masked operation patterns:
  // it returns false by default for every opcode except those whose masked instruction
  // patterns are defined in this file.
 2044   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2045     return false;
 2046   }
 2047 
 2048   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2049   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2050   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2051     return false;
 2052   }
 2053   switch(opcode) {
 2054     // Unary masked operations
 2055     case Op_AbsVB:
 2056     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 2058         return false;  // Implementation limitation
 2059       }
 2060     case Op_AbsVI:
 2061     case Op_AbsVL:
 2062       return true;
 2063 
 2064     // Ternary masked operations
 2065     case Op_FmaVF:
 2066     case Op_FmaVD:
 2067       return true;
 2068 
 2069     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2071         return false;
 2072       }
 2073       return true;
 2074 
 2075     // Binary masked operations
 2076     case Op_AddVB:
 2077     case Op_AddVS:
 2078     case Op_SubVB:
 2079     case Op_SubVS:
 2080     case Op_MulVS:
 2081     case Op_LShiftVS:
 2082     case Op_RShiftVS:
 2083     case Op_URShiftVS:
 2084       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2085       if (!VM_Version::supports_avx512bw()) {
 2086         return false;  // Implementation limitation
 2087       }
 2088       return true;
 2089 
 2090     case Op_MulVL:
 2091       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2092       if (!VM_Version::supports_avx512dq()) {
 2093         return false;  // Implementation limitation
 2094       }
 2095       return true;
 2096 
 2097     case Op_AndV:
 2098     case Op_OrV:
 2099     case Op_XorV:
 2100     case Op_RotateRightV:
 2101     case Op_RotateLeftV:
 2102       if (bt != T_INT && bt != T_LONG) {
 2103         return false; // Implementation limitation
 2104       }
 2105       return true;
 2106 
 2107     case Op_VectorLoadMask:
 2108       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2109       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2110         return false;
 2111       }
 2112       return true;
 2113 
 2114     case Op_AddVI:
 2115     case Op_AddVL:
 2116     case Op_AddVF:
 2117     case Op_AddVD:
 2118     case Op_SubVI:
 2119     case Op_SubVL:
 2120     case Op_SubVF:
 2121     case Op_SubVD:
 2122     case Op_MulVI:
 2123     case Op_MulVF:
 2124     case Op_MulVD:
 2125     case Op_DivVF:
 2126     case Op_DivVD:
 2127     case Op_SqrtVF:
 2128     case Op_SqrtVD:
 2129     case Op_LShiftVI:
 2130     case Op_LShiftVL:
 2131     case Op_RShiftVI:
 2132     case Op_RShiftVL:
 2133     case Op_URShiftVI:
 2134     case Op_URShiftVL:
 2135     case Op_LoadVectorMasked:
 2136     case Op_StoreVectorMasked:
 2137     case Op_LoadVectorGatherMasked:
 2138     case Op_StoreVectorScatterMasked:
 2139       return true;
 2140 
 2141     case Op_UMinV:
 2142     case Op_UMaxV:
 2143       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2144         return false;
 2145       } // fallthrough
 2146     case Op_MaxV:
 2147     case Op_MinV:
 2148       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2149         return false; // Implementation limitation
 2150       }
 2151       if (is_floating_point_type(bt)) {
 2152         return false; // Implementation limitation
 2153       }
 2154       return true;
 2155     case Op_SaturatingAddV:
 2156     case Op_SaturatingSubV:
 2157       if (!is_subword_type(bt)) {
 2158         return false;
 2159       }
 2160       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2161         return false; // Implementation limitation
 2162       }
 2163       return true;
 2164 
 2165     case Op_VectorMaskCmp:
 2166       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2167         return false; // Implementation limitation
 2168       }
 2169       return true;
 2170 
 2171     case Op_VectorRearrange:
 2172       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2173         return false; // Implementation limitation
 2174       }
 2175       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2176         return false; // Implementation limitation
 2177       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2178         return false; // Implementation limitation
 2179       }
 2180       return true;
 2181 
 2182     // Binary Logical operations
 2183     case Op_AndVMask:
 2184     case Op_OrVMask:
 2185     case Op_XorVMask:
 2186       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2187         return false; // Implementation limitation
 2188       }
 2189       return true;
 2190 
 2191     case Op_PopCountVI:
 2192     case Op_PopCountVL:
 2193       if (!is_pop_count_instr_target(bt)) {
 2194         return false;
 2195       }
 2196       return true;
 2197 
 2198     case Op_MaskAll:
 2199       return true;
 2200 
 2201     case Op_CountLeadingZerosV:
 2202       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2203         return true;
 2204       }
 2205     default:
 2206       return false;
 2207   }
 2208 }
 2209 
 2210 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2211   return false;
 2212 }
 2213 
 2214 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2215   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2216   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2217   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2218       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2219     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2220     return new legVecZOper();
 2221   }
 2222   if (legacy) {
 2223     switch (ideal_reg) {
 2224       case Op_VecS: return new legVecSOper();
 2225       case Op_VecD: return new legVecDOper();
 2226       case Op_VecX: return new legVecXOper();
 2227       case Op_VecY: return new legVecYOper();
 2228       case Op_VecZ: return new legVecZOper();
 2229     }
 2230   } else {
 2231     switch (ideal_reg) {
 2232       case Op_VecS: return new vecSOper();
 2233       case Op_VecD: return new vecDOper();
 2234       case Op_VecX: return new vecXOper();
 2235       case Op_VecY: return new vecYOper();
 2236       case Op_VecZ: return new vecZOper();
 2237     }
 2238   }
 2239   ShouldNotReachHere();
 2240   return nullptr;
 2241 }
 2242 
 2243 bool Matcher::is_reg2reg_move(MachNode* m) {
 2244   switch (m->rule()) {
 2245     case MoveVec2Leg_rule:
 2246     case MoveLeg2Vec_rule:
 2247     case MoveF2VL_rule:
 2248     case MoveF2LEG_rule:
 2249     case MoveVL2F_rule:
 2250     case MoveLEG2F_rule:
 2251     case MoveD2VL_rule:
 2252     case MoveD2LEG_rule:
 2253     case MoveVL2D_rule:
 2254     case MoveLEG2D_rule:
 2255       return true;
 2256     default:
 2257       return false;
 2258   }
 2259 }
 2260 
 2261 bool Matcher::is_generic_vector(MachOper* opnd) {
 2262   switch (opnd->opcode()) {
 2263     case VEC:
 2264     case LEGVEC:
 2265       return true;
 2266     default:
 2267       return false;
 2268   }
 2269 }
 2270 
 2271 //------------------------------------------------------------------------
 2272 
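      // Register mask for the opmask (k) registers used to hold vector predicates.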
 2273 const RegMask* Matcher::predicate_reg_mask(void) {
 2274   return &_VECTMASK_REG_mask;
 2275 }
 2276 
 2277 // Max vector size in bytes. 0 if not supported.
 2278 int Matcher::vector_width_in_bytes(BasicType bt) {
 2279   assert(is_java_primitive(bt), "only primitive type vectors");
 2280   if (UseSSE < 2) return 0;
 2281   // SSE2 supports 128bit vectors for all types.
 2282   // AVX2 supports 256bit vectors for all types.
 2283   // AVX512/EVEX supports 512bit vectors for all types.
 2284   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2285   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2286   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2287     size = (UseAVX > 2) ? 64 : 32;
 2288   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2289     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2290   // Use flag to limit vector size.
 2291   size = MIN2(size,(int)MaxVectorSize);
 2292   // Minimum 2 values in vector (or 4 for bytes).
 2293   switch (bt) {
 2294   case T_DOUBLE:
 2295   case T_LONG:
 2296     if (size < 16) return 0;
 2297     break;
 2298   case T_FLOAT:
 2299   case T_INT:
 2300     if (size < 8) return 0;
 2301     break;
 2302   case T_BOOLEAN:
 2303     if (size < 4) return 0;
 2304     break;
 2305   case T_CHAR:
 2306     if (size < 4) return 0;
 2307     break;
 2308   case T_BYTE:
 2309     if (size < 4) return 0;
 2310     break;
 2311   case T_SHORT:
 2312     if (size < 4) return 0;
 2313     break;
 2314   default:
 2315     ShouldNotReachHere();
 2316   }
 2317   return size;
 2318 }
 2319 
 2320 // Limits on vector size (number of elements) loaded into vector.
 2321 int Matcher::max_vector_size(const BasicType bt) {
 2322   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2323 }
 2324 int Matcher::min_vector_size(const BasicType bt) {
 2325   int max_size = max_vector_size(bt);
 2326   // Minimum number of elements that can be loaded into a vector: 4 for single-byte types, 2 for all wider types.
 2327   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2328   // Allow single-element double vectors to support calls into SVML double64 routines.
 2329   if (bt == T_DOUBLE) {
 2330     size = 1;
 2331   }
 2332   return MIN2(size,max_size);
 2333 }
 2334 
 2335 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2336   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2337   // by default on Cascade Lake
 2338   if (VM_Version::is_default_intel_cascade_lake()) {
 2339     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2340   }
 2341   return Matcher::max_vector_size(bt);
 2342 }
 2343 
 2344 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2345   return -1;
 2346 }
 2347 
 2348 // Vector ideal reg corresponding to specified size in bytes
 2349 uint Matcher::vector_ideal_reg(int size) {
 2350   assert(MaxVectorSize >= size, "");
 2351   switch(size) {
 2352     case  4: return Op_VecS;
 2353     case  8: return Op_VecD;
 2354     case 16: return Op_VecX;
 2355     case 32: return Op_VecY;
 2356     case 64: return Op_VecZ;
 2357   }
 2358   ShouldNotReachHere();
 2359   return 0;
 2360 }
 2361 
 2362 // Check for shift by small constant as well
 2363 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2364   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2365       shift->in(2)->get_int() <= 3 &&
 2366       // Are there other uses besides address expressions?
 2367       !matcher->is_visited(shift)) {
 2368     address_visited.set(shift->_idx); // Flag as address_visited
 2369     mstack.push(shift->in(2), Matcher::Visit);
 2370     Node *conv = shift->in(1);
 2371 #ifdef _LP64
 2372     // Allow the Matcher to match the rule which bypasses the
 2373     // ConvI2L operation for an array index on LP64
 2374     // if the index value is known to be non-negative.
 2375     if (conv->Opcode() == Op_ConvI2L &&
 2376         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2377         // Are there other uses besides address expressions?
 2378         !matcher->is_visited(conv)) {
 2379       address_visited.set(conv->_idx); // Flag as address_visited
 2380       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2381     } else
 2382 #endif
 2383       mstack.push(conv, Matcher::Pre_Visit);
 2384     return true;
 2385   }
 2386   return false;
 2387 }
 2388 
 2389 // This function identifies sub-graphs in which a 'load' node is an
 2390 // input to two different nodes, such that the sub-graph can be matched
 2391 // with BMI instructions like blsi, blsr, etc.
 2392 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2393 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2394 // refers to the same node.
 2395 //
 2396 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2397 // This is a temporary solution until we make DAGs expressible in ADL.
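      // For example, is_bmi_pattern() below matches the blsi idiom from the comment
      // above as (AndI (SubI (ConI 0) (LoadI mem)) (LoadI mem)) by constructing a
      // FusedPatternMatcher<TypeInt> with con_op == Op_ConI and calling
      // match(Op_AndI, -1, Op_SubI, 1, 0).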
 2398 template<typename ConType>
 2399 class FusedPatternMatcher {
 2400   Node* _op1_node;
 2401   Node* _mop_node;
 2402   int _con_op;
 2403 
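        // Returns the input index (1 or 2) of 'n' whose opcode is 'next_op', or -1 if
        // no such input exists. A 'next_op_idx' of -1 means 'n' is commutative and both
        // inputs are tried; otherwise only the given input index is checked.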
 2404   static int match_next(Node* n, int next_op, int next_op_idx) {
 2405     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2406       return -1;
 2407     }
 2408 
 2409     if (next_op_idx == -1) { // n is commutative, try rotations
 2410       if (n->in(1)->Opcode() == next_op) {
 2411         return 1;
 2412       } else if (n->in(2)->Opcode() == next_op) {
 2413         return 2;
 2414       }
 2415     } else {
 2416       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2417       if (n->in(next_op_idx)->Opcode() == next_op) {
 2418         return next_op_idx;
 2419       }
 2420     }
 2421     return -1;
 2422   }
 2423 
 2424  public:
 2425   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2426     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2427 
 2428   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2429              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2430              typename ConType::NativeType con_value) {
 2431     if (_op1_node->Opcode() != op1) {
 2432       return false;
 2433     }
 2434     if (_mop_node->outcnt() > 2) {
 2435       return false;
 2436     }
 2437     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2438     if (op1_op2_idx == -1) {
 2439       return false;
 2440     }
 2441     // Memory operation must be the other edge
 2442     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2443 
 2444     // Check that the mop node is really what we want
 2445     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2446       Node* op2_node = _op1_node->in(op1_op2_idx);
 2447       if (op2_node->outcnt() > 1) {
 2448         return false;
 2449       }
 2450       assert(op2_node->Opcode() == op2, "Should be");
 2451       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2452       if (op2_con_idx == -1) {
 2453         return false;
 2454       }
 2455       // Memory operation must be the other edge
 2456       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2457       // Check that the memory operation is the same node
 2458       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2459         // Now check the constant
 2460         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2461         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2462           return true;
 2463         }
 2464       }
 2465     }
 2466     return false;
 2467   }
 2468 };
 2469 
 2470 static bool is_bmi_pattern(Node* n, Node* m) {
 2471   assert(UseBMI1Instructions, "sanity");
 2472   if (n != nullptr && m != nullptr) {
 2473     if (m->Opcode() == Op_LoadI) {
 2474       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2475       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2476              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2477              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2478     } else if (m->Opcode() == Op_LoadL) {
 2479       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2480       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2481              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2482              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2483     }
 2484   }
 2485   return false;
 2486 }
 2487 
 2488 // Should the matcher clone input 'm' of node 'n'?
 2489 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2490   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2491   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2492     mstack.push(m, Visit);
 2493     return true;
 2494   }
 2495   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2496     mstack.push(m, Visit);           // m = ShiftCntV
 2497     return true;
 2498   }
 2499   if (is_encode_and_store_pattern(n, m)) {
 2500     mstack.push(m, Visit);
 2501     return true;
 2502   }
 2503   return false;
 2504 }
 2505 
 2506 // Should the Matcher clone shifts on addressing modes, expecting them
 2507 // to be subsumed into complex addressing expressions or compute them
 2508 // into registers?
 2509 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2510   Node *off = m->in(AddPNode::Offset);
 2511   if (off->is_Con()) {
 2512     address_visited.test_set(m->_idx); // Flag as address_visited
 2513     Node *adr = m->in(AddPNode::Address);
 2514 
 2515     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2516     // AtomicAdd is not an addressing expression.
 2517     // Cheap to find it by looking for screwy base.
 2518     if (adr->is_AddP() &&
 2519         !adr->in(AddPNode::Base)->is_top() &&
 2520         !adr->in(AddPNode::Offset)->is_Con() &&
 2521         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2522         // Are there other uses besides address expressions?
 2523         !is_visited(adr)) {
 2524       address_visited.set(adr->_idx); // Flag as address_visited
 2525       Node *shift = adr->in(AddPNode::Offset);
 2526       if (!clone_shift(shift, this, mstack, address_visited)) {
 2527         mstack.push(shift, Pre_Visit);
 2528       }
 2529       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2530       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2531     } else {
 2532       mstack.push(adr, Pre_Visit);
 2533     }
 2534 
 2535     // Clone X+offset as it also folds into most addressing expressions
 2536     mstack.push(off, Visit);
 2537     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2538     return true;
 2539   } else if (clone_shift(off, this, mstack, address_visited)) {
 2540     address_visited.test_set(m->_idx); // Flag as address_visited
 2541     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2542     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2543     return true;
 2544   }
 2545   return false;
 2546 }
 2547 
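      // Map a BoolTest condition to the corresponding integer comparison predicate.
      // Signed and unsigned variants of the same relation map to the same predicate here.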
 2548 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2549   switch (bt) {
 2550     case BoolTest::eq:
 2551       return Assembler::eq;
 2552     case BoolTest::ne:
 2553       return Assembler::neq;
 2554     case BoolTest::le:
 2555     case BoolTest::ule:
 2556       return Assembler::le;
 2557     case BoolTest::ge:
 2558     case BoolTest::uge:
 2559       return Assembler::nlt;
 2560     case BoolTest::lt:
 2561     case BoolTest::ult:
 2562       return Assembler::lt;
 2563     case BoolTest::gt:
 2564     case BoolTest::ugt:
 2565       return Assembler::nle;
 2566     default: ShouldNotReachHere(); return Assembler::_false;
 2567   }
 2568 }
 2569 
 2570 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2571   switch (bt) {
 2572   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2573   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2574   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2575   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2576   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2577   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2578   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2579   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2580   }
 2581 }
 2582 
 2583 // Helper methods for MachSpillCopyNode::implementation().
 2584 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2585                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2586   assert(ireg == Op_VecS || // 32bit vector
 2587          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2588           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2589          "no non-adjacent vector moves" );
 2590   if (masm) {
 2591     switch (ireg) {
 2592     case Op_VecS: // copy whole register
 2593     case Op_VecD:
 2594     case Op_VecX:
 2595 #ifndef _LP64
 2596       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2597 #else
 2598       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2599         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2600       } else {
 2601         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2602       }
 2603 #endif
 2604       break;
 2605     case Op_VecY:
 2606 #ifndef _LP64
 2607       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2608 #else
 2609       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2610         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2611       } else {
 2612         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2613       }
 2614 #endif
 2615       break;
 2616     case Op_VecZ:
 2617       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2618       break;
 2619     default:
 2620       ShouldNotReachHere();
 2621     }
 2622 #ifndef PRODUCT
 2623   } else {
 2624     switch (ireg) {
 2625     case Op_VecS:
 2626     case Op_VecD:
 2627     case Op_VecX:
 2628       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2629       break;
 2630     case Op_VecY:
 2631     case Op_VecZ:
 2632       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2633       break;
 2634     default:
 2635       ShouldNotReachHere();
 2636     }
 2637 #endif
 2638   }
 2639 }
 2640 
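      // Helper for MachSpillCopyNode::implementation(): load a vector register from,
      // or store it to, the stack slot at 'stack_offset'.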
 2641 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2642                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2643   if (masm) {
 2644     if (is_load) {
 2645       switch (ireg) {
 2646       case Op_VecS:
 2647         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2648         break;
 2649       case Op_VecD:
 2650         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2651         break;
 2652       case Op_VecX:
 2653 #ifndef _LP64
 2654         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2655 #else
 2656         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2657           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2658         } else {
 2659           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2660           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2661         }
 2662 #endif
 2663         break;
 2664       case Op_VecY:
 2665 #ifndef _LP64
 2666         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2667 #else
 2668         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2669           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2670         } else {
 2671           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2672           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2673         }
 2674 #endif
 2675         break;
 2676       case Op_VecZ:
 2677         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2678         break;
 2679       default:
 2680         ShouldNotReachHere();
 2681       }
 2682     } else { // store
 2683       switch (ireg) {
 2684       case Op_VecS:
 2685         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2686         break;
 2687       case Op_VecD:
 2688         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2689         break;
 2690       case Op_VecX:
 2691 #ifndef _LP64
 2692         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2693 #else
 2694         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2695           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2696         } else {
 2698           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2699         }
 2700 #endif
 2701         break;
 2702       case Op_VecY:
 2703 #ifndef _LP64
 2704         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2705 #else
 2706         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2707           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2708         } else {
 2710           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2711         }
 2712 #endif
 2713         break;
 2714       case Op_VecZ:
 2715         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2716         break;
 2717       default:
 2718         ShouldNotReachHere();
 2719       }
 2720     }
 2721 #ifndef PRODUCT
 2722   } else {
 2723     if (is_load) {
 2724       switch (ireg) {
 2725       case Op_VecS:
 2726         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2727         break;
 2728       case Op_VecD:
 2729         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2730         break;
 2731       case Op_VecX:
 2732         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2733         break;
 2734       case Op_VecY:
 2735       case Op_VecZ:
 2736         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2737         break;
 2738       default:
 2739         ShouldNotReachHere();
 2740       }
 2741     } else { // store
 2742       switch (ireg) {
 2743       case Op_VecS:
 2744         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2745         break;
 2746       case Op_VecD:
 2747         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2748         break;
 2749       case Op_VecX:
 2750         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2751         break;
 2752       case Op_VecY:
 2753       case Op_VecZ:
 2754         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2755         break;
 2756       default:
 2757         ShouldNotReachHere();
 2758       }
 2759     }
 2760 #endif
 2761   }
 2762 }
 2763 
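      // Build a GrowableArray holding 'len' copies of the scalar constant 'con'
      // interpreted as basic type 'bt' (one jvalue per vector lane).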
 2764 template <class T>
 2765 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2766   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2767   jvalue ele;
 2768   switch (bt) {
 2769     case T_BYTE:   ele.b = con; break;
 2770     case T_SHORT:  ele.s = con; break;
 2771     case T_INT:    ele.i = con; break;
 2772     case T_LONG:   ele.j = con; break;
 2773     case T_FLOAT:  ele.f = con; break;
 2774     case T_DOUBLE: ele.d = con; break;
 2775     default: ShouldNotReachHere();
 2776   }
 2777   for (int i = 0; i < len; i++) {
 2778     val->append(ele);
 2779   }
 2780   return val;
 2781 }
 2782 
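      // Return a 64-bit pattern with only the sign (highest) bit of each 'bt'-sized lane set.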
 2783 static inline jlong high_bit_set(BasicType bt) {
 2784   switch (bt) {
 2785     case T_BYTE:  return 0x8080808080808080;
 2786     case T_SHORT: return 0x8000800080008000;
 2787     case T_INT:   return 0x8000000080000000;
 2788     case T_LONG:  return 0x8000000000000000;
 2789     default:
 2790       ShouldNotReachHere();
 2791       return 0;
 2792   }
 2793 }
 2794 
 2795 #ifndef PRODUCT
 2796   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2797     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2798   }
 2799 #endif
 2800 
 2801   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2802     __ nop(_count);
 2803   }
 2804 
 2805   uint MachNopNode::size(PhaseRegAlloc*) const {
 2806     return _count;
 2807   }
 2808 
 2809 #ifndef PRODUCT
 2810   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2811     st->print("# breakpoint");
 2812   }
 2813 #endif
 2814 
 2815   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2816     __ int3();
 2817   }
 2818 
 2819   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2820     return MachNode::size(ra_);
 2821   }
 2822 
 2823 %}
 2824 
 2825 encode %{
 2826 
 2827   enc_class call_epilog %{
 2828     if (VerifyStackAtCalls) {
 2829       // Check that stack depth is unchanged: find majik cookie on stack
 2830       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2831       Label L;
 2832       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2833       __ jccb(Assembler::equal, L);
 2834       // Die if stack mismatch
 2835       __ int3();
 2836       __ bind(L);
 2837     }
 2838   %}
 2839 
 2840 %}
 2841 
 2842 // Operands for bound floating-point register arguments
 2843 operand rxmm0() %{
 2844   constraint(ALLOC_IN_RC(xmm0_reg));
 2845   match(VecX);
 2846   format%{%}
 2847   interface(REG_INTER);
 2848 %}
 2849 
 2850 //----------OPERANDS-----------------------------------------------------------
 2851 // Operand definitions must precede instruction definitions for correct parsing
 2852 // in the ADLC because operands constitute user defined types which are used in
 2853 // instruction definitions.
 2854 
 2855 // Vectors
 2856 
 2857 // Dummy generic vector class. Should be used for all vector operands.
 2858 // Replaced with vec[SDXYZ] during post-selection pass.
 2859 operand vec() %{
 2860   constraint(ALLOC_IN_RC(dynamic));
 2861   match(VecX);
 2862   match(VecY);
 2863   match(VecZ);
 2864   match(VecS);
 2865   match(VecD);
 2866 
 2867   format %{ %}
 2868   interface(REG_INTER);
 2869 %}
 2870 
 2871 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2872 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2873 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2874 // runtime code generation via reg_class_dynamic.
 2875 operand legVec() %{
 2876   constraint(ALLOC_IN_RC(dynamic));
 2877   match(VecX);
 2878   match(VecY);
 2879   match(VecZ);
 2880   match(VecS);
 2881   match(VecD);
 2882 
 2883   format %{ %}
 2884   interface(REG_INTER);
 2885 %}
 2886 
 2887 // Replaces vec during post-selection cleanup. See above.
 2888 operand vecS() %{
 2889   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2890   match(VecS);
 2891 
 2892   format %{ %}
 2893   interface(REG_INTER);
 2894 %}
 2895 
 2896 // Replaces legVec during post-selection cleanup. See above.
 2897 operand legVecS() %{
 2898   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2899   match(VecS);
 2900 
 2901   format %{ %}
 2902   interface(REG_INTER);
 2903 %}
 2904 
 2905 // Replaces vec during post-selection cleanup. See above.
 2906 operand vecD() %{
 2907   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2908   match(VecD);
 2909 
 2910   format %{ %}
 2911   interface(REG_INTER);
 2912 %}
 2913 
 2914 // Replaces legVec during post-selection cleanup. See above.
 2915 operand legVecD() %{
 2916   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2917   match(VecD);
 2918 
 2919   format %{ %}
 2920   interface(REG_INTER);
 2921 %}
 2922 
 2923 // Replaces vec during post-selection cleanup. See above.
 2924 operand vecX() %{
 2925   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2926   match(VecX);
 2927 
 2928   format %{ %}
 2929   interface(REG_INTER);
 2930 %}
 2931 
 2932 // Replaces legVec during post-selection cleanup. See above.
 2933 operand legVecX() %{
 2934   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2935   match(VecX);
 2936 
 2937   format %{ %}
 2938   interface(REG_INTER);
 2939 %}
 2940 
 2941 // Replaces vec during post-selection cleanup. See above.
 2942 operand vecY() %{
 2943   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2944   match(VecY);
 2945 
 2946   format %{ %}
 2947   interface(REG_INTER);
 2948 %}
 2949 
 2950 // Replaces legVec during post-selection cleanup. See above.
 2951 operand legVecY() %{
 2952   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2953   match(VecY);
 2954 
 2955   format %{ %}
 2956   interface(REG_INTER);
 2957 %}
 2958 
 2959 // Replaces vec during post-selection cleanup. See above.
 2960 operand vecZ() %{
 2961   constraint(ALLOC_IN_RC(vectorz_reg));
 2962   match(VecZ);
 2963 
 2964   format %{ %}
 2965   interface(REG_INTER);
 2966 %}
 2967 
 2968 // Replaces legVec during post-selection cleanup. See above.
 2969 operand legVecZ() %{
 2970   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2971   match(VecZ);
 2972 
 2973   format %{ %}
 2974   interface(REG_INTER);
 2975 %}
 2976 
 2977 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2978 
 2979 // ============================================================================
 2980 
 2981 instruct ShouldNotReachHere() %{
 2982   match(Halt);
 2983   format %{ "stop\t# ShouldNotReachHere" %}
 2984   ins_encode %{
 2985     if (is_reachable()) {
 2986       __ stop(_halt_reason);
 2987     }
 2988   %}
 2989   ins_pipe(pipe_slow);
 2990 %}
 2991 
 2992 // ============================================================================
 2993 
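      // Scalar float/double arithmetic. The two-operand (destructive) SSE forms are used
      // when UseAVX == 0; the three-operand AVX forms are used when UseAVX > 0.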
 2994 instruct addF_reg(regF dst, regF src) %{
 2995   predicate((UseSSE>=1) && (UseAVX == 0));
 2996   match(Set dst (AddF dst src));
 2997 
 2998   format %{ "addss   $dst, $src" %}
 2999   ins_cost(150);
 3000   ins_encode %{
 3001     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3002   %}
 3003   ins_pipe(pipe_slow);
 3004 %}
 3005 
 3006 instruct addF_mem(regF dst, memory src) %{
 3007   predicate((UseSSE>=1) && (UseAVX == 0));
 3008   match(Set dst (AddF dst (LoadF src)));
 3009 
 3010   format %{ "addss   $dst, $src" %}
 3011   ins_cost(150);
 3012   ins_encode %{
 3013     __ addss($dst$$XMMRegister, $src$$Address);
 3014   %}
 3015   ins_pipe(pipe_slow);
 3016 %}
 3017 
 3018 instruct addF_imm(regF dst, immF con) %{
 3019   predicate((UseSSE>=1) && (UseAVX == 0));
 3020   match(Set dst (AddF dst con));
 3021   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3022   ins_cost(150);
 3023   ins_encode %{
 3024     __ addss($dst$$XMMRegister, $constantaddress($con));
 3025   %}
 3026   ins_pipe(pipe_slow);
 3027 %}
 3028 
 3029 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3030   predicate(UseAVX > 0);
 3031   match(Set dst (AddF src1 src2));
 3032 
 3033   format %{ "vaddss  $dst, $src1, $src2" %}
 3034   ins_cost(150);
 3035   ins_encode %{
 3036     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3037   %}
 3038   ins_pipe(pipe_slow);
 3039 %}
 3040 
 3041 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3042   predicate(UseAVX > 0);
 3043   match(Set dst (AddF src1 (LoadF src2)));
 3044 
 3045   format %{ "vaddss  $dst, $src1, $src2" %}
 3046   ins_cost(150);
 3047   ins_encode %{
 3048     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3049   %}
 3050   ins_pipe(pipe_slow);
 3051 %}
 3052 
 3053 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3054   predicate(UseAVX > 0);
 3055   match(Set dst (AddF src con));
 3056 
 3057   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3058   ins_cost(150);
 3059   ins_encode %{
 3060     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3061   %}
 3062   ins_pipe(pipe_slow);
 3063 %}
 3064 
 3065 instruct addD_reg(regD dst, regD src) %{
 3066   predicate((UseSSE>=2) && (UseAVX == 0));
 3067   match(Set dst (AddD dst src));
 3068 
 3069   format %{ "addsd   $dst, $src" %}
 3070   ins_cost(150);
 3071   ins_encode %{
 3072     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3073   %}
 3074   ins_pipe(pipe_slow);
 3075 %}
 3076 
 3077 instruct addD_mem(regD dst, memory src) %{
 3078   predicate((UseSSE>=2) && (UseAVX == 0));
 3079   match(Set dst (AddD dst (LoadD src)));
 3080 
 3081   format %{ "addsd   $dst, $src" %}
 3082   ins_cost(150);
 3083   ins_encode %{
 3084     __ addsd($dst$$XMMRegister, $src$$Address);
 3085   %}
 3086   ins_pipe(pipe_slow);
 3087 %}
 3088 
 3089 instruct addD_imm(regD dst, immD con) %{
 3090   predicate((UseSSE>=2) && (UseAVX == 0));
 3091   match(Set dst (AddD dst con));
 3092   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3093   ins_cost(150);
 3094   ins_encode %{
 3095     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3096   %}
 3097   ins_pipe(pipe_slow);
 3098 %}
 3099 
 3100 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3101   predicate(UseAVX > 0);
 3102   match(Set dst (AddD src1 src2));
 3103 
 3104   format %{ "vaddsd  $dst, $src1, $src2" %}
 3105   ins_cost(150);
 3106   ins_encode %{
 3107     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3108   %}
 3109   ins_pipe(pipe_slow);
 3110 %}
 3111 
 3112 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3113   predicate(UseAVX > 0);
 3114   match(Set dst (AddD src1 (LoadD src2)));
 3115 
 3116   format %{ "vaddsd  $dst, $src1, $src2" %}
 3117   ins_cost(150);
 3118   ins_encode %{
 3119     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3120   %}
 3121   ins_pipe(pipe_slow);
 3122 %}
 3123 
 3124 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3125   predicate(UseAVX > 0);
 3126   match(Set dst (AddD src con));
 3127 
 3128   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3129   ins_cost(150);
 3130   ins_encode %{
 3131     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3132   %}
 3133   ins_pipe(pipe_slow);
 3134 %}
 3135 
 3136 instruct subF_reg(regF dst, regF src) %{
 3137   predicate((UseSSE>=1) && (UseAVX == 0));
 3138   match(Set dst (SubF dst src));
 3139 
 3140   format %{ "subss   $dst, $src" %}
 3141   ins_cost(150);
 3142   ins_encode %{
 3143     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3144   %}
 3145   ins_pipe(pipe_slow);
 3146 %}
 3147 
 3148 instruct subF_mem(regF dst, memory src) %{
 3149   predicate((UseSSE>=1) && (UseAVX == 0));
 3150   match(Set dst (SubF dst (LoadF src)));
 3151 
 3152   format %{ "subss   $dst, $src" %}
 3153   ins_cost(150);
 3154   ins_encode %{
 3155     __ subss($dst$$XMMRegister, $src$$Address);
 3156   %}
 3157   ins_pipe(pipe_slow);
 3158 %}
 3159 
 3160 instruct subF_imm(regF dst, immF con) %{
 3161   predicate((UseSSE>=1) && (UseAVX == 0));
 3162   match(Set dst (SubF dst con));
 3163   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3164   ins_cost(150);
 3165   ins_encode %{
 3166     __ subss($dst$$XMMRegister, $constantaddress($con));
 3167   %}
 3168   ins_pipe(pipe_slow);
 3169 %}
 3170 
 3171 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3172   predicate(UseAVX > 0);
 3173   match(Set dst (SubF src1 src2));
 3174 
 3175   format %{ "vsubss  $dst, $src1, $src2" %}
 3176   ins_cost(150);
 3177   ins_encode %{
 3178     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3179   %}
 3180   ins_pipe(pipe_slow);
 3181 %}
 3182 
 3183 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3184   predicate(UseAVX > 0);
 3185   match(Set dst (SubF src1 (LoadF src2)));
 3186 
 3187   format %{ "vsubss  $dst, $src1, $src2" %}
 3188   ins_cost(150);
 3189   ins_encode %{
 3190     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3191   %}
 3192   ins_pipe(pipe_slow);
 3193 %}
 3194 
 3195 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3196   predicate(UseAVX > 0);
 3197   match(Set dst (SubF src con));
 3198 
 3199   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3200   ins_cost(150);
 3201   ins_encode %{
 3202     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3203   %}
 3204   ins_pipe(pipe_slow);
 3205 %}
 3206 
 3207 instruct subD_reg(regD dst, regD src) %{
 3208   predicate((UseSSE>=2) && (UseAVX == 0));
 3209   match(Set dst (SubD dst src));
 3210 
 3211   format %{ "subsd   $dst, $src" %}
 3212   ins_cost(150);
 3213   ins_encode %{
 3214     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3215   %}
 3216   ins_pipe(pipe_slow);
 3217 %}
 3218 
 3219 instruct subD_mem(regD dst, memory src) %{
 3220   predicate((UseSSE>=2) && (UseAVX == 0));
 3221   match(Set dst (SubD dst (LoadD src)));
 3222 
 3223   format %{ "subsd   $dst, $src" %}
 3224   ins_cost(150);
 3225   ins_encode %{
 3226     __ subsd($dst$$XMMRegister, $src$$Address);
 3227   %}
 3228   ins_pipe(pipe_slow);
 3229 %}
 3230 
 3231 instruct subD_imm(regD dst, immD con) %{
 3232   predicate((UseSSE>=2) && (UseAVX == 0));
 3233   match(Set dst (SubD dst con));
 3234   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3235   ins_cost(150);
 3236   ins_encode %{
 3237     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3238   %}
 3239   ins_pipe(pipe_slow);
 3240 %}
 3241 
 3242 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3243   predicate(UseAVX > 0);
 3244   match(Set dst (SubD src1 src2));
 3245 
 3246   format %{ "vsubsd  $dst, $src1, $src2" %}
 3247   ins_cost(150);
 3248   ins_encode %{
 3249     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3250   %}
 3251   ins_pipe(pipe_slow);
 3252 %}
 3253 
 3254 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3255   predicate(UseAVX > 0);
 3256   match(Set dst (SubD src1 (LoadD src2)));
 3257 
 3258   format %{ "vsubsd  $dst, $src1, $src2" %}
 3259   ins_cost(150);
 3260   ins_encode %{
 3261     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3262   %}
 3263   ins_pipe(pipe_slow);
 3264 %}
 3265 
 3266 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3267   predicate(UseAVX > 0);
 3268   match(Set dst (SubD src con));
 3269 
 3270   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3271   ins_cost(150);
 3272   ins_encode %{
 3273     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3274   %}
 3275   ins_pipe(pipe_slow);
 3276 %}
 3277 
 3278 instruct mulF_reg(regF dst, regF src) %{
 3279   predicate((UseSSE>=1) && (UseAVX == 0));
 3280   match(Set dst (MulF dst src));
 3281 
 3282   format %{ "mulss   $dst, $src" %}
 3283   ins_cost(150);
 3284   ins_encode %{
 3285     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3286   %}
 3287   ins_pipe(pipe_slow);
 3288 %}
 3289 
 3290 instruct mulF_mem(regF dst, memory src) %{
 3291   predicate((UseSSE>=1) && (UseAVX == 0));
 3292   match(Set dst (MulF dst (LoadF src)));
 3293 
 3294   format %{ "mulss   $dst, $src" %}
 3295   ins_cost(150);
 3296   ins_encode %{
 3297     __ mulss($dst$$XMMRegister, $src$$Address);
 3298   %}
 3299   ins_pipe(pipe_slow);
 3300 %}
 3301 
 3302 instruct mulF_imm(regF dst, immF con) %{
 3303   predicate((UseSSE>=1) && (UseAVX == 0));
 3304   match(Set dst (MulF dst con));
 3305   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3306   ins_cost(150);
 3307   ins_encode %{
 3308     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3309   %}
 3310   ins_pipe(pipe_slow);
 3311 %}
 3312 
 3313 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3314   predicate(UseAVX > 0);
 3315   match(Set dst (MulF src1 src2));
 3316 
 3317   format %{ "vmulss  $dst, $src1, $src2" %}
 3318   ins_cost(150);
 3319   ins_encode %{
 3320     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3321   %}
 3322   ins_pipe(pipe_slow);
 3323 %}
 3324 
 3325 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3326   predicate(UseAVX > 0);
 3327   match(Set dst (MulF src1 (LoadF src2)));
 3328 
 3329   format %{ "vmulss  $dst, $src1, $src2" %}
 3330   ins_cost(150);
 3331   ins_encode %{
 3332     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3333   %}
 3334   ins_pipe(pipe_slow);
 3335 %}
 3336 
 3337 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3338   predicate(UseAVX > 0);
 3339   match(Set dst (MulF src con));
 3340 
 3341   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3342   ins_cost(150);
 3343   ins_encode %{
 3344     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3345   %}
 3346   ins_pipe(pipe_slow);
 3347 %}
 3348 
 3349 instruct mulD_reg(regD dst, regD src) %{
 3350   predicate((UseSSE>=2) && (UseAVX == 0));
 3351   match(Set dst (MulD dst src));
 3352 
 3353   format %{ "mulsd   $dst, $src" %}
 3354   ins_cost(150);
 3355   ins_encode %{
 3356     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3357   %}
 3358   ins_pipe(pipe_slow);
 3359 %}
 3360 
 3361 instruct mulD_mem(regD dst, memory src) %{
 3362   predicate((UseSSE>=2) && (UseAVX == 0));
 3363   match(Set dst (MulD dst (LoadD src)));
 3364 
 3365   format %{ "mulsd   $dst, $src" %}
 3366   ins_cost(150);
 3367   ins_encode %{
 3368     __ mulsd($dst$$XMMRegister, $src$$Address);
 3369   %}
 3370   ins_pipe(pipe_slow);
 3371 %}
 3372 
 3373 instruct mulD_imm(regD dst, immD con) %{
 3374   predicate((UseSSE>=2) && (UseAVX == 0));
 3375   match(Set dst (MulD dst con));
 3376   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3377   ins_cost(150);
 3378   ins_encode %{
 3379     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3380   %}
 3381   ins_pipe(pipe_slow);
 3382 %}
 3383 
 3384 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3385   predicate(UseAVX > 0);
 3386   match(Set dst (MulD src1 src2));
 3387 
 3388   format %{ "vmulsd  $dst, $src1, $src2" %}
 3389   ins_cost(150);
 3390   ins_encode %{
 3391     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3392   %}
 3393   ins_pipe(pipe_slow);
 3394 %}
 3395 
 3396 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3397   predicate(UseAVX > 0);
 3398   match(Set dst (MulD src1 (LoadD src2)));
 3399 
 3400   format %{ "vmulsd  $dst, $src1, $src2" %}
 3401   ins_cost(150);
 3402   ins_encode %{
 3403     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3404   %}
 3405   ins_pipe(pipe_slow);
 3406 %}
 3407 
 3408 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3409   predicate(UseAVX > 0);
 3410   match(Set dst (MulD src con));
 3411 
 3412   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3413   ins_cost(150);
 3414   ins_encode %{
 3415     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3416   %}
 3417   ins_pipe(pipe_slow);
 3418 %}
 3419 
 3420 instruct divF_reg(regF dst, regF src) %{
 3421   predicate((UseSSE>=1) && (UseAVX == 0));
 3422   match(Set dst (DivF dst src));
 3423 
 3424   format %{ "divss   $dst, $src" %}
 3425   ins_cost(150);
 3426   ins_encode %{
 3427     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3428   %}
 3429   ins_pipe(pipe_slow);
 3430 %}
 3431 
 3432 instruct divF_mem(regF dst, memory src) %{
 3433   predicate((UseSSE>=1) && (UseAVX == 0));
 3434   match(Set dst (DivF dst (LoadF src)));
 3435 
 3436   format %{ "divss   $dst, $src" %}
 3437   ins_cost(150);
 3438   ins_encode %{
 3439     __ divss($dst$$XMMRegister, $src$$Address);
 3440   %}
 3441   ins_pipe(pipe_slow);
 3442 %}
 3443 
 3444 instruct divF_imm(regF dst, immF con) %{
 3445   predicate((UseSSE>=1) && (UseAVX == 0));
 3446   match(Set dst (DivF dst con));
 3447   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3448   ins_cost(150);
 3449   ins_encode %{
 3450     __ divss($dst$$XMMRegister, $constantaddress($con));
 3451   %}
 3452   ins_pipe(pipe_slow);
 3453 %}
 3454 
 3455 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3456   predicate(UseAVX > 0);
 3457   match(Set dst (DivF src1 src2));
 3458 
 3459   format %{ "vdivss  $dst, $src1, $src2" %}
 3460   ins_cost(150);
 3461   ins_encode %{
 3462     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3463   %}
 3464   ins_pipe(pipe_slow);
 3465 %}
 3466 
 3467 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3468   predicate(UseAVX > 0);
 3469   match(Set dst (DivF src1 (LoadF src2)));
 3470 
 3471   format %{ "vdivss  $dst, $src1, $src2" %}
 3472   ins_cost(150);
 3473   ins_encode %{
 3474     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3475   %}
 3476   ins_pipe(pipe_slow);
 3477 %}
 3478 
 3479 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3480   predicate(UseAVX > 0);
 3481   match(Set dst (DivF src con));
 3482 
 3483   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3484   ins_cost(150);
 3485   ins_encode %{
 3486     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3487   %}
 3488   ins_pipe(pipe_slow);
 3489 %}
 3490 
 3491 instruct divD_reg(regD dst, regD src) %{
 3492   predicate((UseSSE>=2) && (UseAVX == 0));
 3493   match(Set dst (DivD dst src));
 3494 
 3495   format %{ "divsd   $dst, $src" %}
 3496   ins_cost(150);
 3497   ins_encode %{
 3498     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3499   %}
 3500   ins_pipe(pipe_slow);
 3501 %}
 3502 
 3503 instruct divD_mem(regD dst, memory src) %{
 3504   predicate((UseSSE>=2) && (UseAVX == 0));
 3505   match(Set dst (DivD dst (LoadD src)));
 3506 
 3507   format %{ "divsd   $dst, $src" %}
 3508   ins_cost(150);
 3509   ins_encode %{
 3510     __ divsd($dst$$XMMRegister, $src$$Address);
 3511   %}
 3512   ins_pipe(pipe_slow);
 3513 %}
 3514 
 3515 instruct divD_imm(regD dst, immD con) %{
 3516   predicate((UseSSE>=2) && (UseAVX == 0));
 3517   match(Set dst (DivD dst con));
 3518   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3519   ins_cost(150);
 3520   ins_encode %{
 3521     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3522   %}
 3523   ins_pipe(pipe_slow);
 3524 %}
 3525 
 3526 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3527   predicate(UseAVX > 0);
 3528   match(Set dst (DivD src1 src2));
 3529 
 3530   format %{ "vdivsd  $dst, $src1, $src2" %}
 3531   ins_cost(150);
 3532   ins_encode %{
 3533     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3534   %}
 3535   ins_pipe(pipe_slow);
 3536 %}
 3537 
 3538 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3539   predicate(UseAVX > 0);
 3540   match(Set dst (DivD src1 (LoadD src2)));
 3541 
 3542   format %{ "vdivsd  $dst, $src1, $src2" %}
 3543   ins_cost(150);
 3544   ins_encode %{
 3545     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3546   %}
 3547   ins_pipe(pipe_slow);
 3548 %}
 3549 
 3550 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3551   predicate(UseAVX > 0);
 3552   match(Set dst (DivD src con));
 3553 
 3554   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3555   ins_cost(150);
 3556   ins_encode %{
 3557     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3558   %}
 3559   ins_pipe(pipe_slow);
 3560 %}
 3561 
 3562 instruct absF_reg(regF dst) %{
 3563   predicate((UseSSE>=1) && (UseAVX == 0));
 3564   match(Set dst (AbsF dst));
 3565   ins_cost(150);
 3566   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3567   ins_encode %{
 3568     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3569   %}
 3570   ins_pipe(pipe_slow);
 3571 %}
 3572 
 3573 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3574   predicate(UseAVX > 0);
 3575   match(Set dst (AbsF src));
 3576   ins_cost(150);
 3577   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3578   ins_encode %{
 3579     int vlen_enc = Assembler::AVX_128bit;
 3580     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3581               ExternalAddress(float_signmask()), vlen_enc);
 3582   %}
 3583   ins_pipe(pipe_slow);
 3584 %}
 3585 
 3586 instruct absD_reg(regD dst) %{
 3587   predicate((UseSSE>=2) && (UseAVX == 0));
 3588   match(Set dst (AbsD dst));
 3589   ins_cost(150);
 3590   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3591             "# abs double by sign masking" %}
 3592   ins_encode %{
 3593     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3594   %}
 3595   ins_pipe(pipe_slow);
 3596 %}
 3597 
 3598 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3599   predicate(UseAVX > 0);
 3600   match(Set dst (AbsD src));
 3601   ins_cost(150);
 3602   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3603             "# abs double by sign masking" %}
 3604   ins_encode %{
 3605     int vlen_enc = Assembler::AVX_128bit;
 3606     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3607               ExternalAddress(double_signmask()), vlen_enc);
 3608   %}
 3609   ins_pipe(pipe_slow);
 3610 %}
 3611 
 3612 instruct negF_reg(regF dst) %{
 3613   predicate((UseSSE>=1) && (UseAVX == 0));
 3614   match(Set dst (NegF dst));
 3615   ins_cost(150);
 3616   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3617   ins_encode %{
 3618     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3619   %}
 3620   ins_pipe(pipe_slow);
 3621 %}
 3622 
 3623 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3624   predicate(UseAVX > 0);
 3625   match(Set dst (NegF src));
 3626   ins_cost(150);
 3627   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3628   ins_encode %{
 3629     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3630                  ExternalAddress(float_signflip()));
 3631   %}
 3632   ins_pipe(pipe_slow);
 3633 %}
 3634 
 3635 instruct negD_reg(regD dst) %{
 3636   predicate((UseSSE>=2) && (UseAVX == 0));
 3637   match(Set dst (NegD dst));
 3638   ins_cost(150);
 3639   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3640             "# neg double by sign flipping" %}
 3641   ins_encode %{
 3642     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3643   %}
 3644   ins_pipe(pipe_slow);
 3645 %}
 3646 
 3647 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3648   predicate(UseAVX > 0);
 3649   match(Set dst (NegD src));
 3650   ins_cost(150);
 3651   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3652             "# neg double by sign flipping" %}
 3653   ins_encode %{
 3654     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3655                  ExternalAddress(double_signflip()));
 3656   %}
 3657   ins_pipe(pipe_slow);
 3658 %}
 3659 
 3660 // The sqrtss instruction needs its destination register to be pre-initialized for best performance;
 3661 // therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3662 instruct sqrtF_reg(regF dst) %{
 3663   predicate(UseSSE>=1);
 3664   match(Set dst (SqrtF dst));
 3665   format %{ "sqrtss  $dst, $dst" %}
 3666   ins_encode %{
 3667     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3668   %}
 3669   ins_pipe(pipe_slow);
 3670 %}
 3671 
 3672 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance;
 3673 // therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3674 instruct sqrtD_reg(regD dst) %{
 3675   predicate(UseSSE>=2);
 3676   match(Set dst (SqrtD dst));
 3677   format %{ "sqrtsd  $dst, $dst" %}
 3678   ins_encode %{
 3679     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3680   %}
 3681   ins_pipe(pipe_slow);
 3682 %}
 3683 
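      // Conversions between single-precision floats and half-precision (FP16) values.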
 3684 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3685   effect(TEMP tmp);
 3686   match(Set dst (ConvF2HF src));
 3687   ins_cost(125);
 3688   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3689   ins_encode %{
 3690     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3691   %}
 3692   ins_pipe( pipe_slow );
 3693 %}
 3694 
 3695 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3696   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3697   effect(TEMP ktmp, TEMP rtmp);
 3698   match(Set mem (StoreC mem (ConvF2HF src)));
 3699   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3700   ins_encode %{
 3701     __ movl($rtmp$$Register, 0x1);
 3702     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3703     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3704   %}
 3705   ins_pipe( pipe_slow );
 3706 %}
 3707 
 3708 instruct vconvF2HF(vec dst, vec src) %{
 3709   match(Set dst (VectorCastF2HF src));
 3710   format %{ "vector_conv_F2HF $dst $src" %}
 3711   ins_encode %{
 3712     int vlen_enc = vector_length_encoding(this, $src);
 3713     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3714   %}
 3715   ins_pipe( pipe_slow );
 3716 %}
 3717 
 3718 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3719   predicate(n->as_StoreVector()->memory_size() >= 16);
 3720   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3721   format %{ "vcvtps2ph $mem,$src" %}
 3722   ins_encode %{
 3723     int vlen_enc = vector_length_encoding(this, $src);
 3724     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3725   %}
 3726   ins_pipe( pipe_slow );
 3727 %}
 3728 
 3729 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3730   match(Set dst (ConvHF2F src));
 3731   format %{ "vcvtph2ps $dst,$src" %}
 3732   ins_encode %{
 3733     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3734   %}
 3735   ins_pipe( pipe_slow );
 3736 %}
 3737 
 3738 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3739   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3740   format %{ "vcvtph2ps $dst,$mem" %}
 3741   ins_encode %{
 3742     int vlen_enc = vector_length_encoding(this);
 3743     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3744   %}
 3745   ins_pipe( pipe_slow );
 3746 %}
 3747 
 3748 instruct vconvHF2F(vec dst, vec src) %{
 3749   match(Set dst (VectorCastHF2F src));
 3750   ins_cost(125);
 3751   format %{ "vector_conv_HF2F $dst,$src" %}
 3752   ins_encode %{
 3753     int vlen_enc = vector_length_encoding(this);
 3754     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3755   %}
 3756   ins_pipe( pipe_slow );
 3757 %}
 3758 
 3759 // ---------------------------------------- VectorReinterpret ------------------------------------
 3760 instruct reinterpret_mask(kReg dst) %{
 3761   predicate(n->bottom_type()->isa_vectmask() &&
 3762             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3763   match(Set dst (VectorReinterpret dst));
 3764   ins_cost(125);
 3765   format %{ "vector_reinterpret $dst\t!" %}
 3766   ins_encode %{
 3767     // empty
 3768   %}
 3769   ins_pipe( pipe_slow );
 3770 %}
 3771 
 3772 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3773   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3774             n->bottom_type()->isa_vectmask() &&
 3775             n->in(1)->bottom_type()->isa_vectmask() &&
 3776             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3777             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bits
 3778   match(Set dst (VectorReinterpret src));
 3779   effect(TEMP xtmp);
 3780   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3781   ins_encode %{
 3782      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3783      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3784      assert(src_sz == dst_sz , "src and dst size mismatch");
 3785      int vlen_enc = vector_length_encoding(src_sz);
 3786      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3787      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3788   %}
 3789   ins_pipe( pipe_slow );
 3790 %}
 3791 
 3792 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3793   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3794             n->bottom_type()->isa_vectmask() &&
 3795             n->in(1)->bottom_type()->isa_vectmask() &&
 3796             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3797              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3798             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bits
 3799   match(Set dst (VectorReinterpret src));
 3800   effect(TEMP xtmp);
 3801   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3802   ins_encode %{
 3803      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3804      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3805      assert(src_sz == dst_sz , "src and dst size mismatch");
 3806      int vlen_enc = vector_length_encoding(src_sz);
 3807      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3808      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3809   %}
 3810   ins_pipe( pipe_slow );
 3811 %}
 3812 
 3813 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3814   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3815             n->bottom_type()->isa_vectmask() &&
 3816             n->in(1)->bottom_type()->isa_vectmask() &&
 3817             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3818              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3819             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bits
 3820   match(Set dst (VectorReinterpret src));
 3821   effect(TEMP xtmp);
 3822   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3823   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
    int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3830   %}
 3831   ins_pipe( pipe_slow );
 3832 %}
 3833 
 3834 instruct reinterpret(vec dst) %{
 3835   predicate(!n->bottom_type()->isa_vectmask() &&
 3836             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3837   match(Set dst (VectorReinterpret dst));
 3838   ins_cost(125);
 3839   format %{ "vector_reinterpret $dst\t!" %}
 3840   ins_encode %{
 3841     // empty
 3842   %}
 3843   ins_pipe( pipe_slow );
 3844 %}
 3845 
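// Expanding reinterpret on SSE-only targets: AND the source with a mask covering its
// original width so the upper lanes of the widened destination read as zero.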
 3846 instruct reinterpret_expand(vec dst, vec src) %{
 3847   predicate(UseAVX == 0 &&
 3848             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3849   match(Set dst (VectorReinterpret src));
 3850   ins_cost(125);
 3851   effect(TEMP dst);
 3852   format %{ "vector_reinterpret_expand $dst,$src" %}
 3853   ins_encode %{
 3854     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3855     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3856 
 3857     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3858     if (src_vlen_in_bytes == 4) {
 3859       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3860     } else {
 3861       assert(src_vlen_in_bytes == 8, "");
 3862       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3863     }
 3864     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3865   %}
 3866   ins_pipe( pipe_slow );
 3867 %}
 3868 
 3869 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3870   predicate(UseAVX > 0 &&
 3871             !n->bottom_type()->isa_vectmask() &&
 3872             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3873             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3874   match(Set dst (VectorReinterpret src));
 3875   ins_cost(125);
 3876   format %{ "vector_reinterpret_expand $dst,$src" %}
 3877   ins_encode %{
 3878     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3879   %}
 3880   ins_pipe( pipe_slow );
 3881 %}
 3882 
 3883 
 3884 instruct vreinterpret_expand(legVec dst, vec src) %{
 3885   predicate(UseAVX > 0 &&
 3886             !n->bottom_type()->isa_vectmask() &&
 3887             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3888             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3889   match(Set dst (VectorReinterpret src));
 3890   ins_cost(125);
 3891   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3892   ins_encode %{
 3893     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3894       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3895       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3896       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3897       default: ShouldNotReachHere();
 3898     }
 3899   %}
 3900   ins_pipe( pipe_slow );
 3901 %}
 3902 
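// Shrinking reinterpret: copy only as many low-order bytes of the source as the
// destination vector holds.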
 3903 instruct reinterpret_shrink(vec dst, legVec src) %{
 3904   predicate(!n->bottom_type()->isa_vectmask() &&
 3905             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3906   match(Set dst (VectorReinterpret src));
 3907   ins_cost(125);
 3908   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3909   ins_encode %{
 3910     switch (Matcher::vector_length_in_bytes(this)) {
 3911       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3912       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3913       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3914       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3915       default: ShouldNotReachHere();
 3916     }
 3917   %}
 3918   ins_pipe( pipe_slow );
 3919 %}
 3920 
 3921 // ----------------------------------------------------------------------------------------------------
 3922 
 3923 #ifdef _LP64
 3924 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3925   match(Set dst (RoundDoubleMode src rmode));
 3926   format %{ "roundsd $dst,$src" %}
 3927   ins_cost(150);
 3928   ins_encode %{
 3929     assert(UseSSE >= 4, "required");
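    // Without AVX, roundsd leaves the upper bits of dst unchanged, which would create
    // a false dependence on dst's previous contents; clear dst first to break it.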
 3930     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3931       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3932     }
 3933     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3934   %}
 3935   ins_pipe(pipe_slow);
 3936 %}
 3937 
 3938 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3939   match(Set dst (RoundDoubleMode con rmode));
 3940   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3941   ins_cost(150);
 3942   ins_encode %{
 3943     assert(UseSSE >= 4, "required");
 3944     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3945   %}
 3946   ins_pipe(pipe_slow);
 3947 %}
 3948 
 3949 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3950   predicate(Matcher::vector_length(n) < 8);
 3951   match(Set dst (RoundDoubleModeV src rmode));
 3952   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3953   ins_encode %{
 3954     assert(UseAVX > 0, "required");
 3955     int vlen_enc = vector_length_encoding(this);
 3956     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3957   %}
 3958   ins_pipe( pipe_slow );
 3959 %}
 3960 
 3961 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3962   predicate(Matcher::vector_length(n) == 8);
 3963   match(Set dst (RoundDoubleModeV src rmode));
 3964   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3965   ins_encode %{
 3966     assert(UseAVX > 2, "required");
 3967     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3968   %}
 3969   ins_pipe( pipe_slow );
 3970 %}
 3971 
 3972 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3973   predicate(Matcher::vector_length(n) < 8);
 3974   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3975   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3976   ins_encode %{
 3977     assert(UseAVX > 0, "required");
 3978     int vlen_enc = vector_length_encoding(this);
 3979     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3980   %}
 3981   ins_pipe( pipe_slow );
 3982 %}
 3983 
 3984 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3985   predicate(Matcher::vector_length(n) == 8);
 3986   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3987   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3988   ins_encode %{
 3989     assert(UseAVX > 2, "required");
 3990     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3991   %}
 3992   ins_pipe( pipe_slow );
 3993 %}
 3994 #endif // _LP64
 3995 
 3996 instruct onspinwait() %{
 3997   match(OnSpinWait);
 3998   ins_cost(200);
 3999 
 4000   format %{
 4001     $$template
 4002     $$emit$$"pause\t! membar_onspinwait"
 4003   %}
 4004   ins_encode %{
 4005     __ pause();
 4006   %}
 4007   ins_pipe(pipe_slow);
 4008 %}
 4009 
 4010 // a * b + c
 4011 instruct fmaD_reg(regD a, regD b, regD c) %{
 4012   match(Set c (FmaD  c (Binary a b)));
 4013   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4014   ins_cost(150);
 4015   ins_encode %{
 4016     assert(UseFMA, "Needs FMA instructions support.");
 4017     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4018   %}
 4019   ins_pipe( pipe_slow );
 4020 %}
 4021 
 4022 // a * b + c
 4023 instruct fmaF_reg(regF a, regF b, regF c) %{
 4024   match(Set c (FmaF  c (Binary a b)));
 4025   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4026   ins_cost(150);
 4027   ins_encode %{
 4028     assert(UseFMA, "Needs FMA instructions support.");
 4029     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4030   %}
 4031   ins_pipe( pipe_slow );
 4032 %}
 4033 
 4034 // ====================VECTOR INSTRUCTIONS=====================================
 4035 
 4036 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4037 instruct MoveVec2Leg(legVec dst, vec src) %{
 4038   match(Set dst src);
 4039   format %{ "" %}
 4040   ins_encode %{
 4041     ShouldNotReachHere();
 4042   %}
 4043   ins_pipe( fpu_reg_reg );
 4044 %}
 4045 
 4046 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4047   match(Set dst src);
 4048   format %{ "" %}
 4049   ins_encode %{
 4050     ShouldNotReachHere();
 4051   %}
 4052   ins_pipe( fpu_reg_reg );
 4053 %}
 4054 
 4055 // ============================================================================
 4056 
 4057 // Load vectors generic operand pattern
 4058 instruct loadV(vec dst, memory mem) %{
 4059   match(Set dst (LoadVector mem));
 4060   ins_cost(125);
 4061   format %{ "load_vector $dst,$mem" %}
 4062   ins_encode %{
 4063     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4064   %}
 4065   ins_pipe( pipe_slow );
 4066 %}
 4067 
 4068 // Store vectors generic operand pattern.
 4069 instruct storeV(memory mem, vec src) %{
 4070   match(Set mem (StoreVector mem src));
 4071   ins_cost(145);
 4072   format %{ "store_vector $mem,$src\n\t" %}
 4073   ins_encode %{
 4074     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4075       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4076       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4077       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4078       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4079       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4080       default: ShouldNotReachHere();
 4081     }
 4082   %}
 4083   ins_pipe( pipe_slow );
 4084 %}
 4085 
 4086 // ---------------------------------------- Gather ------------------------------------
 4087 
 4088 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4089 
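// Gathers of int/long/float/double map onto the hardware vgather/evgather forms with
// either an all-ones mask (unmasked) or the user-supplied mask. Subword (byte/short)
// gathers have no hardware equivalent and are emulated further below with scalar loads.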
 4090 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4091   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4092             Matcher::vector_length_in_bytes(n) <= 32);
 4093   match(Set dst (LoadVectorGather mem idx));
 4094   effect(TEMP dst, TEMP tmp, TEMP mask);
 4095   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4096   ins_encode %{
 4097     int vlen_enc = vector_length_encoding(this);
 4098     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4099     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4100     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4101     __ lea($tmp$$Register, $mem$$Address);
 4102     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4103   %}
 4104   ins_pipe( pipe_slow );
 4105 %}
 4106 
 4107 
 4108 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4109   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4110             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4111   match(Set dst (LoadVectorGather mem idx));
 4112   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4113   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 4114   ins_encode %{
 4115     int vlen_enc = vector_length_encoding(this);
 4116     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4117     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4118     __ lea($tmp$$Register, $mem$$Address);
 4119     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4120   %}
 4121   ins_pipe( pipe_slow );
 4122 %}
 4123 
 4124 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4125   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4126             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4127   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4128   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4129   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4130   ins_encode %{
 4131     assert(UseAVX > 2, "sanity");
 4132     int vlen_enc = vector_length_encoding(this);
 4133     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4134     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: The gather instruction partially updates the opmask register used
    // for predication, hence the mask operand is copied to a temporary first.
 4137     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4138     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4139     __ lea($tmp$$Register, $mem$$Address);
 4140     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4141   %}
 4142   ins_pipe( pipe_slow );
 4143 %}
 4144 
 4145 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4146   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4147   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4148   effect(TEMP tmp, TEMP rtmp);
 4149   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4150   ins_encode %{
 4151     int vlen_enc = vector_length_encoding(this);
 4152     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4153     __ lea($tmp$$Register, $mem$$Address);
 4154     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4155   %}
 4156   ins_pipe( pipe_slow );
 4157 %}
 4158 
 4159 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4160                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4161   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4162   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4163   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4164   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4165   ins_encode %{
 4166     int vlen_enc = vector_length_encoding(this);
 4167     int vector_len = Matcher::vector_length(this);
 4168     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4169     __ lea($tmp$$Register, $mem$$Address);
 4170     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4171     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4172                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4173   %}
 4174   ins_pipe( pipe_slow );
 4175 %}
 4176 
 4177 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4178   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4179   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4180   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4181   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4182   ins_encode %{
 4183     int vlen_enc = vector_length_encoding(this);
 4184     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4185     __ lea($tmp$$Register, $mem$$Address);
 4186     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4187   %}
 4188   ins_pipe( pipe_slow );
 4189 %}
 4190 
 4191 
 4192 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4193                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4194   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4195   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4196   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4197   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4198   ins_encode %{
 4199     int vlen_enc = vector_length_encoding(this);
 4200     int vector_len = Matcher::vector_length(this);
 4201     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4202     __ lea($tmp$$Register, $mem$$Address);
 4203     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4204     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4205                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4206   %}
 4207   ins_pipe( pipe_slow );
 4208 %}
 4209 
 4210 
 4211 #ifdef _LP64
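// Masked subword gathers. The AVX-512BW forms read the mask from an opmask register;
// the AVX2 forms first turn the vector mask into a bit mask with vpmovmskb, compressed
// to one bit per element with pext for shorts.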
 4212 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4213   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4214   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4215   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4216   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4217   ins_encode %{
 4218     int vlen_enc = vector_length_encoding(this);
 4219     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4220     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4221     __ lea($tmp$$Register, $mem$$Address);
 4222     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4223     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4224   %}
 4225   ins_pipe( pipe_slow );
 4226 %}
 4227 
 4228 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4229                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4230   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4231   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4232   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4233   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4234   ins_encode %{
 4235     int vlen_enc = vector_length_encoding(this);
 4236     int vector_len = Matcher::vector_length(this);
 4237     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4238     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4239     __ lea($tmp$$Register, $mem$$Address);
 4240     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4241     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4242     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4243                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4244   %}
 4245   ins_pipe( pipe_slow );
 4246 %}
 4247 
 4248 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4249   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4250   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4251   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4252   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4253   ins_encode %{
 4254     int vlen_enc = vector_length_encoding(this);
 4255     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4256     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4257     __ lea($tmp$$Register, $mem$$Address);
 4258     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4259     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4260                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4261   %}
 4262   ins_pipe( pipe_slow );
 4263 %}
 4264 
 4265 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4266                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4267   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4268   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4269   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4270   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4271   ins_encode %{
 4272     int vlen_enc = vector_length_encoding(this);
 4273     int vector_len = Matcher::vector_length(this);
 4274     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4275     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4276     __ lea($tmp$$Register, $mem$$Address);
 4277     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4278     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4279     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4280                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4281   %}
 4282   ins_pipe( pipe_slow );
 4283 %}
 4284 
 4285 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4286   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4287   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4288   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4289   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4290   ins_encode %{
 4291     int vlen_enc = vector_length_encoding(this);
 4292     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4293     __ lea($tmp$$Register, $mem$$Address);
 4294     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4295     if (elem_bt == T_SHORT) {
 4296       __ movl($mask_idx$$Register, 0x55555555);
 4297       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4298     }
 4299     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4300     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4301   %}
 4302   ins_pipe( pipe_slow );
 4303 %}
 4304 
 4305 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4306                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4307   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4308   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4309   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4310   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4311   ins_encode %{
 4312     int vlen_enc = vector_length_encoding(this);
 4313     int vector_len = Matcher::vector_length(this);
 4314     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4315     __ lea($tmp$$Register, $mem$$Address);
 4316     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4317     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4318     if (elem_bt == T_SHORT) {
 4319       __ movl($mask_idx$$Register, 0x55555555);
 4320       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4321     }
 4322     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4323     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4324                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4325   %}
 4326   ins_pipe( pipe_slow );
 4327 %}
 4328 
 4329 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4330   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4331   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4332   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4333   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4334   ins_encode %{
 4335     int vlen_enc = vector_length_encoding(this);
 4336     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4337     __ lea($tmp$$Register, $mem$$Address);
 4338     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4339     if (elem_bt == T_SHORT) {
 4340       __ movl($mask_idx$$Register, 0x55555555);
 4341       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4342     }
 4343     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4344     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4345                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4346   %}
 4347   ins_pipe( pipe_slow );
 4348 %}
 4349 
 4350 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4351                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4352   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4353   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4354   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4355   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4356   ins_encode %{
 4357     int vlen_enc = vector_length_encoding(this);
 4358     int vector_len = Matcher::vector_length(this);
 4359     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4360     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4361     __ lea($tmp$$Register, $mem$$Address);
 4362     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4363     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4364     if (elem_bt == T_SHORT) {
 4365       __ movl($mask_idx$$Register, 0x55555555);
 4366       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4367     }
 4368     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4369     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4370                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4371   %}
 4372   ins_pipe( pipe_slow );
 4373 %}
 4374 #endif
 4375 
 4376 // ====================Scatter=======================================
 4377 
 4378 // Scatter INT, LONG, FLOAT, DOUBLE
 4379 
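// Scatters require AVX-512. The unmasked form scatters under an all-ones opmask; the
// masked form copies the supplied opmask to a temporary because evscatter partially
// updates the mask it is given.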
 4380 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4381   predicate(UseAVX > 2);
 4382   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4383   effect(TEMP tmp, TEMP ktmp);
 4384   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
 4385   ins_encode %{
 4386     int vlen_enc = vector_length_encoding(this, $src);
 4387     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4388 
 4389     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4390     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4391 
 4392     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4393     __ lea($tmp$$Register, $mem$$Address);
 4394     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4395   %}
 4396   ins_pipe( pipe_slow );
 4397 %}
 4398 
 4399 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4400   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4401   effect(TEMP tmp, TEMP ktmp);
 4402   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4403   ins_encode %{
 4404     int vlen_enc = vector_length_encoding(this, $src);
 4405     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4406     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4407     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: The scatter instruction partially updates the opmask register used
    // for predication, hence the mask operand is copied to a temporary first.
 4410     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4411     __ lea($tmp$$Register, $mem$$Address);
 4412     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4413   %}
 4414   ins_pipe( pipe_slow );
 4415 %}
 4416 
 4417 // ====================REPLICATE=======================================
 4418 
 4419 // Replicate byte scalar to be vector
 4420 instruct vReplB_reg(vec dst, rRegI src) %{
 4421   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4422   match(Set dst (Replicate src));
 4423   format %{ "replicateB $dst,$src" %}
 4424   ins_encode %{
 4425     uint vlen = Matcher::vector_length(this);
 4426     if (UseAVX >= 2) {
 4427       int vlen_enc = vector_length_encoding(this);
 4428       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4429         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4430         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4431       } else {
 4432         __ movdl($dst$$XMMRegister, $src$$Register);
 4433         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4434       }
 4435     } else {
      assert(UseAVX < 2, "");
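      // SSE fallback: duplicate the byte into the low word, splat that word across the
      // low 64 bits, then widen to 128 bits for 16-byte vectors.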
 4437       __ movdl($dst$$XMMRegister, $src$$Register);
 4438       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4439       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4440       if (vlen >= 16) {
 4441         assert(vlen == 16, "");
 4442         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4443       }
 4444     }
 4445   %}
 4446   ins_pipe( pipe_slow );
 4447 %}
 4448 
 4449 instruct ReplB_mem(vec dst, memory mem) %{
 4450   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4451   match(Set dst (Replicate (LoadB mem)));
 4452   format %{ "replicateB $dst,$mem" %}
 4453   ins_encode %{
 4454     int vlen_enc = vector_length_encoding(this);
 4455     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4456   %}
 4457   ins_pipe( pipe_slow );
 4458 %}
 4459 
 4460 // ====================ReplicateS=======================================
 4461 
 4462 instruct vReplS_reg(vec dst, rRegI src) %{
 4463   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4464   match(Set dst (Replicate src));
 4465   format %{ "replicateS $dst,$src" %}
 4466   ins_encode %{
 4467     uint vlen = Matcher::vector_length(this);
 4468     int vlen_enc = vector_length_encoding(this);
 4469     if (UseAVX >= 2) {
 4470       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4471         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4472         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4473       } else {
 4474         __ movdl($dst$$XMMRegister, $src$$Register);
 4475         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4476       }
 4477     } else {
 4478       assert(UseAVX < 2, "");
 4479       __ movdl($dst$$XMMRegister, $src$$Register);
 4480       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4481       if (vlen >= 8) {
 4482         assert(vlen == 8, "");
 4483         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4484       }
 4485     }
 4486   %}
 4487   ins_pipe( pipe_slow );
 4488 %}
 4489 
 4490 instruct ReplS_mem(vec dst, memory mem) %{
 4491   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4492   match(Set dst (Replicate (LoadS mem)));
 4493   format %{ "replicateS $dst,$mem" %}
 4494   ins_encode %{
 4495     int vlen_enc = vector_length_encoding(this);
 4496     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4497   %}
 4498   ins_pipe( pipe_slow );
 4499 %}
 4500 
 4501 // ====================ReplicateI=======================================
 4502 
 4503 instruct ReplI_reg(vec dst, rRegI src) %{
 4504   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4505   match(Set dst (Replicate src));
 4506   format %{ "replicateI $dst,$src" %}
 4507   ins_encode %{
 4508     uint vlen = Matcher::vector_length(this);
 4509     int vlen_enc = vector_length_encoding(this);
 4510     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4511       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4512     } else if (VM_Version::supports_avx2()) {
 4513       __ movdl($dst$$XMMRegister, $src$$Register);
 4514       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4515     } else {
 4516       __ movdl($dst$$XMMRegister, $src$$Register);
 4517       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4518     }
 4519   %}
 4520   ins_pipe( pipe_slow );
 4521 %}
 4522 
 4523 instruct ReplI_mem(vec dst, memory mem) %{
 4524   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4525   match(Set dst (Replicate (LoadI mem)));
 4526   format %{ "replicateI $dst,$mem" %}
 4527   ins_encode %{
 4528     int vlen_enc = vector_length_encoding(this);
 4529     if (VM_Version::supports_avx2()) {
 4530       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4531     } else if (VM_Version::supports_avx()) {
 4532       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4533     } else {
 4534       __ movdl($dst$$XMMRegister, $mem$$Address);
 4535       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4536     }
 4537   %}
 4538   ins_pipe( pipe_slow );
 4539 %}
 4540 
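// Replicate integer scalar immediate (byte/short/int) to be vector by loading from const table.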
 4541 instruct ReplI_imm(vec dst, immI con) %{
 4542   predicate(Matcher::is_non_long_integral_vector(n));
 4543   match(Set dst (Replicate con));
 4544   format %{ "replicateI $dst,$con" %}
 4545   ins_encode %{
 4546     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4547         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4548             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4549                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4550     BasicType bt = Matcher::vector_element_basic_type(this);
 4551     int vlen = Matcher::vector_length_in_bytes(this);
 4552     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4553   %}
 4554   ins_pipe( pipe_slow );
 4555 %}
 4556 
 4557 // Replicate scalar zero to be vector
 4558 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4559   predicate(Matcher::is_non_long_integral_vector(n));
 4560   match(Set dst (Replicate zero));
 4561   format %{ "replicateI $dst,$zero" %}
 4562   ins_encode %{
 4563     int vlen_enc = vector_length_encoding(this);
 4564     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4565       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4566     } else {
 4567       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4568     }
 4569   %}
 4570   ins_pipe( fpu_reg_reg );
 4571 %}
 4572 
 4573 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4574   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4575   match(Set dst (Replicate con));
 4576   format %{ "vallones $dst" %}
 4577   ins_encode %{
 4578     int vector_len = vector_length_encoding(this);
 4579     __ vallones($dst$$XMMRegister, vector_len);
 4580   %}
 4581   ins_pipe( pipe_slow );
 4582 %}
 4583 
 4584 // ====================ReplicateL=======================================
 4585 
 4586 #ifdef _LP64
 4587 // Replicate long (8 byte) scalar to be vector
 4588 instruct ReplL_reg(vec dst, rRegL src) %{
 4589   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4590   match(Set dst (Replicate src));
 4591   format %{ "replicateL $dst,$src" %}
 4592   ins_encode %{
 4593     int vlen = Matcher::vector_length(this);
 4594     int vlen_enc = vector_length_encoding(this);
 4595     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4596       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4597     } else if (VM_Version::supports_avx2()) {
 4598       __ movdq($dst$$XMMRegister, $src$$Register);
 4599       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4600     } else {
 4601       __ movdq($dst$$XMMRegister, $src$$Register);
 4602       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4603     }
 4604   %}
 4605   ins_pipe( pipe_slow );
 4606 %}
 4607 #else // _LP64
 4608 // Replicate long (8 byte) scalar to be vector
 4609 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4610   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4611   match(Set dst (Replicate src));
 4612   effect(TEMP dst, USE src, TEMP tmp);
 4613   format %{ "replicateL $dst,$src" %}
 4614   ins_encode %{
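    // On 32-bit, the long lives in a register pair: merge the low and high halves into
    // the low 64 bits of dst with punpckldq, then broadcast that quadword.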
 4615     uint vlen = Matcher::vector_length(this);
 4616     if (vlen == 2) {
 4617       __ movdl($dst$$XMMRegister, $src$$Register);
 4618       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4619       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4620       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4621     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4622       int vlen_enc = Assembler::AVX_256bit;
 4623       __ movdl($dst$$XMMRegister, $src$$Register);
 4624       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4625       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4626       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4627     } else {
 4628       __ movdl($dst$$XMMRegister, $src$$Register);
 4629       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4630       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4631       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4632       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4633     }
 4634   %}
 4635   ins_pipe( pipe_slow );
 4636 %}
 4637 
 4638 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4639   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4640   match(Set dst (Replicate src));
 4641   effect(TEMP dst, USE src, TEMP tmp);
 4642   format %{ "replicateL $dst,$src" %}
 4643   ins_encode %{
 4644     if (VM_Version::supports_avx512vl()) {
 4645       __ movdl($dst$$XMMRegister, $src$$Register);
 4646       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4647       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4648       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4649       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4650       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4651     } else {
 4652       int vlen_enc = Assembler::AVX_512bit;
 4653       __ movdl($dst$$XMMRegister, $src$$Register);
 4654       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4655       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4656       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4657     }
 4658   %}
 4659   ins_pipe( pipe_slow );
 4660 %}
 4661 #endif // _LP64
 4662 
 4663 instruct ReplL_mem(vec dst, memory mem) %{
 4664   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4665   match(Set dst (Replicate (LoadL mem)));
 4666   format %{ "replicateL $dst,$mem" %}
 4667   ins_encode %{
 4668     int vlen_enc = vector_length_encoding(this);
 4669     if (VM_Version::supports_avx2()) {
 4670       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4671     } else if (VM_Version::supports_sse3()) {
 4672       __ movddup($dst$$XMMRegister, $mem$$Address);
 4673     } else {
 4674       __ movq($dst$$XMMRegister, $mem$$Address);
 4675       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4676     }
 4677   %}
 4678   ins_pipe( pipe_slow );
 4679 %}
 4680 
 4681 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4682 instruct ReplL_imm(vec dst, immL con) %{
 4683   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4684   match(Set dst (Replicate con));
 4685   format %{ "replicateL $dst,$con" %}
 4686   ins_encode %{
 4687     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4688     int vlen = Matcher::vector_length_in_bytes(this);
 4689     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4690   %}
 4691   ins_pipe( pipe_slow );
 4692 %}
 4693 
 4694 instruct ReplL_zero(vec dst, immL0 zero) %{
 4695   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4696   match(Set dst (Replicate zero));
 4697   format %{ "replicateL $dst,$zero" %}
 4698   ins_encode %{
 4699     int vlen_enc = vector_length_encoding(this);
 4700     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4701       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4702     } else {
 4703       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4704     }
 4705   %}
 4706   ins_pipe( fpu_reg_reg );
 4707 %}
 4708 
 4709 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4710   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4711   match(Set dst (Replicate con));
 4712   format %{ "vallones $dst" %}
 4713   ins_encode %{
 4714     int vector_len = vector_length_encoding(this);
 4715     __ vallones($dst$$XMMRegister, vector_len);
 4716   %}
 4717   ins_pipe( pipe_slow );
 4718 %}
 4719 
 4720 // ====================ReplicateF=======================================
 4721 
 4722 instruct vReplF_reg(vec dst, vlRegF src) %{
 4723   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4724   match(Set dst (Replicate src));
 4725   format %{ "replicateF $dst,$src" %}
 4726   ins_encode %{
 4727     uint vlen = Matcher::vector_length(this);
 4728     int vlen_enc = vector_length_encoding(this);
 4729     if (vlen <= 4) {
 4730       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4731     } else if (VM_Version::supports_avx2()) {
 4732       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4733     } else {
 4734       assert(vlen == 8, "sanity");
 4735       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4736       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4737     }
 4738   %}
 4739   ins_pipe( pipe_slow );
 4740 %}
 4741 
 4742 instruct ReplF_reg(vec dst, vlRegF src) %{
 4743   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4744   match(Set dst (Replicate src));
 4745   format %{ "replicateF $dst,$src" %}
 4746   ins_encode %{
 4747     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4748   %}
 4749   ins_pipe( pipe_slow );
 4750 %}
 4751 
 4752 instruct ReplF_mem(vec dst, memory mem) %{
 4753   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4754   match(Set dst (Replicate (LoadF mem)));
 4755   format %{ "replicateF $dst,$mem" %}
 4756   ins_encode %{
 4757     int vlen_enc = vector_length_encoding(this);
 4758     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4759   %}
 4760   ins_pipe( pipe_slow );
 4761 %}
 4762 
 4763 // Replicate float scalar immediate to be vector by loading from const table.
 4764 instruct ReplF_imm(vec dst, immF con) %{
 4765   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4766   match(Set dst (Replicate con));
 4767   format %{ "replicateF $dst,$con" %}
 4768   ins_encode %{
 4769     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4770         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4771     int vlen = Matcher::vector_length_in_bytes(this);
 4772     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4773   %}
 4774   ins_pipe( pipe_slow );
 4775 %}
 4776 
 4777 instruct ReplF_zero(vec dst, immF0 zero) %{
 4778   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4779   match(Set dst (Replicate zero));
 4780   format %{ "replicateF $dst,$zero" %}
 4781   ins_encode %{
 4782     int vlen_enc = vector_length_encoding(this);
 4783     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4784       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4785     } else {
 4786       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4787     }
 4788   %}
 4789   ins_pipe( fpu_reg_reg );
 4790 %}
 4791 
 4792 // ====================ReplicateD=======================================
 4793 
 4794 // Replicate double (8 bytes) scalar to be vector
 4795 instruct vReplD_reg(vec dst, vlRegD src) %{
 4796   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4797   match(Set dst (Replicate src));
 4798   format %{ "replicateD $dst,$src" %}
 4799   ins_encode %{
 4800     uint vlen = Matcher::vector_length(this);
 4801     int vlen_enc = vector_length_encoding(this);
 4802     if (vlen <= 2) {
 4803       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4804     } else if (VM_Version::supports_avx2()) {
 4805       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4806     } else {
 4807       assert(vlen == 4, "sanity");
 4808       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4809       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4810     }
 4811   %}
 4812   ins_pipe( pipe_slow );
 4813 %}
 4814 
 4815 instruct ReplD_reg(vec dst, vlRegD src) %{
 4816   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4817   match(Set dst (Replicate src));
 4818   format %{ "replicateD $dst,$src" %}
 4819   ins_encode %{
 4820     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4821   %}
 4822   ins_pipe( pipe_slow );
 4823 %}
 4824 
 4825 instruct ReplD_mem(vec dst, memory mem) %{
 4826   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4827   match(Set dst (Replicate (LoadD mem)));
 4828   format %{ "replicateD $dst,$mem" %}
 4829   ins_encode %{
 4830     if (Matcher::vector_length(this) >= 4) {
 4831       int vlen_enc = vector_length_encoding(this);
 4832       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4833     } else {
 4834       __ movddup($dst$$XMMRegister, $mem$$Address);
 4835     }
 4836   %}
 4837   ins_pipe( pipe_slow );
 4838 %}
 4839 
 4840 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4841 instruct ReplD_imm(vec dst, immD con) %{
 4842   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4843   match(Set dst (Replicate con));
 4844   format %{ "replicateD $dst,$con" %}
 4845   ins_encode %{
 4846     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4847     int vlen = Matcher::vector_length_in_bytes(this);
 4848     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4849   %}
 4850   ins_pipe( pipe_slow );
 4851 %}
 4852 
 4853 instruct ReplD_zero(vec dst, immD0 zero) %{
 4854   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4855   match(Set dst (Replicate zero));
 4856   format %{ "replicateD $dst,$zero" %}
 4857   ins_encode %{
 4858     int vlen_enc = vector_length_encoding(this);
 4859     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4860       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4861     } else {
 4862       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4863     }
 4864   %}
 4865   ins_pipe( fpu_reg_reg );
 4866 %}
 4867 
 4868 // ====================VECTOR INSERT=======================================
 4869 
 4870 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4871   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4872   match(Set dst (VectorInsert (Binary dst val) idx));
 4873   format %{ "vector_insert $dst,$val,$idx" %}
 4874   ins_encode %{
 4875     assert(UseSSE >= 4, "required");
 4876     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4877 
 4878     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4879 
 4880     assert(is_integral_type(elem_bt), "");
 4881     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4882 
 4883     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4884   %}
 4885   ins_pipe( pipe_slow );
 4886 %}
 4887 
 4888 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4889   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4890   match(Set dst (VectorInsert (Binary src val) idx));
 4891   effect(TEMP vtmp);
 4892   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4893   ins_encode %{
 4894     int vlen_enc = Assembler::AVX_256bit;
 4895     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4896     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4897     int log2epr = log2(elem_per_lane);
 4898 
 4899     assert(is_integral_type(elem_bt), "sanity");
 4900     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4901 
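    // Treat the 256-bit vector as two 128-bit lanes: y_idx selects the lane, x_idx the
    // element within that lane.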
 4902     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4903     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4904     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4905     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4906     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4907   %}
 4908   ins_pipe( pipe_slow );
 4909 %}
 4910 
 4911 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4912   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4913   match(Set dst (VectorInsert (Binary src val) idx));
 4914   effect(TEMP vtmp);
 4915   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4916   ins_encode %{
 4917     assert(UseAVX > 2, "sanity");
 4918 
 4919     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4920     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4921     int log2epr = log2(elem_per_lane);
 4922 
 4923     assert(is_integral_type(elem_bt), "");
 4924     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4925 
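    // Treat the 512-bit vector as four 128-bit lanes: y_idx selects the lane, x_idx the
    // element within that lane.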
 4926     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4927     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4928     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4929     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4930     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4931   %}
 4932   ins_pipe( pipe_slow );
 4933 %}
 4934 
 4935 #ifdef _LP64
 4936 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4937   predicate(Matcher::vector_length(n) == 2);
 4938   match(Set dst (VectorInsert (Binary dst val) idx));
 4939   format %{ "vector_insert $dst,$val,$idx" %}
 4940   ins_encode %{
 4941     assert(UseSSE >= 4, "required");
 4942     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4943     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4944 
 4945     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4946   %}
 4947   ins_pipe( pipe_slow );
 4948 %}
 4949 
 4950 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4951   predicate(Matcher::vector_length(n) == 4);
 4952   match(Set dst (VectorInsert (Binary src val) idx));
 4953   effect(TEMP vtmp);
 4954   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4955   ins_encode %{
 4956     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4957     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4958 
 4959     uint x_idx = $idx$$constant & right_n_bits(1);
 4960     uint y_idx = ($idx$$constant >> 1) & 1;
 4961     int vlen_enc = Assembler::AVX_256bit;
 4962     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4963     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4964     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4965   %}
 4966   ins_pipe( pipe_slow );
 4967 %}
 4968 
 4969 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4970   predicate(Matcher::vector_length(n) == 8);
 4971   match(Set dst (VectorInsert (Binary src val) idx));
 4972   effect(TEMP vtmp);
 4973   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4974   ins_encode %{
 4975     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4976     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4977 
 4978     uint x_idx = $idx$$constant & right_n_bits(1);
 4979     uint y_idx = ($idx$$constant >> 1) & 3;
 4980     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4981     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4982     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4983   %}
 4984   ins_pipe( pipe_slow );
 4985 %}
 4986 #endif
 4987 
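// The float insert rules below use insertps/vinsertps; bits 5:4 of the immediate
// select the destination element, which is why the element index is shifted left
// by four (x_idx << 4). The source-element and zero-mask immediate bits stay 0.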
 4988 instruct insertF(vec dst, regF val, immU8 idx) %{
 4989   predicate(Matcher::vector_length(n) < 8);
 4990   match(Set dst (VectorInsert (Binary dst val) idx));
 4991   format %{ "vector_insert $dst,$val,$idx" %}
 4992   ins_encode %{
 4993     assert(UseSSE >= 4, "sanity");
 4994 
 4995     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4996     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4997 
 4998     uint x_idx = $idx$$constant & right_n_bits(2);
 4999     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5000   %}
 5001   ins_pipe( pipe_slow );
 5002 %}
 5003 
 5004 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 5005   predicate(Matcher::vector_length(n) >= 8);
 5006   match(Set dst (VectorInsert (Binary src val) idx));
 5007   effect(TEMP vtmp);
 5008   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5009   ins_encode %{
 5010     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5011     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5012 
 5013     int vlen = Matcher::vector_length(this);
 5014     uint x_idx = $idx$$constant & right_n_bits(2);
 5015     if (vlen == 8) {
 5016       uint y_idx = ($idx$$constant >> 2) & 1;
 5018       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5019       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5020       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5021     } else {
 5022       assert(vlen == 16, "sanity");
 5023       uint y_idx = ($idx$$constant >> 2) & 3;
 5024       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5025       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5026       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5027     }
 5028   %}
 5029   ins_pipe( pipe_slow );
 5030 %}
 5031 
 5032 #ifdef _LP64
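// There is no element-wise insert that takes a double directly from an XMM source,
// so the rules below move the bit pattern of $val into a 64-bit GPR with movq and
// then use pinsrq/vpinsrq; the GPR temp is why these rules are LP64-only.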
 5033 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 5034   predicate(Matcher::vector_length(n) == 2);
 5035   match(Set dst (VectorInsert (Binary dst val) idx));
 5036   effect(TEMP tmp);
 5037   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 5038   ins_encode %{
 5039     assert(UseSSE >= 4, "sanity");
 5040     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5041     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5042 
 5043     __ movq($tmp$$Register, $val$$XMMRegister);
 5044     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 5045   %}
 5046   ins_pipe( pipe_slow );
 5047 %}
 5048 
 5049 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5050   predicate(Matcher::vector_length(n) == 4);
 5051   match(Set dst (VectorInsert (Binary src val) idx));
 5052   effect(TEMP vtmp, TEMP tmp);
 5053   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5054   ins_encode %{
 5055     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5056     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5057 
 5058     uint x_idx = $idx$$constant & right_n_bits(1);
 5059     uint y_idx = ($idx$$constant >> 1) & 1;
 5061     __ movq($tmp$$Register, $val$$XMMRegister);
 5062     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5063     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5064     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5065   %}
 5066   ins_pipe( pipe_slow );
 5067 %}
 5068 
  5069 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 5070   predicate(Matcher::vector_length(n) == 8);
 5071   match(Set dst (VectorInsert (Binary src val) idx));
 5072   effect(TEMP tmp, TEMP vtmp);
  5073   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5074   ins_encode %{
 5075     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5076     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5077 
 5078     uint x_idx = $idx$$constant & right_n_bits(1);
 5079     uint y_idx = ($idx$$constant >> 1) & 3;
 5080     __ movq($tmp$$Register, $val$$XMMRegister);
 5081     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5082     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5083     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5084   %}
 5085   ins_pipe( pipe_slow );
 5086 %}
 5087 #endif
 5088 
 5089 // ====================REDUCTION ARITHMETIC=======================================
 5090 
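// A reduction folds the scalar input src1 together with every lane of the vector
// src2 under the node's operation; e.g. AddReductionVI on a 4-lane vector computes
//   dst = src1 + src2[0] + src2[1] + src2[2] + src2[3]
// and the mul/and/or/xor/min/max variants combine the lanes the same way.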
 5091 // =======================Int Reduction==========================================
 5092 
 5093 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 5095   match(Set dst (AddReductionVI src1 src2));
 5096   match(Set dst (MulReductionVI src1 src2));
 5097   match(Set dst (AndReductionV  src1 src2));
 5098   match(Set dst ( OrReductionV  src1 src2));
 5099   match(Set dst (XorReductionV  src1 src2));
 5100   match(Set dst (MinReductionV  src1 src2));
 5101   match(Set dst (MaxReductionV  src1 src2));
 5102   effect(TEMP vtmp1, TEMP vtmp2);
 5103   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5104   ins_encode %{
 5105     int opcode = this->ideal_Opcode();
 5106     int vlen = Matcher::vector_length(this, $src2);
 5107     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5108   %}
 5109   ins_pipe( pipe_slow );
 5110 %}
 5111 
 5112 // =======================Long Reduction==========================================
 5113 
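// Two flavors are kept: without AVX512DQ the operands are legVec, i.e. confined to
// the legacy XMM0-XMM15 registers, while the avx512dq flavor may use the full EVEX
// register file (plain vec operands), presumably because the fallback lowering uses
// instructions that cannot be encoded for the upper registers.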
 5114 #ifdef _LP64
 5115 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5116   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5117   match(Set dst (AddReductionVL src1 src2));
 5118   match(Set dst (MulReductionVL src1 src2));
 5119   match(Set dst (AndReductionV  src1 src2));
 5120   match(Set dst ( OrReductionV  src1 src2));
 5121   match(Set dst (XorReductionV  src1 src2));
 5122   match(Set dst (MinReductionV  src1 src2));
 5123   match(Set dst (MaxReductionV  src1 src2));
 5124   effect(TEMP vtmp1, TEMP vtmp2);
 5125   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5126   ins_encode %{
 5127     int opcode = this->ideal_Opcode();
 5128     int vlen = Matcher::vector_length(this, $src2);
 5129     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5130   %}
 5131   ins_pipe( pipe_slow );
 5132 %}
 5133 
 5134 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5135   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5136   match(Set dst (AddReductionVL src1 src2));
 5137   match(Set dst (MulReductionVL src1 src2));
 5138   match(Set dst (AndReductionV  src1 src2));
 5139   match(Set dst ( OrReductionV  src1 src2));
 5140   match(Set dst (XorReductionV  src1 src2));
 5141   match(Set dst (MinReductionV  src1 src2));
 5142   match(Set dst (MaxReductionV  src1 src2));
 5143   effect(TEMP vtmp1, TEMP vtmp2);
 5144   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5145   ins_encode %{
 5146     int opcode = this->ideal_Opcode();
 5147     int vlen = Matcher::vector_length(this, $src2);
 5148     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5149   %}
 5150   ins_pipe( pipe_slow );
 5151 %}
 5152 #endif // _LP64
 5153 
 5154 // =======================Float Reduction==========================================
 5155 
 5156 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5157   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5158   match(Set dst (AddReductionVF dst src));
 5159   match(Set dst (MulReductionVF dst src));
 5160   effect(TEMP dst, TEMP vtmp);
 5161   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5162   ins_encode %{
 5163     int opcode = this->ideal_Opcode();
 5164     int vlen = Matcher::vector_length(this, $src);
 5165     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5166   %}
 5167   ins_pipe( pipe_slow );
 5168 %}
 5169 
 5170 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5171   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5172   match(Set dst (AddReductionVF dst src));
 5173   match(Set dst (MulReductionVF dst src));
 5174   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5175   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5176   ins_encode %{
 5177     int opcode = this->ideal_Opcode();
 5178     int vlen = Matcher::vector_length(this, $src);
 5179     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5180   %}
 5181   ins_pipe( pipe_slow );
 5182 %}
 5183 
 5184 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5185   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5186   match(Set dst (AddReductionVF dst src));
 5187   match(Set dst (MulReductionVF dst src));
 5188   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5189   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5190   ins_encode %{
 5191     int opcode = this->ideal_Opcode();
 5192     int vlen = Matcher::vector_length(this, $src);
 5193     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5194   %}
 5195   ins_pipe( pipe_slow );
 5196 %}
 5197 
 5198 
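// The unordered variants below take the reduction identity in src1 and are free to
// reassociate the lanes, e.g. (v[0] + v[1]) + (v[2] + v[3]) rather than the strictly
// ordered (((acc + v[0]) + v[1]) + v[2]) + v[3] used above; for floating point the
// two orders may round differently, which is why these rules are reserved for the
// VectorAPI, which explicitly permits the relaxed ordering.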
 5199 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5200   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5201   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5202   // src1 contains reduction identity
 5203   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5204   match(Set dst (AddReductionVF src1 src2));
 5205   match(Set dst (MulReductionVF src1 src2));
 5206   effect(TEMP dst);
 5207   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5208   ins_encode %{
 5209     int opcode = this->ideal_Opcode();
 5210     int vlen = Matcher::vector_length(this, $src2);
 5211     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5212   %}
 5213   ins_pipe( pipe_slow );
 5214 %}
 5215 
 5216 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5217   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5218   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5219   // src1 contains reduction identity
 5220   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5221   match(Set dst (AddReductionVF src1 src2));
 5222   match(Set dst (MulReductionVF src1 src2));
 5223   effect(TEMP dst, TEMP vtmp);
 5224   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5225   ins_encode %{
 5226     int opcode = this->ideal_Opcode();
 5227     int vlen = Matcher::vector_length(this, $src2);
 5228     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5229   %}
 5230   ins_pipe( pipe_slow );
 5231 %}
 5232 
 5233 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5234   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5235   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5236   // src1 contains reduction identity
 5237   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5238   match(Set dst (AddReductionVF src1 src2));
 5239   match(Set dst (MulReductionVF src1 src2));
 5240   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5241   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5242   ins_encode %{
 5243     int opcode = this->ideal_Opcode();
 5244     int vlen = Matcher::vector_length(this, $src2);
 5245     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5246   %}
 5247   ins_pipe( pipe_slow );
 5248 %}
 5249 
 5250 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5251   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5252   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5253   // src1 contains reduction identity
 5254   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5255   match(Set dst (AddReductionVF src1 src2));
 5256   match(Set dst (MulReductionVF src1 src2));
 5257   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5258   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5259   ins_encode %{
 5260     int opcode = this->ideal_Opcode();
 5261     int vlen = Matcher::vector_length(this, $src2);
 5262     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5263   %}
 5264   ins_pipe( pipe_slow );
 5265 %}
 5266 
 5267 // =======================Double Reduction==========================================
 5268 
 5269 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5270   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5271   match(Set dst (AddReductionVD dst src));
 5272   match(Set dst (MulReductionVD dst src));
 5273   effect(TEMP dst, TEMP vtmp);
 5274   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5275   ins_encode %{
 5276     int opcode = this->ideal_Opcode();
 5277     int vlen = Matcher::vector_length(this, $src);
 5278     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  5279   %}
 5280   ins_pipe( pipe_slow );
 5281 %}
 5282 
 5283 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5284   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5285   match(Set dst (AddReductionVD dst src));
 5286   match(Set dst (MulReductionVD dst src));
 5287   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5288   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5289   ins_encode %{
 5290     int opcode = this->ideal_Opcode();
 5291     int vlen = Matcher::vector_length(this, $src);
 5292     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5293   %}
 5294   ins_pipe( pipe_slow );
 5295 %}
 5296 
 5297 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5298   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5299   match(Set dst (AddReductionVD dst src));
 5300   match(Set dst (MulReductionVD dst src));
 5301   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5302   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5303   ins_encode %{
 5304     int opcode = this->ideal_Opcode();
 5305     int vlen = Matcher::vector_length(this, $src);
 5306     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5307   %}
 5308   ins_pipe( pipe_slow );
 5309 %}
 5310 
 5311 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5312   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5313   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5314   // src1 contains reduction identity
 5315   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5316   match(Set dst (AddReductionVD src1 src2));
 5317   match(Set dst (MulReductionVD src1 src2));
 5318   effect(TEMP dst);
 5319   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5320   ins_encode %{
 5321     int opcode = this->ideal_Opcode();
 5322     int vlen = Matcher::vector_length(this, $src2);
 5323     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
  5324   %}
 5325   ins_pipe( pipe_slow );
 5326 %}
 5327 
 5328 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5329   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5330   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5331   // src1 contains reduction identity
 5332   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5333   match(Set dst (AddReductionVD src1 src2));
 5334   match(Set dst (MulReductionVD src1 src2));
 5335   effect(TEMP dst, TEMP vtmp);
 5336   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5337   ins_encode %{
 5338     int opcode = this->ideal_Opcode();
 5339     int vlen = Matcher::vector_length(this, $src2);
 5340     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5341   %}
 5342   ins_pipe( pipe_slow );
 5343 %}
 5344 
 5345 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5346   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5347   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5348   // src1 contains reduction identity
 5349   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5350   match(Set dst (AddReductionVD src1 src2));
 5351   match(Set dst (MulReductionVD src1 src2));
 5352   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5353   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5354   ins_encode %{
 5355     int opcode = this->ideal_Opcode();
 5356     int vlen = Matcher::vector_length(this, $src2);
 5357     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5358   %}
 5359   ins_pipe( pipe_slow );
 5360 %}
 5361 
 5362 // =======================Byte Reduction==========================================
 5363 
 5364 #ifdef _LP64
 5365 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5366   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5367   match(Set dst (AddReductionVI src1 src2));
 5368   match(Set dst (AndReductionV  src1 src2));
 5369   match(Set dst ( OrReductionV  src1 src2));
 5370   match(Set dst (XorReductionV  src1 src2));
 5371   match(Set dst (MinReductionV  src1 src2));
 5372   match(Set dst (MaxReductionV  src1 src2));
 5373   effect(TEMP vtmp1, TEMP vtmp2);
 5374   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5375   ins_encode %{
 5376     int opcode = this->ideal_Opcode();
 5377     int vlen = Matcher::vector_length(this, $src2);
 5378     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5379   %}
 5380   ins_pipe( pipe_slow );
 5381 %}
 5382 
 5383 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5384   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5385   match(Set dst (AddReductionVI src1 src2));
 5386   match(Set dst (AndReductionV  src1 src2));
 5387   match(Set dst ( OrReductionV  src1 src2));
 5388   match(Set dst (XorReductionV  src1 src2));
 5389   match(Set dst (MinReductionV  src1 src2));
 5390   match(Set dst (MaxReductionV  src1 src2));
 5391   effect(TEMP vtmp1, TEMP vtmp2);
 5392   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5393   ins_encode %{
 5394     int opcode = this->ideal_Opcode();
 5395     int vlen = Matcher::vector_length(this, $src2);
 5396     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5397   %}
 5398   ins_pipe( pipe_slow );
 5399 %}
 5400 #endif
 5401 
 5402 // =======================Short Reduction==========================================
 5403 
 5404 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5405   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5406   match(Set dst (AddReductionVI src1 src2));
 5407   match(Set dst (MulReductionVI src1 src2));
 5408   match(Set dst (AndReductionV  src1 src2));
 5409   match(Set dst ( OrReductionV  src1 src2));
 5410   match(Set dst (XorReductionV  src1 src2));
 5411   match(Set dst (MinReductionV  src1 src2));
 5412   match(Set dst (MaxReductionV  src1 src2));
 5413   effect(TEMP vtmp1, TEMP vtmp2);
 5414   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5415   ins_encode %{
 5416     int opcode = this->ideal_Opcode();
 5417     int vlen = Matcher::vector_length(this, $src2);
 5418     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5419   %}
 5420   ins_pipe( pipe_slow );
 5421 %}
 5422 
 5423 // =======================Mul Reduction==========================================
 5424 
 5425 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5426   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5427             Matcher::vector_length(n->in(2)) <= 32); // src2
 5428   match(Set dst (MulReductionVI src1 src2));
 5429   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5430   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5431   ins_encode %{
 5432     int opcode = this->ideal_Opcode();
 5433     int vlen = Matcher::vector_length(this, $src2);
 5434     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5435   %}
 5436   ins_pipe( pipe_slow );
 5437 %}
 5438 
 5439 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5440   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5441             Matcher::vector_length(n->in(2)) == 64); // src2
 5442   match(Set dst (MulReductionVI src1 src2));
 5443   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5444   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5445   ins_encode %{
 5446     int opcode = this->ideal_Opcode();
 5447     int vlen = Matcher::vector_length(this, $src2);
 5448     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5449   %}
 5450   ins_pipe( pipe_slow );
 5451 %}
 5452 
 5453 //--------------------Min/Max Float Reduction --------------------
  5454 // Float Min/Max Reduction
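// The immF src1 forms require the (ignored) identity input to be +Infinity for a
// min reduction and -Infinity for a max reduction, enforced by the bottom_type()
// checks in the predicates; the *_av forms below instead accumulate into dst.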
 5455 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5456                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5457   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5458             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5459              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5460             Matcher::vector_length(n->in(2)) == 2);
 5461   match(Set dst (MinReductionV src1 src2));
 5462   match(Set dst (MaxReductionV src1 src2));
 5463   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5464   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5465   ins_encode %{
 5466     assert(UseAVX > 0, "sanity");
 5467 
 5468     int opcode = this->ideal_Opcode();
 5469     int vlen = Matcher::vector_length(this, $src2);
 5470     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5471                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5472   %}
 5473   ins_pipe( pipe_slow );
 5474 %}
 5475 
 5476 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5477                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5478   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5479             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5480              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5481             Matcher::vector_length(n->in(2)) >= 4);
 5482   match(Set dst (MinReductionV src1 src2));
 5483   match(Set dst (MaxReductionV src1 src2));
 5484   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5485   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5486   ins_encode %{
 5487     assert(UseAVX > 0, "sanity");
 5488 
 5489     int opcode = this->ideal_Opcode();
 5490     int vlen = Matcher::vector_length(this, $src2);
 5491     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5492                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5493   %}
 5494   ins_pipe( pipe_slow );
 5495 %}
 5496 
 5497 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5498                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5499   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5500             Matcher::vector_length(n->in(2)) == 2);
 5501   match(Set dst (MinReductionV dst src));
 5502   match(Set dst (MaxReductionV dst src));
 5503   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5504   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5505   ins_encode %{
 5506     assert(UseAVX > 0, "sanity");
 5507 
 5508     int opcode = this->ideal_Opcode();
 5509     int vlen = Matcher::vector_length(this, $src);
 5510     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5511                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5512   %}
 5513   ins_pipe( pipe_slow );
 5514 %}
 5515 
 5516 
 5517 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5518                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5519   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5520             Matcher::vector_length(n->in(2)) >= 4);
 5521   match(Set dst (MinReductionV dst src));
 5522   match(Set dst (MaxReductionV dst src));
 5523   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5524   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5525   ins_encode %{
 5526     assert(UseAVX > 0, "sanity");
 5527 
 5528     int opcode = this->ideal_Opcode();
 5529     int vlen = Matcher::vector_length(this, $src);
 5530     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5531                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5532   %}
 5533   ins_pipe( pipe_slow );
 5534 %}
 5535 
 5536 
  5537 //--------------------Min/Max Double Reduction --------------------
 5538 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5539                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5540                             rFlagsReg cr) %{
 5541   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5542             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5543              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5544             Matcher::vector_length(n->in(2)) == 2);
 5545   match(Set dst (MinReductionV src1 src2));
 5546   match(Set dst (MaxReductionV src1 src2));
 5547   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5548   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5549   ins_encode %{
 5550     assert(UseAVX > 0, "sanity");
 5551 
 5552     int opcode = this->ideal_Opcode();
 5553     int vlen = Matcher::vector_length(this, $src2);
 5554     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5555                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5556   %}
 5557   ins_pipe( pipe_slow );
 5558 %}
 5559 
 5560 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5561                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5562                            rFlagsReg cr) %{
 5563   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5564             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5565              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5566             Matcher::vector_length(n->in(2)) >= 4);
 5567   match(Set dst (MinReductionV src1 src2));
 5568   match(Set dst (MaxReductionV src1 src2));
 5569   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5570   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5571   ins_encode %{
 5572     assert(UseAVX > 0, "sanity");
 5573 
 5574     int opcode = this->ideal_Opcode();
 5575     int vlen = Matcher::vector_length(this, $src2);
 5576     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5577                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5578   %}
 5579   ins_pipe( pipe_slow );
 5580 %}
 5581 
 5582 
 5583 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5584                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5585                                rFlagsReg cr) %{
 5586   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5587             Matcher::vector_length(n->in(2)) == 2);
 5588   match(Set dst (MinReductionV dst src));
 5589   match(Set dst (MaxReductionV dst src));
 5590   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5591   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5592   ins_encode %{
 5593     assert(UseAVX > 0, "sanity");
 5594 
 5595     int opcode = this->ideal_Opcode();
 5596     int vlen = Matcher::vector_length(this, $src);
 5597     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5598                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5599   %}
 5600   ins_pipe( pipe_slow );
 5601 %}
 5602 
 5603 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5604                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5605                               rFlagsReg cr) %{
 5606   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5607             Matcher::vector_length(n->in(2)) >= 4);
 5608   match(Set dst (MinReductionV dst src));
 5609   match(Set dst (MaxReductionV dst src));
 5610   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5611   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5612   ins_encode %{
 5613     assert(UseAVX > 0, "sanity");
 5614 
 5615     int opcode = this->ideal_Opcode();
 5616     int vlen = Matcher::vector_length(this, $src);
 5617     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5618                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5619   %}
 5620   ins_pipe( pipe_slow );
 5621 %}
 5622 
 5623 // ====================VECTOR ARITHMETIC=======================================
 5624 
 5625 // --------------------------------- ADD --------------------------------------
 5626 
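// Most element types get three rule flavors: a two-operand SSE form that updates
// dst in place (UseAVX == 0), a three-operand AVX register form, and an AVX form
// that folds a LoadVector so the second source comes straight from memory (only
// for vectors wider than 8 bytes).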
 5627 // Bytes vector add
 5628 instruct vaddB(vec dst, vec src) %{
 5629   predicate(UseAVX == 0);
 5630   match(Set dst (AddVB dst src));
 5631   format %{ "paddb   $dst,$src\t! add packedB" %}
 5632   ins_encode %{
 5633     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5634   %}
 5635   ins_pipe( pipe_slow );
 5636 %}
 5637 
 5638 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5639   predicate(UseAVX > 0);
 5640   match(Set dst (AddVB src1 src2));
 5641   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5642   ins_encode %{
 5643     int vlen_enc = vector_length_encoding(this);
 5644     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5645   %}
 5646   ins_pipe( pipe_slow );
 5647 %}
 5648 
 5649 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5650   predicate((UseAVX > 0) &&
 5651             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5652   match(Set dst (AddVB src (LoadVector mem)));
 5653   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5654   ins_encode %{
 5655     int vlen_enc = vector_length_encoding(this);
 5656     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5657   %}
 5658   ins_pipe( pipe_slow );
 5659 %}
 5660 
 5661 // Shorts/Chars vector add
 5662 instruct vaddS(vec dst, vec src) %{
 5663   predicate(UseAVX == 0);
 5664   match(Set dst (AddVS dst src));
 5665   format %{ "paddw   $dst,$src\t! add packedS" %}
 5666   ins_encode %{
 5667     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5668   %}
 5669   ins_pipe( pipe_slow );
 5670 %}
 5671 
 5672 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5673   predicate(UseAVX > 0);
 5674   match(Set dst (AddVS src1 src2));
 5675   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5676   ins_encode %{
 5677     int vlen_enc = vector_length_encoding(this);
 5678     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5679   %}
 5680   ins_pipe( pipe_slow );
 5681 %}
 5682 
 5683 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5684   predicate((UseAVX > 0) &&
 5685             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5686   match(Set dst (AddVS src (LoadVector mem)));
 5687   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5688   ins_encode %{
 5689     int vlen_enc = vector_length_encoding(this);
 5690     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5691   %}
 5692   ins_pipe( pipe_slow );
 5693 %}
 5694 
 5695 // Integers vector add
 5696 instruct vaddI(vec dst, vec src) %{
 5697   predicate(UseAVX == 0);
 5698   match(Set dst (AddVI dst src));
 5699   format %{ "paddd   $dst,$src\t! add packedI" %}
 5700   ins_encode %{
 5701     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5702   %}
 5703   ins_pipe( pipe_slow );
 5704 %}
 5705 
 5706 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5707   predicate(UseAVX > 0);
 5708   match(Set dst (AddVI src1 src2));
 5709   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5710   ins_encode %{
 5711     int vlen_enc = vector_length_encoding(this);
 5712     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5713   %}
 5714   ins_pipe( pipe_slow );
 5715 %}
 5716 
 5717 
 5718 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5719   predicate((UseAVX > 0) &&
 5720             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5721   match(Set dst (AddVI src (LoadVector mem)));
 5722   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5723   ins_encode %{
 5724     int vlen_enc = vector_length_encoding(this);
 5725     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5726   %}
 5727   ins_pipe( pipe_slow );
 5728 %}
 5729 
 5730 // Longs vector add
 5731 instruct vaddL(vec dst, vec src) %{
 5732   predicate(UseAVX == 0);
 5733   match(Set dst (AddVL dst src));
 5734   format %{ "paddq   $dst,$src\t! add packedL" %}
 5735   ins_encode %{
 5736     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5737   %}
 5738   ins_pipe( pipe_slow );
 5739 %}
 5740 
 5741 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5742   predicate(UseAVX > 0);
 5743   match(Set dst (AddVL src1 src2));
 5744   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5745   ins_encode %{
 5746     int vlen_enc = vector_length_encoding(this);
 5747     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5748   %}
 5749   ins_pipe( pipe_slow );
 5750 %}
 5751 
 5752 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5753   predicate((UseAVX > 0) &&
 5754             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5755   match(Set dst (AddVL src (LoadVector mem)));
 5756   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5757   ins_encode %{
 5758     int vlen_enc = vector_length_encoding(this);
 5759     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5760   %}
 5761   ins_pipe( pipe_slow );
 5762 %}
 5763 
 5764 // Floats vector add
 5765 instruct vaddF(vec dst, vec src) %{
 5766   predicate(UseAVX == 0);
 5767   match(Set dst (AddVF dst src));
 5768   format %{ "addps   $dst,$src\t! add packedF" %}
 5769   ins_encode %{
 5770     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5771   %}
 5772   ins_pipe( pipe_slow );
 5773 %}
 5774 
 5775 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5776   predicate(UseAVX > 0);
 5777   match(Set dst (AddVF src1 src2));
 5778   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5779   ins_encode %{
 5780     int vlen_enc = vector_length_encoding(this);
 5781     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5782   %}
 5783   ins_pipe( pipe_slow );
 5784 %}
 5785 
 5786 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5787   predicate((UseAVX > 0) &&
 5788             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5789   match(Set dst (AddVF src (LoadVector mem)));
 5790   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5791   ins_encode %{
 5792     int vlen_enc = vector_length_encoding(this);
 5793     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5794   %}
 5795   ins_pipe( pipe_slow );
 5796 %}
 5797 
 5798 // Doubles vector add
 5799 instruct vaddD(vec dst, vec src) %{
 5800   predicate(UseAVX == 0);
 5801   match(Set dst (AddVD dst src));
 5802   format %{ "addpd   $dst,$src\t! add packedD" %}
 5803   ins_encode %{
 5804     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5805   %}
 5806   ins_pipe( pipe_slow );
 5807 %}
 5808 
 5809 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5810   predicate(UseAVX > 0);
 5811   match(Set dst (AddVD src1 src2));
 5812   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5813   ins_encode %{
 5814     int vlen_enc = vector_length_encoding(this);
 5815     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5816   %}
 5817   ins_pipe( pipe_slow );
 5818 %}
 5819 
 5820 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5821   predicate((UseAVX > 0) &&
 5822             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5823   match(Set dst (AddVD src (LoadVector mem)));
 5824   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5825   ins_encode %{
 5826     int vlen_enc = vector_length_encoding(this);
 5827     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5828   %}
 5829   ins_pipe( pipe_slow );
 5830 %}
 5831 
 5832 // --------------------------------- SUB --------------------------------------
 5833 
 5834 // Bytes vector sub
 5835 instruct vsubB(vec dst, vec src) %{
 5836   predicate(UseAVX == 0);
 5837   match(Set dst (SubVB dst src));
 5838   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5839   ins_encode %{
 5840     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5841   %}
 5842   ins_pipe( pipe_slow );
 5843 %}
 5844 
 5845 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5846   predicate(UseAVX > 0);
 5847   match(Set dst (SubVB src1 src2));
 5848   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5849   ins_encode %{
 5850     int vlen_enc = vector_length_encoding(this);
 5851     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5852   %}
 5853   ins_pipe( pipe_slow );
 5854 %}
 5855 
 5856 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5857   predicate((UseAVX > 0) &&
 5858             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5859   match(Set dst (SubVB src (LoadVector mem)));
 5860   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5861   ins_encode %{
 5862     int vlen_enc = vector_length_encoding(this);
 5863     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5864   %}
 5865   ins_pipe( pipe_slow );
 5866 %}
 5867 
 5868 // Shorts/Chars vector sub
 5869 instruct vsubS(vec dst, vec src) %{
 5870   predicate(UseAVX == 0);
 5871   match(Set dst (SubVS dst src));
 5872   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5873   ins_encode %{
 5874     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5875   %}
 5876   ins_pipe( pipe_slow );
 5877 %}
 5878 
 5879 
 5880 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5881   predicate(UseAVX > 0);
 5882   match(Set dst (SubVS src1 src2));
 5883   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5884   ins_encode %{
 5885     int vlen_enc = vector_length_encoding(this);
 5886     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5887   %}
 5888   ins_pipe( pipe_slow );
 5889 %}
 5890 
 5891 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5892   predicate((UseAVX > 0) &&
 5893             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5894   match(Set dst (SubVS src (LoadVector mem)));
 5895   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5896   ins_encode %{
 5897     int vlen_enc = vector_length_encoding(this);
 5898     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5899   %}
 5900   ins_pipe( pipe_slow );
 5901 %}
 5902 
 5903 // Integers vector sub
 5904 instruct vsubI(vec dst, vec src) %{
 5905   predicate(UseAVX == 0);
 5906   match(Set dst (SubVI dst src));
 5907   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5908   ins_encode %{
 5909     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5910   %}
 5911   ins_pipe( pipe_slow );
 5912 %}
 5913 
 5914 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5915   predicate(UseAVX > 0);
 5916   match(Set dst (SubVI src1 src2));
 5917   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5918   ins_encode %{
 5919     int vlen_enc = vector_length_encoding(this);
 5920     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5921   %}
 5922   ins_pipe( pipe_slow );
 5923 %}
 5924 
 5925 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5926   predicate((UseAVX > 0) &&
 5927             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5928   match(Set dst (SubVI src (LoadVector mem)));
 5929   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5930   ins_encode %{
 5931     int vlen_enc = vector_length_encoding(this);
 5932     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5933   %}
 5934   ins_pipe( pipe_slow );
 5935 %}
 5936 
 5937 // Longs vector sub
 5938 instruct vsubL(vec dst, vec src) %{
 5939   predicate(UseAVX == 0);
 5940   match(Set dst (SubVL dst src));
 5941   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5942   ins_encode %{
 5943     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5944   %}
 5945   ins_pipe( pipe_slow );
 5946 %}
 5947 
 5948 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5949   predicate(UseAVX > 0);
 5950   match(Set dst (SubVL src1 src2));
 5951   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5952   ins_encode %{
 5953     int vlen_enc = vector_length_encoding(this);
 5954     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5955   %}
 5956   ins_pipe( pipe_slow );
 5957 %}
 5958 
 5959 
 5960 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5961   predicate((UseAVX > 0) &&
 5962             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5963   match(Set dst (SubVL src (LoadVector mem)));
 5964   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5965   ins_encode %{
 5966     int vlen_enc = vector_length_encoding(this);
 5967     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5968   %}
 5969   ins_pipe( pipe_slow );
 5970 %}
 5971 
 5972 // Floats vector sub
 5973 instruct vsubF(vec dst, vec src) %{
 5974   predicate(UseAVX == 0);
 5975   match(Set dst (SubVF dst src));
 5976   format %{ "subps   $dst,$src\t! sub packedF" %}
 5977   ins_encode %{
 5978     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5979   %}
 5980   ins_pipe( pipe_slow );
 5981 %}
 5982 
 5983 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5984   predicate(UseAVX > 0);
 5985   match(Set dst (SubVF src1 src2));
 5986   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5987   ins_encode %{
 5988     int vlen_enc = vector_length_encoding(this);
 5989     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5990   %}
 5991   ins_pipe( pipe_slow );
 5992 %}
 5993 
 5994 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5995   predicate((UseAVX > 0) &&
 5996             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5997   match(Set dst (SubVF src (LoadVector mem)));
 5998   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5999   ins_encode %{
 6000     int vlen_enc = vector_length_encoding(this);
 6001     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6002   %}
 6003   ins_pipe( pipe_slow );
 6004 %}
 6005 
 6006 // Doubles vector sub
 6007 instruct vsubD(vec dst, vec src) %{
 6008   predicate(UseAVX == 0);
 6009   match(Set dst (SubVD dst src));
 6010   format %{ "subpd   $dst,$src\t! sub packedD" %}
 6011   ins_encode %{
 6012     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 6013   %}
 6014   ins_pipe( pipe_slow );
 6015 %}
 6016 
 6017 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 6018   predicate(UseAVX > 0);
 6019   match(Set dst (SubVD src1 src2));
 6020   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 6021   ins_encode %{
 6022     int vlen_enc = vector_length_encoding(this);
 6023     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6024   %}
 6025   ins_pipe( pipe_slow );
 6026 %}
 6027 
 6028 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6029   predicate((UseAVX > 0) &&
 6030             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6031   match(Set dst (SubVD src (LoadVector mem)));
 6032   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6033   ins_encode %{
 6034     int vlen_enc = vector_length_encoding(this);
 6035     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6036   %}
 6037   ins_pipe( pipe_slow );
 6038 %}
 6039 
 6040 // --------------------------------- MUL --------------------------------------
 6041 
 6042 // Byte vector mul
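// x86 has no packed byte multiply, so the byte rules widen to 16-bit words (either
// via pmovsxbw or by handling the odd and even bytes of each word separately),
// multiply with pmullw/vpmullw, and then keep only the low byte of each product
// when narrowing or recombining.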
 6043 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6044   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6045   match(Set dst (MulVB src1 src2));
 6046   effect(TEMP dst, TEMP xtmp);
 6047   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6048   ins_encode %{
 6049     assert(UseSSE > 3, "required");
 6050     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6051     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6052     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6053     __ psllw($dst$$XMMRegister, 8);
 6054     __ psrlw($dst$$XMMRegister, 8);
 6055     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6056   %}
 6057   ins_pipe( pipe_slow );
 6058 %}
 6059 
 6060 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6061   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6062   match(Set dst (MulVB src1 src2));
 6063   effect(TEMP dst, TEMP xtmp);
 6064   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6065   ins_encode %{
 6066     assert(UseSSE > 3, "required");
 6067     // Odd-index elements
 6068     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6069     __ psrlw($dst$$XMMRegister, 8);
 6070     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6071     __ psrlw($xtmp$$XMMRegister, 8);
 6072     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6073     __ psllw($dst$$XMMRegister, 8);
 6074     // Even-index elements
 6075     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6076     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6077     __ psllw($xtmp$$XMMRegister, 8);
 6078     __ psrlw($xtmp$$XMMRegister, 8);
 6079     // Combine
 6080     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6081   %}
 6082   ins_pipe( pipe_slow );
 6083 %}
 6084 
 6085 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6086   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6087   match(Set dst (MulVB src1 src2));
 6088   effect(TEMP xtmp1, TEMP xtmp2);
 6089   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6090   ins_encode %{
 6091     int vlen_enc = vector_length_encoding(this);
 6092     // Odd-index elements
 6093     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6094     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6095     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6096     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6097     // Even-index elements
 6098     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6099     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6100     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6101     // Combine
 6102     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6103   %}
 6104   ins_pipe( pipe_slow );
 6105 %}
 6106 
 6107 // Shorts/Chars vector mul
 6108 instruct vmulS(vec dst, vec src) %{
 6109   predicate(UseAVX == 0);
 6110   match(Set dst (MulVS dst src));
 6111   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6112   ins_encode %{
 6113     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6114   %}
 6115   ins_pipe( pipe_slow );
 6116 %}
 6117 
 6118 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6119   predicate(UseAVX > 0);
 6120   match(Set dst (MulVS src1 src2));
 6121   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6122   ins_encode %{
 6123     int vlen_enc = vector_length_encoding(this);
 6124     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6125   %}
 6126   ins_pipe( pipe_slow );
 6127 %}
 6128 
 6129 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6130   predicate((UseAVX > 0) &&
 6131             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6132   match(Set dst (MulVS src (LoadVector mem)));
 6133   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6134   ins_encode %{
 6135     int vlen_enc = vector_length_encoding(this);
 6136     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6137   %}
 6138   ins_pipe( pipe_slow );
 6139 %}
 6140 
 6141 // Integers vector mul
 6142 instruct vmulI(vec dst, vec src) %{
 6143   predicate(UseAVX == 0);
 6144   match(Set dst (MulVI dst src));
 6145   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6146   ins_encode %{
 6147     assert(UseSSE > 3, "required");
 6148     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6149   %}
 6150   ins_pipe( pipe_slow );
 6151 %}
 6152 
 6153 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6154   predicate(UseAVX > 0);
 6155   match(Set dst (MulVI src1 src2));
 6156   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6157   ins_encode %{
 6158     int vlen_enc = vector_length_encoding(this);
 6159     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6160   %}
 6161   ins_pipe( pipe_slow );
 6162 %}
 6163 
 6164 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6165   predicate((UseAVX > 0) &&
 6166             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6167   match(Set dst (MulVI src (LoadVector mem)));
 6168   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6169   ins_encode %{
 6170     int vlen_enc = vector_length_encoding(this);
 6171     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6172   %}
 6173   ins_pipe( pipe_slow );
 6174 %}
 6175 
 6176 // Longs vector mul
 6177 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6178   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6179              VM_Version::supports_avx512dq()) ||
 6180             VM_Version::supports_avx512vldq());
 6181   match(Set dst (MulVL src1 src2));
 6182   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6183   ins_encode %{
 6184     assert(UseAVX > 2, "required");
 6185     int vlen_enc = vector_length_encoding(this);
 6186     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6187   %}
 6188   ins_pipe( pipe_slow );
 6189 %}
 6190 
 6191 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6192   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6193              VM_Version::supports_avx512dq()) ||
 6194             (Matcher::vector_length_in_bytes(n) > 8 &&
 6195              VM_Version::supports_avx512vldq()));
 6196   match(Set dst (MulVL src (LoadVector mem)));
 6197   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6198   ins_encode %{
 6199     assert(UseAVX > 2, "required");
 6200     int vlen_enc = vector_length_encoding(this);
 6201     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6202   %}
 6203   ins_pipe( pipe_slow );
 6204 %}
 6205 
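// Without a 64-bit packed multiply (AVX512DQ), the rules below assemble the low 64
// bits of each product from 32-bit pieces:
//   lo64(a * b) = ((a_lo * b_hi + a_hi * b_lo) << 32) + a_lo * b_lo
// The shuffled pmulld forms the two cross products, paddd sums them, psllq moves
// the sum into the upper half, and pmuludq supplies the unsigned a_lo * b_lo term.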
 6206 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6207   predicate(UseAVX == 0);
 6208   match(Set dst (MulVL src1 src2));
 6209   effect(TEMP dst, TEMP xtmp);
 6210   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6211   ins_encode %{
 6212     assert(VM_Version::supports_sse4_1(), "required");
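          // Each 64-bit lane is split as a = (a_hi << 32) | a_lo and b = (b_hi << 32) | b_lo, so
          //   (a * b) mod 2^64 = a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32)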
 6213     // Get the lo-hi cross products; only the lower 32 bits of each are needed
 6214     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6215     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6216     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6217     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6218     __ psllq($dst$$XMMRegister, 32);
 6219     // Get the lo-lo products
 6220     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6221     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6222     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6223   %}
 6224   ins_pipe( pipe_slow );
 6225 %}
 6226 
 6227 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6228   predicate(UseAVX > 0 &&
 6229             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6230               !VM_Version::supports_avx512dq()) ||
 6231              (Matcher::vector_length_in_bytes(n) < 64 &&
 6232               !VM_Version::supports_avx512vldq())));
 6233   match(Set dst (MulVL src1 src2));
 6234   effect(TEMP xtmp1, TEMP xtmp2);
 6235   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6236   ins_encode %{
 6237     int vlen_enc = vector_length_encoding(this);
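          // Same per-lane decomposition as vmulL above:
          //   (a * b) mod 2^64 = a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32)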
 6238     // Get the lo-hi cross products; only the lower 32 bits of each are needed
 6239     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6240     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6241     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6242     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6243     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6244     // Get the lo-lo products
 6245     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6246     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6247   %}
 6248   ins_pipe( pipe_slow );
 6249 %}
 6250 
 6251 // Floats vector mul
 6252 instruct vmulF(vec dst, vec src) %{
 6253   predicate(UseAVX == 0);
 6254   match(Set dst (MulVF dst src));
 6255   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6256   ins_encode %{
 6257     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6258   %}
 6259   ins_pipe( pipe_slow );
 6260 %}
 6261 
 6262 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6263   predicate(UseAVX > 0);
 6264   match(Set dst (MulVF src1 src2));
 6265   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6266   ins_encode %{
 6267     int vlen_enc = vector_length_encoding(this);
 6268     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6269   %}
 6270   ins_pipe( pipe_slow );
 6271 %}
 6272 
 6273 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6274   predicate((UseAVX > 0) &&
 6275             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6276   match(Set dst (MulVF src (LoadVector mem)));
 6277   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6278   ins_encode %{
 6279     int vlen_enc = vector_length_encoding(this);
 6280     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6281   %}
 6282   ins_pipe( pipe_slow );
 6283 %}
 6284 
 6285 // Doubles vector mul
 6286 instruct vmulD(vec dst, vec src) %{
 6287   predicate(UseAVX == 0);
 6288   match(Set dst (MulVD dst src));
 6289   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6290   ins_encode %{
 6291     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6292   %}
 6293   ins_pipe( pipe_slow );
 6294 %}
 6295 
 6296 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6297   predicate(UseAVX > 0);
 6298   match(Set dst (MulVD src1 src2));
 6299   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6300   ins_encode %{
 6301     int vlen_enc = vector_length_encoding(this);
 6302     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6303   %}
 6304   ins_pipe( pipe_slow );
 6305 %}
 6306 
 6307 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6308   predicate((UseAVX > 0) &&
 6309             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6310   match(Set dst (MulVD src (LoadVector mem)));
 6311   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6312   ins_encode %{
 6313     int vlen_enc = vector_length_encoding(this);
 6314     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6315   %}
 6316   ins_pipe( pipe_slow );
 6317 %}
 6318 
 6319 // --------------------------------- DIV --------------------------------------
 6320 
 6321 // Floats vector div
 6322 instruct vdivF(vec dst, vec src) %{
 6323   predicate(UseAVX == 0);
 6324   match(Set dst (DivVF dst src));
 6325   format %{ "divps   $dst,$src\t! div packedF" %}
 6326   ins_encode %{
 6327     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6328   %}
 6329   ins_pipe( pipe_slow );
 6330 %}
 6331 
 6332 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6333   predicate(UseAVX > 0);
 6334   match(Set dst (DivVF src1 src2));
 6335   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6336   ins_encode %{
 6337     int vlen_enc = vector_length_encoding(this);
 6338     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6339   %}
 6340   ins_pipe( pipe_slow );
 6341 %}
 6342 
 6343 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6344   predicate((UseAVX > 0) &&
 6345             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6346   match(Set dst (DivVF src (LoadVector mem)));
 6347   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6348   ins_encode %{
 6349     int vlen_enc = vector_length_encoding(this);
 6350     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6351   %}
 6352   ins_pipe( pipe_slow );
 6353 %}
 6354 
 6355 // Doubles vector div
 6356 instruct vdivD(vec dst, vec src) %{
 6357   predicate(UseAVX == 0);
 6358   match(Set dst (DivVD dst src));
 6359   format %{ "divpd   $dst,$src\t! div packedD" %}
 6360   ins_encode %{
 6361     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6362   %}
 6363   ins_pipe( pipe_slow );
 6364 %}
 6365 
 6366 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6367   predicate(UseAVX > 0);
 6368   match(Set dst (DivVD src1 src2));
 6369   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6370   ins_encode %{
 6371     int vlen_enc = vector_length_encoding(this);
 6372     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6373   %}
 6374   ins_pipe( pipe_slow );
 6375 %}
 6376 
 6377 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6378   predicate((UseAVX > 0) &&
 6379             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6380   match(Set dst (DivVD src (LoadVector mem)));
 6381   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6382   ins_encode %{
 6383     int vlen_enc = vector_length_encoding(this);
 6384     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6385   %}
 6386   ins_pipe( pipe_slow );
 6387 %}
 6388 
 6389 // ------------------------------ MinMax ---------------------------------------
 6390 
 6391 // Byte, Short, Int vector Min/Max
 6392 instruct minmax_reg_sse(vec dst, vec src) %{
 6393   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6394             UseAVX == 0);
 6395   match(Set dst (MinV dst src));
 6396   match(Set dst (MaxV dst src));
 6397   format %{ "vector_minmax  $dst,$src\t!  " %}
 6398   ins_encode %{
 6399     assert(UseSSE >= 4, "required");
 6400 
 6401     int opcode = this->ideal_Opcode();
 6402     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6403     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6404   %}
 6405   ins_pipe( pipe_slow );
 6406 %}
 6407 
 6408 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6409   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6410             UseAVX > 0);
 6411   match(Set dst (MinV src1 src2));
 6412   match(Set dst (MaxV src1 src2));
 6413   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6414   ins_encode %{
 6415     int opcode = this->ideal_Opcode();
 6416     int vlen_enc = vector_length_encoding(this);
 6417     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6418 
 6419     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6420   %}
 6421   ins_pipe( pipe_slow );
 6422 %}
 6423 
 6424 // Long vector Min/Max
 6425 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6426   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6427             UseAVX == 0);
 6428   match(Set dst (MinV dst src));
 6429   match(Set dst (MaxV src dst));
 6430   effect(TEMP dst, TEMP tmp);
 6431   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6432   ins_encode %{
 6433     assert(UseSSE >= 4, "required");
 6434 
 6435     int opcode = this->ideal_Opcode();
 6436     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6437     assert(elem_bt == T_LONG, "sanity");
 6438 
 6439     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6440   %}
 6441   ins_pipe( pipe_slow );
 6442 %}
 6443 
 6444 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6445   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6446             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6447   match(Set dst (MinV src1 src2));
 6448   match(Set dst (MaxV src1 src2));
 6449   effect(TEMP dst);
 6450   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6451   ins_encode %{
 6452     int vlen_enc = vector_length_encoding(this);
 6453     int opcode = this->ideal_Opcode();
 6454     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6455     assert(elem_bt == T_LONG, "sanity");
 6456 
 6457     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6458   %}
 6459   ins_pipe( pipe_slow );
 6460 %}
 6461 
 6462 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6463   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6464             Matcher::vector_element_basic_type(n) == T_LONG);
 6465   match(Set dst (MinV src1 src2));
 6466   match(Set dst (MaxV src1 src2));
 6467   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6468   ins_encode %{
 6469     assert(UseAVX > 2, "required");
 6470 
 6471     int vlen_enc = vector_length_encoding(this);
 6472     int opcode = this->ideal_Opcode();
 6473     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6474     assert(elem_bt == T_LONG, "sanity");
 6475 
 6476     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6477   %}
 6478   ins_pipe( pipe_slow );
 6479 %}
 6480 
 6481 // Float/Double vector Min/Max
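      // The x86 min/max instructions return the second operand when either input
      // is NaN and do not order -0.0 below +0.0, which does not match Java
      // Math.min/max. The patterns below therefore go through the
      // vminmax_fp/evminmax_fp fix-up helpers, which need extra temporaries
      // (and a mask register in the EVEX case).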
 6482 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6483   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6484             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6485             UseAVX > 0);
 6486   match(Set dst (MinV a b));
 6487   match(Set dst (MaxV a b));
 6488   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6489   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6490   ins_encode %{
 6491     assert(UseAVX > 0, "required");
 6492 
 6493     int opcode = this->ideal_Opcode();
 6494     int vlen_enc = vector_length_encoding(this);
 6495     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6496 
 6497     __ vminmax_fp(opcode, elem_bt,
 6498                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6499                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6500   %}
 6501   ins_pipe( pipe_slow );
 6502 %}
 6503 
 6504 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6505   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6506             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6507   match(Set dst (MinV a b));
 6508   match(Set dst (MaxV a b));
 6509   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6510   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6511   ins_encode %{
 6512     assert(UseAVX > 2, "required");
 6513 
 6514     int opcode = this->ideal_Opcode();
 6515     int vlen_enc = vector_length_encoding(this);
 6516     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6517 
 6518     __ evminmax_fp(opcode, elem_bt,
 6519                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6520                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6521   %}
 6522   ins_pipe( pipe_slow );
 6523 %}
 6524 
 6525 // ------------------------------ Unsigned vector Min/Max ----------------------
 6526 
 6527 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6528   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6529   match(Set dst (UMinV a b));
 6530   match(Set dst (UMaxV a b));
 6531   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6532   ins_encode %{
 6533     int opcode = this->ideal_Opcode();
 6534     int vlen_enc = vector_length_encoding(this);
 6535     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6536     assert(is_integral_type(elem_bt), "");
 6537     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6538   %}
 6539   ins_pipe( pipe_slow );
 6540 %}
 6541 
 6542 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6543   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6544   match(Set dst (UMinV a (LoadVector b)));
 6545   match(Set dst (UMaxV a (LoadVector b)));
 6546   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6547   ins_encode %{
 6548     int opcode = this->ideal_Opcode();
 6549     int vlen_enc = vector_length_encoding(this);
 6550     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6551     assert(is_integral_type(elem_bt), "");
 6552     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6553   %}
 6554   ins_pipe( pipe_slow );
 6555 %}
 6556 
 6557 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6558   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6559   match(Set dst (UMinV a b));
 6560   match(Set dst (UMaxV a b));
 6561   effect(TEMP xtmp1, TEMP xtmp2);
 6562   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6563   ins_encode %{
 6564     int opcode = this->ideal_Opcode();
 6565     int vlen_enc = vector_length_encoding(this);
 6566     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6567   %}
 6568   ins_pipe( pipe_slow );
 6569 %}
 6570 
 6571 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6572   match(Set dst (UMinV (Binary dst src2) mask));
 6573   match(Set dst (UMaxV (Binary dst src2) mask));
 6574   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6575   ins_encode %{
 6576     int vlen_enc = vector_length_encoding(this);
 6577     BasicType bt = Matcher::vector_element_basic_type(this);
 6578     int opc = this->ideal_Opcode();
 6579     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6580                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6581   %}
 6582   ins_pipe( pipe_slow );
 6583 %}
 6584 
 6585 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6586   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6587   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6588   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6589   ins_encode %{
 6590     int vlen_enc = vector_length_encoding(this);
 6591     BasicType bt = Matcher::vector_element_basic_type(this);
 6592     int opc = this->ideal_Opcode();
 6593     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6594                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6595   %}
 6596   ins_pipe( pipe_slow );
 6597 %}
 6598 
 6599 // --------------------------------- Signum/CopySign ---------------------------
 6600 
 6601 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6602   match(Set dst (SignumF dst (Binary zero one)));
 6603   effect(KILL cr);
 6604   format %{ "signumF $dst, $dst" %}
 6605   ins_encode %{
 6606     int opcode = this->ideal_Opcode();
 6607     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6608   %}
 6609   ins_pipe( pipe_slow );
 6610 %}
 6611 
 6612 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6613   match(Set dst (SignumD dst (Binary zero one)));
 6614   effect(KILL cr);
 6615   format %{ "signumD $dst, $dst" %}
 6616   ins_encode %{
 6617     int opcode = this->ideal_Opcode();
 6618     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6619   %}
 6620   ins_pipe( pipe_slow );
 6621 %}
 6622 
 6623 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6624   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6625   match(Set dst (SignumVF src (Binary zero one)));
 6626   match(Set dst (SignumVD src (Binary zero one)));
 6627   effect(TEMP dst, TEMP xtmp1);
 6628   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6629   ins_encode %{
 6630     int opcode = this->ideal_Opcode();
 6631     int vec_enc = vector_length_encoding(this);
 6632     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6633                          $xtmp1$$XMMRegister, vec_enc);
 6634   %}
 6635   ins_pipe( pipe_slow );
 6636 %}
 6637 
 6638 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6639   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6640   match(Set dst (SignumVF src (Binary zero one)));
 6641   match(Set dst (SignumVD src (Binary zero one)));
 6642   effect(TEMP dst, TEMP ktmp1);
 6643   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6644   ins_encode %{
 6645     int opcode = this->ideal_Opcode();
 6646     int vec_enc = vector_length_encoding(this);
 6647     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6648                           $ktmp1$$KRegister, vec_enc);
 6649   %}
 6650   ins_pipe( pipe_slow );
 6651 %}
 6652 
 6653 // ---------------------------------------
 6654 // For copySign use 0xE4 as the truth-table immediate for vpternlog
 6655 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6656 // C (xmm2) is set to 0x7FFFFFFF
 6657 // Wherever xmm2 is 0, we want to pick from B (sign)
 6658 // Wherever xmm2 is 1, we want to pick from A (src)
 6659 //
 6660 // A B C Result
 6661 // 0 0 0 0
 6662 // 0 0 1 0
 6663 // 0 1 0 1
 6664 // 0 1 1 0
 6665 // 1 0 0 0
 6666 // 1 0 1 1
 6667 // 1 1 0 1
 6668 // 1 1 1 1
 6669 //
 6670 // Result going from high bit to low bit is 0b11100100 = 0xE4
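      // (The vpternlog immediate is just the truth table read as a byte: bit index
      // (A<<2 | B<<1 | C) of the immediate holds the desired result bit, and the
      // rows above at indices 7..0 read 1,1,1,0,0,1,0,0 = 0xE4.)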
 6671 // ---------------------------------------
 6672 
 6673 #ifdef _LP64
 6674 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6675   match(Set dst (CopySignF dst src));
 6676   effect(TEMP tmp1, TEMP tmp2);
 6677   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6678   ins_encode %{
 6679     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6680     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6681     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6682   %}
 6683   ins_pipe( pipe_slow );
 6684 %}
 6685 
 6686 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6687   match(Set dst (CopySignD dst (Binary src zero)));
 6688   ins_cost(100);
 6689   effect(TEMP tmp1, TEMP tmp2);
 6690   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6691   ins_encode %{
 6692     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6693     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6694     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6695   %}
 6696   ins_pipe( pipe_slow );
 6697 %}
 6698 
 6699 #endif // _LP64
 6700 
 6701 //----------------------------- CompressBits/ExpandBits ------------------------
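      // CompressBits/ExpandBits map to the BMI2 PEXT/PDEP instructions: PEXT gathers
      // the bits of $src selected by the 1-bits of $mask into the contiguous low-order
      // bits of $dst, while PDEP scatters the low-order bits of $src back out to the
      // 1-bit positions of $mask.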
 6702 
 6703 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6704   predicate(n->bottom_type()->isa_int());
 6705   match(Set dst (CompressBits src mask));
 6706   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6707   ins_encode %{
 6708     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6709   %}
 6710   ins_pipe( pipe_slow );
 6711 %}
 6712 
 6713 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6714   predicate(n->bottom_type()->isa_int());
 6715   match(Set dst (ExpandBits src mask));
 6716   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6717   ins_encode %{
 6718     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6719   %}
 6720   ins_pipe( pipe_slow );
 6721 %}
 6722 
 6723 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6724   predicate(n->bottom_type()->isa_int());
 6725   match(Set dst (CompressBits src (LoadI mask)));
 6726   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6727   ins_encode %{
 6728     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6729   %}
 6730   ins_pipe( pipe_slow );
 6731 %}
 6732 
 6733 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6734   predicate(n->bottom_type()->isa_int());
 6735   match(Set dst (ExpandBits src (LoadI mask)));
 6736   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6737   ins_encode %{
 6738     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6739   %}
 6740   ins_pipe( pipe_slow );
 6741 %}
 6742 
 6743 // --------------------------------- Sqrt --------------------------------------
 6744 
 6745 instruct vsqrtF_reg(vec dst, vec src) %{
 6746   match(Set dst (SqrtVF src));
 6747   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6748   ins_encode %{
 6749     assert(UseAVX > 0, "required");
 6750     int vlen_enc = vector_length_encoding(this);
 6751     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6752   %}
 6753   ins_pipe( pipe_slow );
 6754 %}
 6755 
 6756 instruct vsqrtF_mem(vec dst, memory mem) %{
 6757   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6758   match(Set dst (SqrtVF (LoadVector mem)));
 6759   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6760   ins_encode %{
 6761     assert(UseAVX > 0, "required");
 6762     int vlen_enc = vector_length_encoding(this);
 6763     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6764   %}
 6765   ins_pipe( pipe_slow );
 6766 %}
 6767 
 6768 // Doubles vector sqrt
 6769 instruct vsqrtD_reg(vec dst, vec src) %{
 6770   match(Set dst (SqrtVD src));
 6771   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6772   ins_encode %{
 6773     assert(UseAVX > 0, "required");
 6774     int vlen_enc = vector_length_encoding(this);
 6775     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6776   %}
 6777   ins_pipe( pipe_slow );
 6778 %}
 6779 
 6780 instruct vsqrtD_mem(vec dst, memory mem) %{
 6781   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6782   match(Set dst (SqrtVD (LoadVector mem)));
 6783   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6784   ins_encode %{
 6785     assert(UseAVX > 0, "required");
 6786     int vlen_enc = vector_length_encoding(this);
 6787     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6788   %}
 6789   ins_pipe( pipe_slow );
 6790 %}
 6791 
 6792 // ------------------------------ Shift ---------------------------------------
 6793 
 6794 // Left and right shift count vectors are the same on x86
 6795 // (only the low 64 bits of the xmm register are used for the count).
 6796 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6797   match(Set dst (LShiftCntV cnt));
 6798   match(Set dst (RShiftCntV cnt));
 6799   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6800   ins_encode %{
 6801     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6802   %}
 6803   ins_pipe( pipe_slow );
 6804 %}
 6805 
 6806 // Byte vector shift
 6807 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6808   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6809   match(Set dst ( LShiftVB src shift));
 6810   match(Set dst ( RShiftVB src shift));
 6811   match(Set dst (URShiftVB src shift));
 6812   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6813   format %{"vector_byte_shift $dst,$src,$shift" %}
 6814   ins_encode %{
 6815     assert(UseSSE > 3, "required");
 6816     int opcode = this->ideal_Opcode();
 6817     bool sign = (opcode != Op_URShiftVB);
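          // x86 has no byte-granularity shifts: widen the bytes to words, shift the
          // words, mask each word back down to 0x00FF and pack the results to bytes.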
 6818     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6819     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6820     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6821     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6822     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6823   %}
 6824   ins_pipe( pipe_slow );
 6825 %}
 6826 
 6827 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6828   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6829             UseAVX <= 1);
 6830   match(Set dst ( LShiftVB src shift));
 6831   match(Set dst ( RShiftVB src shift));
 6832   match(Set dst (URShiftVB src shift));
 6833   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6834   format %{"vector_byte_shift $dst,$src,$shift" %}
 6835   ins_encode %{
 6836     assert(UseSSE > 3, "required");
 6837     int opcode = this->ideal_Opcode();
 6838     bool sign = (opcode != Op_URShiftVB);
 6839     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6840     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6841     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6842     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6843     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6844     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6845     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6846     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6847     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6848   %}
 6849   ins_pipe( pipe_slow );
 6850 %}
 6851 
 6852 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6853   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6854             UseAVX > 1);
 6855   match(Set dst ( LShiftVB src shift));
 6856   match(Set dst ( RShiftVB src shift));
 6857   match(Set dst (URShiftVB src shift));
 6858   effect(TEMP dst, TEMP tmp);
 6859   format %{"vector_byte_shift $dst,$src,$shift" %}
 6860   ins_encode %{
 6861     int opcode = this->ideal_Opcode();
 6862     bool sign = (opcode != Op_URShiftVB);
 6863     int vlen_enc = Assembler::AVX_256bit;
 6864     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6865     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6866     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6867     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6868     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6869   %}
 6870   ins_pipe( pipe_slow );
 6871 %}
 6872 
 6873 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6874   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6875   match(Set dst ( LShiftVB src shift));
 6876   match(Set dst ( RShiftVB src shift));
 6877   match(Set dst (URShiftVB src shift));
 6878   effect(TEMP dst, TEMP tmp);
 6879   format %{"vector_byte_shift $dst,$src,$shift" %}
 6880   ins_encode %{
 6881     assert(UseAVX > 1, "required");
 6882     int opcode = this->ideal_Opcode();
 6883     bool sign = (opcode != Op_URShiftVB);
 6884     int vlen_enc = Assembler::AVX_256bit;
 6885     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6886     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6887     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6888     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6889     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6890     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6891     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6892     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
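          // vpackuswb packs within 128-bit lanes, so permute the 64-bit quadrants
          // (imm 0xD8) to restore the original element order.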
 6893     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6894   %}
 6895   ins_pipe( pipe_slow );
 6896 %}
 6897 
 6898 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6899   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6900   match(Set dst ( LShiftVB src shift));
 6901   match(Set dst  (RShiftVB src shift));
 6902   match(Set dst (URShiftVB src shift));
 6903   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6904   format %{"vector_byte_shift $dst,$src,$shift" %}
 6905   ins_encode %{
 6906     assert(UseAVX > 2, "required");
 6907     int opcode = this->ideal_Opcode();
 6908     bool sign = (opcode != Op_URShiftVB);
 6909     int vlen_enc = Assembler::AVX_512bit;
 6910     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6911     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6912     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6913     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6914     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6915     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6916     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6917     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6918     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6919     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6920     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6921     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6922   %}
 6923   ins_pipe( pipe_slow );
 6924 %}
 6925 
 6926 // Shorts vector logical right shift produces an incorrect Java result for
 6927 // negative data, because Java promotes the short value to an int with sign
 6928 // extension before shifting. But char vectors are fine, since chars are
 6929 // unsigned values.
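      // Example (shift by 1): for short s = (short)0xFFFC (-4), Java computes
      // ((int)s) >>> 1 = 0x7FFFFFFE, whose low 16 bits are 0xFFFE (-2), while a
      // 16-bit logical shift would yield 0x7FFE. With the zero-extending char
      // promotion both results agree.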
 6930 // Shorts/Chars vector shift
 6931 instruct vshiftS(vec dst, vec src, vec shift) %{
 6932   predicate(!n->as_ShiftV()->is_var_shift());
 6933   match(Set dst ( LShiftVS src shift));
 6934   match(Set dst ( RShiftVS src shift));
 6935   match(Set dst (URShiftVS src shift));
 6936   effect(TEMP dst, USE src, USE shift);
 6937   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6938   ins_encode %{
 6939     int opcode = this->ideal_Opcode();
 6940     if (UseAVX > 0) {
 6941       int vlen_enc = vector_length_encoding(this);
 6942       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6943     } else {
 6944       int vlen = Matcher::vector_length(this);
 6945       if (vlen == 2) {
 6946         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6947         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6948       } else if (vlen == 4) {
 6949         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6950         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6951       } else {
 6952         assert (vlen == 8, "sanity");
 6953         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6954         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6955       }
 6956     }
 6957   %}
 6958   ins_pipe( pipe_slow );
 6959 %}
 6960 
 6961 // Integers vector shift
 6962 instruct vshiftI(vec dst, vec src, vec shift) %{
 6963   predicate(!n->as_ShiftV()->is_var_shift());
 6964   match(Set dst ( LShiftVI src shift));
 6965   match(Set dst ( RShiftVI src shift));
 6966   match(Set dst (URShiftVI src shift));
 6967   effect(TEMP dst, USE src, USE shift);
 6968   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6969   ins_encode %{
 6970     int opcode = this->ideal_Opcode();
 6971     if (UseAVX > 0) {
 6972       int vlen_enc = vector_length_encoding(this);
 6973       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6974     } else {
 6975       int vlen = Matcher::vector_length(this);
 6976       if (vlen == 2) {
 6977         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6978         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6979       } else {
 6980         assert(vlen == 4, "sanity");
 6981         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6982         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6983       }
 6984     }
 6985   %}
 6986   ins_pipe( pipe_slow );
 6987 %}
 6988 
 6989 // Integers vector constant shift
 6990 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6991   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6992   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6993   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6994   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6995   ins_encode %{
 6996     int opcode = this->ideal_Opcode();
 6997     if (UseAVX > 0) {
 6998       int vector_len = vector_length_encoding(this);
 6999       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7000     } else {
 7001       int vlen = Matcher::vector_length(this);
 7002       if (vlen == 2) {
 7003         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7004         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7005       } else {
 7006         assert(vlen == 4, "sanity");
 7007         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7008         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7009       }
 7010     }
 7011   %}
 7012   ins_pipe( pipe_slow );
 7013 %}
 7014 
 7015 // Longs vector shift
 7016 instruct vshiftL(vec dst, vec src, vec shift) %{
 7017   predicate(!n->as_ShiftV()->is_var_shift());
 7018   match(Set dst ( LShiftVL src shift));
 7019   match(Set dst (URShiftVL src shift));
 7020   effect(TEMP dst, USE src, USE shift);
 7021   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 7022   ins_encode %{
 7023     int opcode = this->ideal_Opcode();
 7024     if (UseAVX > 0) {
 7025       int vlen_enc = vector_length_encoding(this);
 7026       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7027     } else {
 7028       assert(Matcher::vector_length(this) == 2, "");
 7029       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7030       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7031     }
 7032   %}
 7033   ins_pipe( pipe_slow );
 7034 %}
 7035 
 7036 // Longs vector constant shift
 7037 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 7038   match(Set dst (LShiftVL src (LShiftCntV shift)));
 7039   match(Set dst (URShiftVL src (RShiftCntV shift)));
 7040   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 7041   ins_encode %{
 7042     int opcode = this->ideal_Opcode();
 7043     if (UseAVX > 0) {
 7044       int vector_len = vector_length_encoding(this);
 7045       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7046     } else {
 7047       assert(Matcher::vector_length(this) == 2, "");
 7048       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7049       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7050     }
 7051   %}
 7052   ins_pipe( pipe_slow );
 7053 %}
 7054 
 7055 // ------------------- Arithmetic Right Shift ---------------------------------
 7056 // Long vector arithmetic right shift
 7057 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 7058   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 7059   match(Set dst (RShiftVL src shift));
 7060   effect(TEMP dst, TEMP tmp);
 7061   format %{ "vshiftq $dst,$src,$shift" %}
 7062   ins_encode %{
 7063     uint vlen = Matcher::vector_length(this);
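          // Neither SSE nor AVX2 provides an arithmetic right shift for 64-bit lanes.
          // Emulate it: t = x >>> s; m = sign_mask >>> s; result = (t ^ m) - m,
          // which re-extends the shifted-down sign bit.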
 7064     if (vlen == 2) {
 7065       assert(UseSSE >= 2, "required");
 7066       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7067       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7068       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7069       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7070       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7071       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7072     } else {
 7073       assert(vlen == 4, "sanity");
 7074       assert(UseAVX > 1, "required");
 7075       int vlen_enc = Assembler::AVX_256bit;
 7076       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7077       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7078       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7079       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7080       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7081     }
 7082   %}
 7083   ins_pipe( pipe_slow );
 7084 %}
 7085 
 7086 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7087   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7088   match(Set dst (RShiftVL src shift));
 7089   format %{ "vshiftq $dst,$src,$shift" %}
 7090   ins_encode %{
 7091     int vlen_enc = vector_length_encoding(this);
 7092     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7093   %}
 7094   ins_pipe( pipe_slow );
 7095 %}
 7096 
 7097 // ------------------- Variable Shift -----------------------------
 7098 // Byte variable shift
 7099 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7100   predicate(Matcher::vector_length(n) <= 8 &&
 7101             n->as_ShiftV()->is_var_shift() &&
 7102             !VM_Version::supports_avx512bw());
 7103   match(Set dst ( LShiftVB src shift));
 7104   match(Set dst ( RShiftVB src shift));
 7105   match(Set dst (URShiftVB src shift));
 7106   effect(TEMP dst, TEMP vtmp);
 7107   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7108   ins_encode %{
 7109     assert(UseAVX >= 2, "required");
 7110 
 7111     int opcode = this->ideal_Opcode();
 7112     int vlen_enc = Assembler::AVX_128bit;
 7113     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7114     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7115   %}
 7116   ins_pipe( pipe_slow );
 7117 %}
 7118 
 7119 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7120   predicate(Matcher::vector_length(n) == 16 &&
 7121             n->as_ShiftV()->is_var_shift() &&
 7122             !VM_Version::supports_avx512bw());
 7123   match(Set dst ( LShiftVB src shift));
 7124   match(Set dst ( RShiftVB src shift));
 7125   match(Set dst (URShiftVB src shift));
 7126   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7127   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7128   ins_encode %{
 7129     assert(UseAVX >= 2, "required");
 7130 
 7131     int opcode = this->ideal_Opcode();
 7132     int vlen_enc = Assembler::AVX_128bit;
 7133     // Shift lower half and get word result in dst
 7134     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7135 
 7136     // Shift upper half and get word result in vtmp1
 7137     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7138     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7139     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7140 
 7141     // Merge and down convert the two word results to byte in dst
 7142     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7143   %}
 7144   ins_pipe( pipe_slow );
 7145 %}
 7146 
 7147 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7148   predicate(Matcher::vector_length(n) == 32 &&
 7149             n->as_ShiftV()->is_var_shift() &&
 7150             !VM_Version::supports_avx512bw());
 7151   match(Set dst ( LShiftVB src shift));
 7152   match(Set dst ( RShiftVB src shift));
 7153   match(Set dst (URShiftVB src shift));
 7154   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7155   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7156   ins_encode %{
 7157     assert(UseAVX >= 2, "required");
 7158 
 7159     int opcode = this->ideal_Opcode();
 7160     int vlen_enc = Assembler::AVX_128bit;
 7161     // Process lower 128 bits and get result in dst
 7162     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7163     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7164     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7165     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7166     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7167 
 7168     // Process higher 128 bits and get result in vtmp3
 7169     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7170     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7171     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7172     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7173     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7174     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7175     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7176 
 7177     // Merge the two results in dst
 7178     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7179   %}
 7180   ins_pipe( pipe_slow );
 7181 %}
 7182 
 7183 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7184   predicate(Matcher::vector_length(n) <= 32 &&
 7185             n->as_ShiftV()->is_var_shift() &&
 7186             VM_Version::supports_avx512bw());
 7187   match(Set dst ( LShiftVB src shift));
 7188   match(Set dst ( RShiftVB src shift));
 7189   match(Set dst (URShiftVB src shift));
 7190   effect(TEMP dst, TEMP vtmp);
 7191   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7192   ins_encode %{
 7193     assert(UseAVX > 2, "required");
 7194 
 7195     int opcode = this->ideal_Opcode();
 7196     int vlen_enc = vector_length_encoding(this);
 7197     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7198   %}
 7199   ins_pipe( pipe_slow );
 7200 %}
 7201 
 7202 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7203   predicate(Matcher::vector_length(n) == 64 &&
 7204             n->as_ShiftV()->is_var_shift() &&
 7205             VM_Version::supports_avx512bw());
 7206   match(Set dst ( LShiftVB src shift));
 7207   match(Set dst ( RShiftVB src shift));
 7208   match(Set dst (URShiftVB src shift));
 7209   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7210   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7211   ins_encode %{
 7212     assert(UseAVX > 2, "required");
 7213 
 7214     int opcode = this->ideal_Opcode();
 7215     int vlen_enc = Assembler::AVX_256bit;
 7216     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7217     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7218     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7219     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7220     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7221   %}
 7222   ins_pipe( pipe_slow );
 7223 %}
 7224 
 7225 // Short variable shift
 7226 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7227   predicate(Matcher::vector_length(n) <= 8 &&
 7228             n->as_ShiftV()->is_var_shift() &&
 7229             !VM_Version::supports_avx512bw());
 7230   match(Set dst ( LShiftVS src shift));
 7231   match(Set dst ( RShiftVS src shift));
 7232   match(Set dst (URShiftVS src shift));
 7233   effect(TEMP dst, TEMP vtmp);
 7234   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7235   ins_encode %{
 7236     assert(UseAVX >= 2, "required");
 7237 
 7238     int opcode = this->ideal_Opcode();
 7239     bool sign = (opcode != Op_URShiftVS);
 7240     int vlen_enc = Assembler::AVX_256bit;
 7241     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7242     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7243     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7244     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7245     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7246     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7247   %}
 7248   ins_pipe( pipe_slow );
 7249 %}
 7250 
 7251 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7252   predicate(Matcher::vector_length(n) == 16 &&
 7253             n->as_ShiftV()->is_var_shift() &&
 7254             !VM_Version::supports_avx512bw());
 7255   match(Set dst ( LShiftVS src shift));
 7256   match(Set dst ( RShiftVS src shift));
 7257   match(Set dst (URShiftVS src shift));
 7258   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7259   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7260   ins_encode %{
 7261     assert(UseAVX >= 2, "required");
 7262 
 7263     int opcode = this->ideal_Opcode();
 7264     bool sign = (opcode != Op_URShiftVS);
 7265     int vlen_enc = Assembler::AVX_256bit;
 7266     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7267     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7268     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7269     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7270     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7271 
 7272     // Shift upper half, with result in dst using vtmp1 as TEMP
 7273     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7274     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7275     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7276     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7277     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7278     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7279 
 7280     // Merge lower and upper half result into dst
 7281     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7282     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7283   %}
 7284   ins_pipe( pipe_slow );
 7285 %}
 7286 
 7287 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7288   predicate(n->as_ShiftV()->is_var_shift() &&
 7289             VM_Version::supports_avx512bw());
 7290   match(Set dst ( LShiftVS src shift));
 7291   match(Set dst ( RShiftVS src shift));
 7292   match(Set dst (URShiftVS src shift));
 7293   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7294   ins_encode %{
 7295     assert(UseAVX > 2, "required");
 7296 
 7297     int opcode = this->ideal_Opcode();
 7298     int vlen_enc = vector_length_encoding(this);
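          // Without AVX512VL the EVEX variable word shifts only exist at 512-bit
          // vector length, so widen the encoding; the unused upper lanes are ignored.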
 7299     if (!VM_Version::supports_avx512vl()) {
 7300       vlen_enc = Assembler::AVX_512bit;
 7301     }
 7302     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7303   %}
 7304   ins_pipe( pipe_slow );
 7305 %}
 7306 
 7307 // Integer variable shift
 7308 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7309   predicate(n->as_ShiftV()->is_var_shift());
 7310   match(Set dst ( LShiftVI src shift));
 7311   match(Set dst ( RShiftVI src shift));
 7312   match(Set dst (URShiftVI src shift));
 7313   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7314   ins_encode %{
 7315     assert(UseAVX >= 2, "required");
 7316 
 7317     int opcode = this->ideal_Opcode();
 7318     int vlen_enc = vector_length_encoding(this);
 7319     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7320   %}
 7321   ins_pipe( pipe_slow );
 7322 %}
 7323 
 7324 // Long variable shift
 7325 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7326   predicate(n->as_ShiftV()->is_var_shift());
 7327   match(Set dst ( LShiftVL src shift));
 7328   match(Set dst (URShiftVL src shift));
 7329   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7330   ins_encode %{
 7331     assert(UseAVX >= 2, "required");
 7332 
 7333     int opcode = this->ideal_Opcode();
 7334     int vlen_enc = vector_length_encoding(this);
 7335     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7336   %}
 7337   ins_pipe( pipe_slow );
 7338 %}
 7339 
 7340 // Long variable arithmetic right shift
 7341 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7342   predicate(Matcher::vector_length(n) <= 4 &&
 7343             n->as_ShiftV()->is_var_shift() &&
 7344             UseAVX == 2);
 7345   match(Set dst (RShiftVL src shift));
 7346   effect(TEMP dst, TEMP vtmp);
 7347   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7348   ins_encode %{
 7349     int opcode = this->ideal_Opcode();
 7350     int vlen_enc = vector_length_encoding(this);
 7351     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7352                  $vtmp$$XMMRegister);
 7353   %}
 7354   ins_pipe( pipe_slow );
 7355 %}
 7356 
 7357 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7358   predicate(n->as_ShiftV()->is_var_shift() &&
 7359             UseAVX > 2);
 7360   match(Set dst (RShiftVL src shift));
 7361   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7362   ins_encode %{
 7363     int opcode = this->ideal_Opcode();
 7364     int vlen_enc = vector_length_encoding(this);
 7365     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7366   %}
 7367   ins_pipe( pipe_slow );
 7368 %}
 7369 
 7370 // --------------------------------- AND --------------------------------------
 7371 
 7372 instruct vand(vec dst, vec src) %{
 7373   predicate(UseAVX == 0);
 7374   match(Set dst (AndV dst src));
 7375   format %{ "pand    $dst,$src\t! and vectors" %}
 7376   ins_encode %{
 7377     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7378   %}
 7379   ins_pipe( pipe_slow );
 7380 %}
 7381 
 7382 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7383   predicate(UseAVX > 0);
 7384   match(Set dst (AndV src1 src2));
 7385   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7386   ins_encode %{
 7387     int vlen_enc = vector_length_encoding(this);
 7388     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7389   %}
 7390   ins_pipe( pipe_slow );
 7391 %}
 7392 
 7393 instruct vand_mem(vec dst, vec src, memory mem) %{
 7394   predicate((UseAVX > 0) &&
 7395             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7396   match(Set dst (AndV src (LoadVector mem)));
 7397   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7398   ins_encode %{
 7399     int vlen_enc = vector_length_encoding(this);
 7400     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7401   %}
 7402   ins_pipe( pipe_slow );
 7403 %}
 7404 
 7405 // --------------------------------- OR ---------------------------------------
 7406 
 7407 instruct vor(vec dst, vec src) %{
 7408   predicate(UseAVX == 0);
 7409   match(Set dst (OrV dst src));
 7410   format %{ "por     $dst,$src\t! or vectors" %}
 7411   ins_encode %{
 7412     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7413   %}
 7414   ins_pipe( pipe_slow );
 7415 %}
 7416 
 7417 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7418   predicate(UseAVX > 0);
 7419   match(Set dst (OrV src1 src2));
 7420   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7421   ins_encode %{
 7422     int vlen_enc = vector_length_encoding(this);
 7423     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7424   %}
 7425   ins_pipe( pipe_slow );
 7426 %}
 7427 
 7428 instruct vor_mem(vec dst, vec src, memory mem) %{
 7429   predicate((UseAVX > 0) &&
 7430             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7431   match(Set dst (OrV src (LoadVector mem)));
 7432   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7433   ins_encode %{
 7434     int vlen_enc = vector_length_encoding(this);
 7435     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7436   %}
 7437   ins_pipe( pipe_slow );
 7438 %}
 7439 
 7440 // --------------------------------- XOR --------------------------------------
 7441 
 7442 instruct vxor(vec dst, vec src) %{
 7443   predicate(UseAVX == 0);
 7444   match(Set dst (XorV dst src));
 7445   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7446   ins_encode %{
 7447     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7448   %}
 7449   ins_pipe( pipe_slow );
 7450 %}
 7451 
 7452 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7453   predicate(UseAVX > 0);
 7454   match(Set dst (XorV src1 src2));
 7455   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7456   ins_encode %{
 7457     int vlen_enc = vector_length_encoding(this);
 7458     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7459   %}
 7460   ins_pipe( pipe_slow );
 7461 %}
 7462 
 7463 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7464   predicate((UseAVX > 0) &&
 7465             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7466   match(Set dst (XorV src (LoadVector mem)));
 7467   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7468   ins_encode %{
 7469     int vlen_enc = vector_length_encoding(this);
 7470     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7471   %}
 7472   ins_pipe( pipe_slow );
 7473 %}
 7474 
 7475 // --------------------------------- VectorCast --------------------------------------
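      // Casts between element types. Widening casts and int<->FP conversions generally map onto
      // vpmovsx*/vcvt* sequences. Narrowing casts use evpmov* truncation when the required
      // AVX-512 (VL/BW) support is present; otherwise they are emulated with mask-and-pack
      // sequences on the legacy AVX encodings (see castStoX/castItoX below).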
 7476 
 7477 instruct vcastBtoX(vec dst, vec src) %{
 7478   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7479   match(Set dst (VectorCastB2X src));
 7480   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7481   ins_encode %{
 7482     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7483     int vlen_enc = vector_length_encoding(this);
 7484     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7485   %}
 7486   ins_pipe( pipe_slow );
 7487 %}
 7488 
 7489 instruct vcastBtoD(legVec dst, legVec src) %{
 7490   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7491   match(Set dst (VectorCastB2X src));
 7492   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7493   ins_encode %{
 7494     int vlen_enc = vector_length_encoding(this);
 7495     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7496   %}
 7497   ins_pipe( pipe_slow );
 7498 %}
 7499 
 7500 instruct castStoX(vec dst, vec src) %{
 7501   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7502             Matcher::vector_length(n->in(1)) <= 8 && // src
 7503             Matcher::vector_element_basic_type(n) == T_BYTE);
 7504   match(Set dst (VectorCastS2X src));
 7505   format %{ "vector_cast_s2x $dst,$src" %}
 7506   ins_encode %{
 7507     assert(UseAVX > 0, "required");
 7508 
 7509     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7510     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7511   %}
 7512   ins_pipe( pipe_slow );
 7513 %}
 7514 
 7515 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7516   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7517             Matcher::vector_length(n->in(1)) == 16 && // src
 7518             Matcher::vector_element_basic_type(n) == T_BYTE);
 7519   effect(TEMP dst, TEMP vtmp);
 7520   match(Set dst (VectorCastS2X src));
 7521   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7522   ins_encode %{
 7523     assert(UseAVX > 0, "required");
 7524 
 7525     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7526     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7527     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7528     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7529   %}
 7530   ins_pipe( pipe_slow );
 7531 %}
 7532 
 7533 instruct vcastStoX_evex(vec dst, vec src) %{
 7534   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7535             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7536   match(Set dst (VectorCastS2X src));
 7537   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7538   ins_encode %{
 7539     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7540     int src_vlen_enc = vector_length_encoding(this, $src);
 7541     int vlen_enc = vector_length_encoding(this);
 7542     switch (to_elem_bt) {
 7543       case T_BYTE:
 7544         if (!VM_Version::supports_avx512vl()) {
 7545           vlen_enc = Assembler::AVX_512bit;
 7546         }
 7547         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7548         break;
 7549       case T_INT:
 7550         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7551         break;
 7552       case T_FLOAT:
 7553         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7554         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7555         break;
 7556       case T_LONG:
 7557         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7558         break;
 7559       case T_DOUBLE: {
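              // Widen short->int at half the destination width first: each resulting double is
              // twice as wide as its intermediate int, so the int vector only fills half a register.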
 7560         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7561         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7562         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7563         break;
 7564       }
 7565       default:
 7566         ShouldNotReachHere();
 7567     }
 7568   %}
 7569   ins_pipe( pipe_slow );
 7570 %}
 7571 
 7572 instruct castItoX(vec dst, vec src) %{
 7573   predicate(UseAVX <= 2 &&
 7574             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7575             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7576   match(Set dst (VectorCastI2X src));
 7577   format %{ "vector_cast_i2x $dst,$src" %}
 7578   ins_encode %{
 7579     assert(UseAVX > 0, "required");
 7580 
 7581     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7582     int vlen_enc = vector_length_encoding(this, $src);
 7583 
 7584     if (to_elem_bt == T_BYTE) {
 7585       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7586       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7587       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7588     } else {
 7589       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7590       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7591       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7592     }
 7593   %}
 7594   ins_pipe( pipe_slow );
 7595 %}
 7596 
 7597 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7598   predicate(UseAVX <= 2 &&
 7599             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7600             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7601   match(Set dst (VectorCastI2X src));
 7602   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7603   effect(TEMP dst, TEMP vtmp);
 7604   ins_encode %{
 7605     assert(UseAVX > 0, "required");
 7606 
 7607     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7608     int vlen_enc = vector_length_encoding(this, $src);
 7609 
 7610     if (to_elem_bt == T_BYTE) {
 7611       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7612       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7613       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7614       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7615     } else {
 7616       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7617       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7618       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7619       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7620     }
 7621   %}
 7622   ins_pipe( pipe_slow );
 7623 %}
 7624 
 7625 instruct vcastItoX_evex(vec dst, vec src) %{
 7626   predicate(UseAVX > 2 ||
 7627             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7628   match(Set dst (VectorCastI2X src));
 7629   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7630   ins_encode %{
 7631     assert(UseAVX > 0, "required");
 7632 
 7633     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7634     int src_vlen_enc = vector_length_encoding(this, $src);
 7635     int dst_vlen_enc = vector_length_encoding(this);
 7636     switch (dst_elem_bt) {
 7637       case T_BYTE:
 7638         if (!VM_Version::supports_avx512vl()) {
 7639           src_vlen_enc = Assembler::AVX_512bit;
 7640         }
 7641         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7642         break;
 7643       case T_SHORT:
 7644         if (!VM_Version::supports_avx512vl()) {
 7645           src_vlen_enc = Assembler::AVX_512bit;
 7646         }
 7647         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7648         break;
 7649       case T_FLOAT:
 7650         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7651         break;
 7652       case T_LONG:
 7653         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7654         break;
 7655       case T_DOUBLE:
 7656         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7657         break;
 7658       default:
 7659         ShouldNotReachHere();
 7660     }
 7661   %}
 7662   ins_pipe( pipe_slow );
 7663 %}
 7664 
 7665 instruct vcastLtoBS(vec dst, vec src) %{
 7666   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7667             UseAVX <= 2);
 7668   match(Set dst (VectorCastL2X src));
 7669   format %{ "vector_cast_l2x  $dst,$src" %}
 7670   ins_encode %{
 7671     assert(UseAVX > 0, "required");
 7672 
 7673     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7674     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7675     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7676                                                       : ExternalAddress(vector_int_to_short_mask());
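          // Gather the low 32 bits of each long into the lower lanes (vpshufd for 128-bit sources,
          // vpermilps + vpermpd for 256-bit ones), then mask and pack down to short, and further
          // down to byte if needed.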
 7677     if (vlen <= 16) {
 7678       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7679       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7680       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7681     } else {
 7682       assert(vlen <= 32, "required");
 7683       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7684       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7685       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7686       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7687     }
 7688     if (to_elem_bt == T_BYTE) {
 7689       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7690     }
 7691   %}
 7692   ins_pipe( pipe_slow );
 7693 %}
 7694 
 7695 instruct vcastLtoX_evex(vec dst, vec src) %{
 7696   predicate(UseAVX > 2 ||
 7697             (Matcher::vector_element_basic_type(n) == T_INT ||
 7698              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7699              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7700   match(Set dst (VectorCastL2X src));
 7701   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7702   ins_encode %{
 7703     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7704     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7705     int vlen_enc = vector_length_encoding(this, $src);
 7706     switch (to_elem_bt) {
 7707       case T_BYTE:
 7708         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7709           vlen_enc = Assembler::AVX_512bit;
 7710         }
 7711         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7712         break;
 7713       case T_SHORT:
 7714         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7715           vlen_enc = Assembler::AVX_512bit;
 7716         }
 7717         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7718         break;
 7719       case T_INT:
 7720         if (vlen == 8) {
 7721           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7722             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7723           }
 7724         } else if (vlen == 16) {
 7725           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7726         } else if (vlen == 32) {
 7727           if (UseAVX > 2) {
 7728             if (!VM_Version::supports_avx512vl()) {
 7729               vlen_enc = Assembler::AVX_512bit;
 7730             }
 7731             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7732           } else {
 7733             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7734             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7735           }
 7736         } else { // vlen == 64
 7737           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7738         }
 7739         break;
 7740       case T_FLOAT:
 7741         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7742         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7743         break;
 7744       case T_DOUBLE:
 7745         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7746         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7747         break;
 7748 
 7749       default: assert(false, "%s", type2name(to_elem_bt));
 7750     }
 7751   %}
 7752   ins_pipe( pipe_slow );
 7753 %}
 7754 
 7755 instruct vcastFtoD_reg(vec dst, vec src) %{
 7756   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7757   match(Set dst (VectorCastF2X src));
 7758   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7759   ins_encode %{
 7760     int vlen_enc = vector_length_encoding(this);
 7761     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7762   %}
 7763   ins_pipe( pipe_slow );
 7764 %}
 7765 
 7766 
 7767 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7768   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7769             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7770   match(Set dst (VectorCastF2X src));
 7771   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7772   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7773   ins_encode %{
 7774     int vlen_enc = vector_length_encoding(this, $src);
 7775     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7776     // JDK-8292878 removed the need for an explicit scratch register when loading addresses
 7777     // wider than 32 bits in register-indirect addressing mode, since stub constants live in
 7778     // the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7779     // Targets are free to raise that limit, but a code cache larger than 2G is unrealistic
 7780     // in practice; on the flip side, the cap lets us avoid allocating a temporary register,
 7781     // which in the limiting case can prevent spilling in blocks with high register
 7782     // pressure.
 7783     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7784                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7785                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7786   %}
 7787   ins_pipe( pipe_slow );
 7788 %}
 7789 
 7790 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7791   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7792             is_integral_type(Matcher::vector_element_basic_type(n)));
 7793   match(Set dst (VectorCastF2X src));
 7794   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7795   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7796   ins_encode %{
 7797     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7798     if (to_elem_bt == T_LONG) {
 7799       int vlen_enc = vector_length_encoding(this);
 7800       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7801                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7802                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7803     } else {
 7804       int vlen_enc = vector_length_encoding(this, $src);
 7805       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7806                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7807                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7808     }
 7809   %}
 7810   ins_pipe( pipe_slow );
 7811 %}
 7812 
 7813 instruct vcastDtoF_reg(vec dst, vec src) %{
 7814   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7815   match(Set dst (VectorCastD2X src));
 7816   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7817   ins_encode %{
 7818     int vlen_enc = vector_length_encoding(this, $src);
 7819     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7820   %}
 7821   ins_pipe( pipe_slow );
 7822 %}
 7823 
 7824 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7825   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7826             is_integral_type(Matcher::vector_element_basic_type(n)));
 7827   match(Set dst (VectorCastD2X src));
 7828   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7829   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7830   ins_encode %{
 7831     int vlen_enc = vector_length_encoding(this, $src);
 7832     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7833     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7834                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7835                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7836   %}
 7837   ins_pipe( pipe_slow );
 7838 %}
 7839 
 7840 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7841   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7842             is_integral_type(Matcher::vector_element_basic_type(n)));
 7843   match(Set dst (VectorCastD2X src));
 7844   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7845   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7846   ins_encode %{
 7847     int vlen_enc = vector_length_encoding(this, $src);
 7848     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7849     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7850                               ExternalAddress(vector_float_signflip());
 7851     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7852                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7853   %}
 7854   ins_pipe( pipe_slow );
 7855 %}
 7856 
 7857 instruct vucast(vec dst, vec src) %{
 7858   match(Set dst (VectorUCastB2X src));
 7859   match(Set dst (VectorUCastS2X src));
 7860   match(Set dst (VectorUCastI2X src));
 7861   format %{ "vector_ucast $dst,$src\t!" %}
 7862   ins_encode %{
 7863     assert(UseAVX > 0, "required");
 7864 
 7865     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7866     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7867     int vlen_enc = vector_length_encoding(this);
 7868     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7869   %}
 7870   ins_pipe( pipe_slow );
 7871 %}
 7872 
 7873 #ifdef _LP64
 7874 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7875   predicate(!VM_Version::supports_avx512vl() &&
 7876             Matcher::vector_length_in_bytes(n) < 64 &&
 7877             Matcher::vector_element_basic_type(n) == T_INT);
 7878   match(Set dst (RoundVF src));
 7879   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7880   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7881   ins_encode %{
 7882     int vlen_enc = vector_length_encoding(this);
 7883     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7884     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7885                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7886                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7887   %}
 7888   ins_pipe( pipe_slow );
 7889 %}
 7890 
 7891 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7892   predicate((VM_Version::supports_avx512vl() ||
 7893              Matcher::vector_length_in_bytes(n) == 64) &&
 7894              Matcher::vector_element_basic_type(n) == T_INT);
 7895   match(Set dst (RoundVF src));
 7896   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7897   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7898   ins_encode %{
 7899     int vlen_enc = vector_length_encoding(this);
 7900     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7901     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7902                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7903                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7904   %}
 7905   ins_pipe( pipe_slow );
 7906 %}
 7907 
 7908 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7909   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7910   match(Set dst (RoundVD src));
 7911   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7912   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7913   ins_encode %{
 7914     int vlen_enc = vector_length_encoding(this);
 7915     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7916     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7917                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7918                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7919   %}
 7920   ins_pipe( pipe_slow );
 7921 %}
 7922 
 7923 #endif // _LP64
 7924 
 7925 // --------------------------------- VectorMaskCmp --------------------------------------
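      // Three result shapes are supported: a plain vector destination holding all-ones/all-zeros
      // lanes (legacy encodings), a 512-bit vector destination produced via a temporary kReg and a
      // ktmp-masked load of vector_all_bits_set(), and a predicate (k) register destination when
      // the node's type is a vectmask.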
 7926 
 7927 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7928   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7929             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7930             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7931             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7932   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7933   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7934   ins_encode %{
 7935     int vlen_enc = vector_length_encoding(this, $src1);
 7936     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7937     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7938       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7939     } else {
 7940       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7941     }
 7942   %}
 7943   ins_pipe( pipe_slow );
 7944 %}
 7945 
 7946 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7947   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7948             n->bottom_type()->isa_vectmask() == nullptr &&
 7949             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7950   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7951   effect(TEMP ktmp);
 7952   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7953   ins_encode %{
 7954     int vlen_enc = Assembler::AVX_512bit;
 7955     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7956     KRegister mask = k0; // The comparison itself is not being masked.
 7957     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7958       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7959       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7960     } else {
 7961       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7962       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7963     }
 7964   %}
 7965   ins_pipe( pipe_slow );
 7966 %}
 7967 
 7968 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7969   predicate(n->bottom_type()->isa_vectmask() &&
 7970             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7971   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7972   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7973   ins_encode %{
 7974     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7975     int vlen_enc = vector_length_encoding(this, $src1);
 7976     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7977     KRegister mask = k0; // The comparison itself is not being masked.
 7978     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7979       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7980     } else {
 7981       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7982     }
 7983   %}
 7984   ins_pipe( pipe_slow );
 7985 %}
 7986 
 7987 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7988   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7989             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7990             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7991             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7992             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7993             (n->in(2)->get_int() == BoolTest::eq ||
 7994              n->in(2)->get_int() == BoolTest::lt ||
 7995              n->in(2)->get_int() == BoolTest::gt)); // cond
 7996   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7997   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7998   ins_encode %{
 7999     int vlen_enc = vector_length_encoding(this, $src1);
 8000     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8001     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8002     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 8003   %}
 8004   ins_pipe( pipe_slow );
 8005 %}
 8006 
 8007 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8008   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8009             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8010             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8011             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8012             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8013             (n->in(2)->get_int() == BoolTest::ne ||
 8014              n->in(2)->get_int() == BoolTest::le ||
 8015              n->in(2)->get_int() == BoolTest::ge)); // cond
 8016   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8017   effect(TEMP dst, TEMP xtmp);
 8018   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8019   ins_encode %{
 8020     int vlen_enc = vector_length_encoding(this, $src1);
 8021     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8022     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8023     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8024   %}
 8025   ins_pipe( pipe_slow );
 8026 %}
 8027 
 8028 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8029   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8030             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8031             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8032             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8033             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8034   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8035   effect(TEMP dst, TEMP xtmp);
 8036   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8037   ins_encode %{
 8038     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8039     int vlen_enc = vector_length_encoding(this, $src1);
 8040     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8041     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8042 
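          // Emulate the unsigned compare by flipping the sign bit of both operands (broadcast the
          // per-element high-bit constant, then XOR) so the signed vpcmpCCW below orders the
          // values as if they were unsigned.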
 8043     if (vlen_enc == Assembler::AVX_128bit) {
 8044       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8045     } else {
 8046       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8047     }
 8048     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8049     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8050     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8051   %}
 8052   ins_pipe( pipe_slow );
 8053 %}
 8054 
 8055 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8056   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8057              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8058              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8059   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8060   effect(TEMP ktmp);
 8061   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8062   ins_encode %{
 8063     assert(UseAVX > 2, "required");
 8064 
 8065     int vlen_enc = vector_length_encoding(this, $src1);
 8066     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8067     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8068     KRegister mask = k0; // The comparison itself is not being masked.
 8069     bool merge = false;
 8070     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8071 
 8072     switch (src1_elem_bt) {
 8073       case T_INT: {
 8074         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8075         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8076         break;
 8077       }
 8078       case T_LONG: {
 8079         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8080         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8081         break;
 8082       }
 8083       default: assert(false, "%s", type2name(src1_elem_bt));
 8084     }
 8085   %}
 8086   ins_pipe( pipe_slow );
 8087 %}
 8088 
 8089 
 8090 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8091   predicate(n->bottom_type()->isa_vectmask() &&
 8092             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8093   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8094   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 8095   ins_encode %{
 8096     assert(UseAVX > 2, "required");
 8097     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8098 
 8099     int vlen_enc = vector_length_encoding(this, $src1);
 8100     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8101     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8102     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8103 
 8104     // Compare directly into the destination predicate register.
 8105     switch (src1_elem_bt) {
 8106       case T_BYTE: {
 8107         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8108         break;
 8109       }
 8110       case T_SHORT: {
 8111         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8112         break;
 8113       }
 8114       case T_INT: {
 8115         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8116         break;
 8117       }
 8118       case T_LONG: {
 8119         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8120         break;
 8121       }
 8122       default: assert(false, "%s", type2name(src1_elem_bt));
 8123     }
 8124   %}
 8125   ins_pipe( pipe_slow );
 8126 %}
 8127 
 8128 // Extract
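      // For vectors up to 16 bytes the element is read directly from the source register; for
      // 256/512-bit vectors the 128-bit lane containing the element is first copied into a
      // temporary register (get_lane) and the element is then read from that lane (get_elem).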
 8129 
 8130 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8131   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8132   match(Set dst (ExtractI src idx));
 8133   match(Set dst (ExtractS src idx));
 8134 #ifdef _LP64
 8135   match(Set dst (ExtractB src idx));
 8136 #endif
 8137   format %{ "extractI $dst,$src,$idx\t!" %}
 8138   ins_encode %{
 8139     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8140 
 8141     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8142     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8143   %}
 8144   ins_pipe( pipe_slow );
 8145 %}
 8146 
 8147 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8148   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8149             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8150   match(Set dst (ExtractI src idx));
 8151   match(Set dst (ExtractS src idx));
 8152 #ifdef _LP64
 8153   match(Set dst (ExtractB src idx));
 8154 #endif
 8155   effect(TEMP vtmp);
 8156   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8157   ins_encode %{
 8158     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8159 
 8160     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8161     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8162     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8163   %}
 8164   ins_pipe( pipe_slow );
 8165 %}
 8166 
 8167 #ifdef _LP64
 8168 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8169   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8170   match(Set dst (ExtractL src idx));
 8171   format %{ "extractL $dst,$src,$idx\t!" %}
 8172   ins_encode %{
 8173     assert(UseSSE >= 4, "required");
 8174     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8175 
 8176     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8177   %}
 8178   ins_pipe( pipe_slow );
 8179 %}
 8180 
 8181 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8182   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8183             Matcher::vector_length(n->in(1)) == 8);  // src
 8184   match(Set dst (ExtractL src idx));
 8185   effect(TEMP vtmp);
 8186   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8187   ins_encode %{
 8188     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8189 
 8190     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8191     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8192   %}
 8193   ins_pipe( pipe_slow );
 8194 %}
 8195 #endif
 8196 
 8197 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8198   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8199   match(Set dst (ExtractF src idx));
 8200   effect(TEMP dst, TEMP vtmp);
 8201   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8202   ins_encode %{
 8203     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8204 
 8205     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8206   %}
 8207   ins_pipe( pipe_slow );
 8208 %}
 8209 
 8210 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8211   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8212             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8213   match(Set dst (ExtractF src idx));
 8214   effect(TEMP vtmp);
 8215   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8216   ins_encode %{
 8217     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8218 
 8219     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8220     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8221   %}
 8222   ins_pipe( pipe_slow );
 8223 %}
 8224 
 8225 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8226   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8227   match(Set dst (ExtractD src idx));
 8228   format %{ "extractD $dst,$src,$idx\t!" %}
 8229   ins_encode %{
 8230     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8231 
 8232     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8233   %}
 8234   ins_pipe( pipe_slow );
 8235 %}
 8236 
 8237 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8238   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8239             Matcher::vector_length(n->in(1)) == 8);  // src
 8240   match(Set dst (ExtractD src idx));
 8241   effect(TEMP vtmp);
 8242   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8243   ins_encode %{
 8244     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8245 
 8246     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8247     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8248   %}
 8249   ins_pipe( pipe_slow );
 8250 %}
 8251 
 8252 // --------------------------------- Vector Blend --------------------------------------
 8253 
 8254 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8255   predicate(UseAVX == 0);
 8256   match(Set dst (VectorBlend (Binary dst src) mask));
 8257   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8258   effect(TEMP tmp);
 8259   ins_encode %{
 8260     assert(UseSSE >= 4, "required");
 8261 
 8262     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8263       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8264     }
 8265     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8266   %}
 8267   ins_pipe( pipe_slow );
 8268 %}
 8269 
 8270 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8271   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8272             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8273             Matcher::vector_length_in_bytes(n) <= 32 &&
 8274             is_integral_type(Matcher::vector_element_basic_type(n)));
 8275   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8276   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8277   ins_encode %{
 8278     int vlen_enc = vector_length_encoding(this);
 8279     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8280   %}
 8281   ins_pipe( pipe_slow );
 8282 %}
 8283 
 8284 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8285   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8286             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8287             Matcher::vector_length_in_bytes(n) <= 32 &&
 8288             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8289   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8290   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8291   ins_encode %{
 8292     int vlen_enc = vector_length_encoding(this);
 8293     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8294   %}
 8295   ins_pipe( pipe_slow );
 8296 %}
 8297 
 8298 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8299   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8300             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8301             Matcher::vector_length_in_bytes(n) <= 32);
 8302   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8303   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8304   effect(TEMP vtmp, TEMP dst);
 8305   ins_encode %{
 8306     int vlen_enc = vector_length_encoding(this);
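          // Bitwise select: dst = ($mask & $src2) | (~$mask & $src1). Used in place of vpblendvb
          // when EnableX86ECoreOpts is set.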
 8307     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8308     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8309     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8310   %}
 8311   ins_pipe( pipe_slow );
 8312 %}
 8313 
 8314 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8315   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8316             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8317   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8318   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8319   effect(TEMP ktmp);
 8320   ins_encode %{
 8321     int vlen_enc = Assembler::AVX_512bit;
 8322     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8323     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8324     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8325   %}
 8326   ins_pipe( pipe_slow );
 8327 %}
 8328 
 8329 
 8330 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8331   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8332             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8333              VM_Version::supports_avx512bw()));
 8334   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8335   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8336   ins_encode %{
 8337     int vlen_enc = vector_length_encoding(this);
 8338     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8339     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8340   %}
 8341   ins_pipe( pipe_slow );
 8342 %}
 8343 
 8344 // --------------------------------- ABS --------------------------------------
 8345 // a = |a|
 8346 instruct vabsB_reg(vec dst, vec src) %{
 8347   match(Set dst (AbsVB  src));
 8348   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8349   ins_encode %{
 8350     uint vlen = Matcher::vector_length(this);
 8351     if (vlen <= 16) {
 8352       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8353     } else {
 8354       int vlen_enc = vector_length_encoding(this);
 8355       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8356     }
 8357   %}
 8358   ins_pipe( pipe_slow );
 8359 %}
 8360 
 8361 instruct vabsS_reg(vec dst, vec src) %{
 8362   match(Set dst (AbsVS  src));
 8363   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8364   ins_encode %{
 8365     uint vlen = Matcher::vector_length(this);
 8366     if (vlen <= 8) {
 8367       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8368     } else {
 8369       int vlen_enc = vector_length_encoding(this);
 8370       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8371     }
 8372   %}
 8373   ins_pipe( pipe_slow );
 8374 %}
 8375 
 8376 instruct vabsI_reg(vec dst, vec src) %{
 8377   match(Set dst (AbsVI  src));
 8378   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8379   ins_encode %{
 8380     uint vlen = Matcher::vector_length(this);
 8381     if (vlen <= 4) {
 8382       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8383     } else {
 8384       int vlen_enc = vector_length_encoding(this);
 8385       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8386     }
 8387   %}
 8388   ins_pipe( pipe_slow );
 8389 %}
 8390 
 8391 instruct vabsL_reg(vec dst, vec src) %{
 8392   match(Set dst (AbsVL  src));
 8393   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8394   ins_encode %{
 8395     assert(UseAVX > 2, "required");
 8396     int vlen_enc = vector_length_encoding(this);
 8397     if (!VM_Version::supports_avx512vl()) {
 8398       vlen_enc = Assembler::AVX_512bit;
 8399     }
 8400     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8401   %}
 8402   ins_pipe( pipe_slow );
 8403 %}
 8404 
 8405 // --------------------------------- ABSNEG --------------------------------------
 8406 
 8407 instruct vabsnegF(vec dst, vec src) %{
 8408   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8409   match(Set dst (AbsVF src));
 8410   match(Set dst (NegVF src));
 8411   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8412   ins_cost(150);
 8413   ins_encode %{
 8414     int opcode = this->ideal_Opcode();
 8415     int vlen = Matcher::vector_length(this);
 8416     if (vlen == 2) {
 8417       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8418     } else {
 8419       assert(vlen == 8 || vlen == 16, "required");
 8420       int vlen_enc = vector_length_encoding(this);
 8421       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8422     }
 8423   %}
 8424   ins_pipe( pipe_slow );
 8425 %}
 8426 
 8427 instruct vabsneg4F(vec dst) %{
 8428   predicate(Matcher::vector_length(n) == 4);
 8429   match(Set dst (AbsVF dst));
 8430   match(Set dst (NegVF dst));
 8431   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8432   ins_cost(150);
 8433   ins_encode %{
 8434     int opcode = this->ideal_Opcode();
 8435     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8436   %}
 8437   ins_pipe( pipe_slow );
 8438 %}
 8439 
 8440 instruct vabsnegD(vec dst, vec src) %{
 8441   match(Set dst (AbsVD  src));
 8442   match(Set dst (NegVD  src));
 8443   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8444   ins_encode %{
 8445     int opcode = this->ideal_Opcode();
 8446     uint vlen = Matcher::vector_length(this);
 8447     if (vlen == 2) {
 8448       assert(UseSSE >= 2, "required");
 8449       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8450     } else {
 8451       int vlen_enc = vector_length_encoding(this);
 8452       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8453     }
 8454   %}
 8455   ins_pipe( pipe_slow );
 8456 %}
 8457 
 8458 //------------------------------------- VectorTest --------------------------------------------
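      // VectorTest only sets the condition flags. Vector operands go through vectortest; kReg
      // operands use kortest, except that masks of fewer than 8 elements (or 8 without AVX512DQ)
      // are first moved to a GPR and masked so that stale upper mask bits do not affect the flags.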
 8459 
 8460 #ifdef _LP64
 8461 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8462   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8463   match(Set cr (VectorTest src1 src2));
 8464   effect(TEMP vtmp);
 8465   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8466   ins_encode %{
 8467     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8468     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8469     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8470   %}
 8471   ins_pipe( pipe_slow );
 8472 %}
 8473 
 8474 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8475   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8476   match(Set cr (VectorTest src1 src2));
 8477   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8478   ins_encode %{
 8479     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8480     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8481     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8482   %}
 8483   ins_pipe( pipe_slow );
 8484 %}
 8485 
 8486 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8487   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8488              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8489             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8490   match(Set cr (VectorTest src1 src2));
 8491   effect(TEMP tmp);
 8492   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8493   ins_encode %{
 8494     uint masklen = Matcher::vector_length(this, $src1);
 8495     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8496     __ andl($tmp$$Register, (1 << masklen) - 1);
 8497     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8498   %}
 8499   ins_pipe( pipe_slow );
 8500 %}
 8501 
 8502 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8503   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8504              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8505             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8506   match(Set cr (VectorTest src1 src2));
 8507   effect(TEMP tmp);
 8508   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8509   ins_encode %{
 8510     uint masklen = Matcher::vector_length(this, $src1);
 8511     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8512     __ andl($tmp$$Register, (1 << masklen) - 1);
 8513   %}
 8514   ins_pipe( pipe_slow );
 8515 %}
 8516 
 8517 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8518   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8519             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8520   match(Set cr (VectorTest src1 src2));
 8521   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8522   ins_encode %{
 8523     uint masklen = Matcher::vector_length(this, $src1);
 8524     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8525   %}
 8526   ins_pipe( pipe_slow );
 8527 %}
 8528 #endif
 8529 
 8530 //------------------------------------- LoadMask --------------------------------------------
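      // VectorLoadMask expands a boolean byte vector (one 0/1 byte per element) either into a
      // vector mask with all-bits-set lanes or, when the node's type is a vectmask, into a
      // predicate (k) register.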
 8531 
 8532 instruct loadMask(legVec dst, legVec src) %{
 8533   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8534   match(Set dst (VectorLoadMask src));
 8535   effect(TEMP dst);
 8536   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8537   ins_encode %{
 8538     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8539     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8540     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8541   %}
 8542   ins_pipe( pipe_slow );
 8543 %}
 8544 
 8545 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8546   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8547   match(Set dst (VectorLoadMask src));
 8548   effect(TEMP xtmp);
 8549   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8550   ins_encode %{
 8551     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8552                         true, Assembler::AVX_512bit);
 8553   %}
 8554   ins_pipe( pipe_slow );
 8555 %}
 8556 
 8557 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8558   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8559   match(Set dst (VectorLoadMask src));
 8560   effect(TEMP xtmp);
 8561   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8562   ins_encode %{
 8563     int vlen_enc = vector_length_encoding(in(1));
 8564     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8565                         false, vlen_enc);
 8566   %}
 8567   ins_pipe( pipe_slow );
 8568 %}
 8569 
 8570 //------------------------------------- StoreMask --------------------------------------------
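      // VectorStoreMask narrows a vector mask (lanes of 0 or -1) to one byte per element and takes
      // the byte-wise absolute value, yielding a boolean byte vector of 0/1 values.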
 8571 
 8572 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8573   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8574   match(Set dst (VectorStoreMask src size));
 8575   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8576   ins_encode %{
 8577     int vlen = Matcher::vector_length(this);
 8578     if (vlen <= 16 && UseAVX <= 2) {
 8579       assert(UseSSE >= 3, "required");
 8580       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8581     } else {
 8582       assert(UseAVX > 0, "required");
 8583       int src_vlen_enc = vector_length_encoding(this, $src);
 8584       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8585     }
 8586   %}
 8587   ins_pipe( pipe_slow );
 8588 %}
 8589 
 8590 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8591   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8592   match(Set dst (VectorStoreMask src size));
 8593   effect(TEMP_DEF dst, TEMP xtmp);
 8594   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8595   ins_encode %{
 8596     int vlen_enc = Assembler::AVX_128bit;
 8597     int vlen = Matcher::vector_length(this);
 8598     if (vlen <= 8) {
 8599       assert(UseSSE >= 3, "required");
 8600       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8601       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8602       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8603     } else {
 8604       assert(UseAVX > 0, "required");
 8605       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8606       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8607       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8608     }
 8609   %}
 8610   ins_pipe( pipe_slow );
 8611 %}
 8612 
 8613 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8614   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8615   match(Set dst (VectorStoreMask src size));
 8616   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8617   effect(TEMP_DEF dst, TEMP xtmp);
 8618   ins_encode %{
 8619     int vlen_enc = Assembler::AVX_128bit;
 8620     int vlen = Matcher::vector_length(this);
 8621     if (vlen <= 4) {
 8622       assert(UseSSE >= 3, "required");
 8623       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8624       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8625       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8626       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8627     } else {
 8628       assert(UseAVX > 0, "required");
 8629       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8630       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8631       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8632       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8633       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8634     }
 8635   %}
 8636   ins_pipe( pipe_slow );
 8637 %}
 8638 
 8639 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8640   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8641   match(Set dst (VectorStoreMask src size));
 8642   effect(TEMP_DEF dst, TEMP xtmp);
 8643   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8644   ins_encode %{
 8645     assert(UseSSE >= 3, "required");
 8646     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8647     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8648     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8649     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8650     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8651   %}
 8652   ins_pipe( pipe_slow );
 8653 %}
 8654 
 8655 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8656   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8657   match(Set dst (VectorStoreMask src size));
 8658   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8659   effect(TEMP_DEF dst, TEMP vtmp);
 8660   ins_encode %{
 8661     int vlen_enc = Assembler::AVX_128bit;
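    // Gather the low dword of each 64-bit mask lane into the lower 128 bits,
    // then pack down to bytes and normalize 0/-1 lanes to 0/1.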
 8662     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8663     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8664     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8665     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8666     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8667     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8668     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8669   %}
 8670   ins_pipe( pipe_slow );
 8671 %}
 8672 
instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8674   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8675   match(Set dst (VectorStoreMask src size));
 8676   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8677   ins_encode %{
 8678     int src_vlen_enc = vector_length_encoding(this, $src);
 8679     int dst_vlen_enc = vector_length_encoding(this);
 8680     if (!VM_Version::supports_avx512vl()) {
 8681       src_vlen_enc = Assembler::AVX_512bit;
 8682     }
 8683     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8684     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8685   %}
 8686   ins_pipe( pipe_slow );
 8687 %}
 8688 
 8689 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8690   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8691   match(Set dst (VectorStoreMask src size));
 8692   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8693   ins_encode %{
 8694     int src_vlen_enc = vector_length_encoding(this, $src);
 8695     int dst_vlen_enc = vector_length_encoding(this);
 8696     if (!VM_Version::supports_avx512vl()) {
 8697       src_vlen_enc = Assembler::AVX_512bit;
 8698     }
 8699     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8700     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8701   %}
 8702   ins_pipe( pipe_slow );
 8703 %}
 8704 
 8705 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8706   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8707   match(Set dst (VectorStoreMask mask size));
 8708   effect(TEMP_DEF dst);
 8709   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8710   ins_encode %{
 8711     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
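    // Load the per-lane constant into the int lanes selected by $mask (zero-masking the rest),
    // then narrow the ints down to bytes.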
 8712     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8713                  false, Assembler::AVX_512bit, noreg);
 8714     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8715   %}
 8716   ins_pipe( pipe_slow );
 8717 %}
 8718 
 8719 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8720   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8721   match(Set dst (VectorStoreMask mask size));
 8722   effect(TEMP_DEF dst);
 8723   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8724   ins_encode %{
 8725     int dst_vlen_enc = vector_length_encoding(this);
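    // Expand each mask bit to an all-ones byte, then abs maps 0/-1 to 0/1.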
 8726     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8727     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8728   %}
 8729   ins_pipe( pipe_slow );
 8730 %}
 8731 
 8732 instruct vmaskcast_evex(kReg dst) %{
 8733   match(Set dst (VectorMaskCast dst));
 8734   ins_cost(0);
 8735   format %{ "vector_mask_cast $dst" %}
 8736   ins_encode %{
 8737     // empty
 8738   %}
 8739   ins_pipe(empty);
 8740 %}
 8741 
 8742 instruct vmaskcast(vec dst) %{
 8743   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8744   match(Set dst (VectorMaskCast dst));
 8745   ins_cost(0);
 8746   format %{ "vector_mask_cast $dst" %}
 8747   ins_encode %{
 8748     // empty
 8749   %}
 8750   ins_pipe(empty);
 8751 %}
 8752 
 8753 instruct vmaskcast_avx(vec dst, vec src) %{
 8754   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8755   match(Set dst (VectorMaskCast src));
 8756   format %{ "vector_mask_cast $dst, $src" %}
 8757   ins_encode %{
 8758     int vlen = Matcher::vector_length(this);
 8759     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8760     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8761     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8762   %}
 8763   ins_pipe(pipe_slow);
 8764 %}
 8765 
 8766 //-------------------------------- Load Iota Indices ----------------------------------
 8767 
 8768 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8769   match(Set dst (VectorLoadConst src));
 8770   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8771   ins_encode %{
 8772      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8773      BasicType bt = Matcher::vector_element_basic_type(this);
 8774      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8775   %}
 8776   ins_pipe( pipe_slow );
 8777 %}
 8778 
 8779 #ifdef _LP64
 8780 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8781   match(Set dst (PopulateIndex src1 src2));
 8782   effect(TEMP dst, TEMP vtmp);
 8783   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8784   ins_encode %{
 8785      assert($src2$$constant == 1, "required");
 8786      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8787      int vlen_enc = vector_length_encoding(this);
 8788      BasicType elem_bt = Matcher::vector_element_basic_type(this);
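     // dst[i] = src1 + i (the stride $src2 is asserted to be 1): broadcast src1, load iota indices, then add.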
 8789      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8790      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8791      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8797   match(Set dst (PopulateIndex src1 src2));
 8798   effect(TEMP dst, TEMP vtmp);
 8799   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8800   ins_encode %{
 8801      assert($src2$$constant == 1, "required");
 8802      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8803      int vlen_enc = vector_length_encoding(this);
 8804      BasicType elem_bt = Matcher::vector_element_basic_type(this);
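     // dst[i] = src1 + i (the stride $src2 is asserted to be 1): broadcast src1, load iota indices, then add.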
 8805      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8806      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8807      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8808   %}
 8809   ins_pipe( pipe_slow );
 8810 %}
 8811 #endif
 8812 //-------------------------------- Rearrange ----------------------------------
 8813 
 8814 // LoadShuffle/Rearrange for Byte
 8815 
 8816 instruct loadShuffleB(vec dst) %{
 8817   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8818   match(Set dst (VectorLoadShuffle dst));
 8819   format %{ "vector_load_shuffle $dst, $dst" %}
 8820   ins_encode %{
 8821     // empty
 8822   %}
 8823   ins_pipe( pipe_slow );
 8824 %}
 8825 
 8826 instruct rearrangeB(vec dst, vec shuffle) %{
 8827   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8828             Matcher::vector_length(n) < 32);
 8829   match(Set dst (VectorRearrange dst shuffle));
 8830   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8831   ins_encode %{
 8832     assert(UseSSE >= 4, "required");
 8833     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8834   %}
 8835   ins_pipe( pipe_slow );
 8836 %}
 8837 
 8838 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8839   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8840             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8841   match(Set dst (VectorRearrange src shuffle));
 8842   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8843   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8844   ins_encode %{
 8845     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8847     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
 8849     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
 8851     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that refer to the other lane
 8853     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8854     // Perform the blend
 8855     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8856   %}
 8857   ins_pipe( pipe_slow );
 8858 %}
 8859 
 8860 
 8861 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8862   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8863             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8864   match(Set dst (VectorRearrange src shuffle));
 8865   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8866   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8867   ins_encode %{
 8868     int vlen_enc = vector_length_encoding(this);
 8869     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8870                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8871                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8872   %}
 8873   ins_pipe( pipe_slow );
 8874 %}
 8875 
 8876 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8877   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8878             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8879   match(Set dst (VectorRearrange src shuffle));
 8880   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8881   ins_encode %{
 8882     int vlen_enc = vector_length_encoding(this);
 8883     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8884   %}
 8885   ins_pipe( pipe_slow );
 8886 %}
 8887 
 8888 // LoadShuffle/Rearrange for Short
 8889 
 8890 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8891   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8892             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8893   match(Set dst (VectorLoadShuffle src));
 8894   effect(TEMP dst, TEMP vtmp);
 8895   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8896   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8899     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8900     if (UseAVX == 0) {
 8901       assert(vlen_in_bytes <= 16, "required");
 8902       // Multiply each shuffle by two to get byte index
 8903       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8904       __ psllw($vtmp$$XMMRegister, 1);
 8905 
 8906       // Duplicate to create 2 copies of byte index
 8907       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8908       __ psllw($dst$$XMMRegister, 8);
 8909       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8910 
 8911       // Add one to get alternate byte index
 8912       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8913       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8914     } else {
 8915       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8916       int vlen_enc = vector_length_encoding(this);
 8917       // Multiply each shuffle by two to get byte index
 8918       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8919       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8920 
 8921       // Duplicate to create 2 copies of byte index
 8922       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8923       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8924 
 8925       // Add one to get alternate byte index
 8926       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8927     }
 8928   %}
 8929   ins_pipe( pipe_slow );
 8930 %}
 8931 
 8932 instruct rearrangeS(vec dst, vec shuffle) %{
 8933   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8934             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8935   match(Set dst (VectorRearrange dst shuffle));
 8936   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8937   ins_encode %{
 8938     assert(UseSSE >= 4, "required");
 8939     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8940   %}
 8941   ins_pipe( pipe_slow );
 8942 %}
 8943 
 8944 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8945   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8946             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8947   match(Set dst (VectorRearrange src shuffle));
 8948   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8949   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8950   ins_encode %{
 8951     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8953     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
 8955     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
 8957     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that refer to the other lane
 8959     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8960     // Perform the blend
 8961     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8962   %}
 8963   ins_pipe( pipe_slow );
 8964 %}
 8965 
 8966 instruct loadShuffleS_evex(vec dst, vec src) %{
 8967   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8968             VM_Version::supports_avx512bw());
 8969   match(Set dst (VectorLoadShuffle src));
 8970   format %{ "vector_load_shuffle $dst, $src" %}
 8971   ins_encode %{
 8972     int vlen_enc = vector_length_encoding(this);
 8973     if (!VM_Version::supports_avx512vl()) {
 8974       vlen_enc = Assembler::AVX_512bit;
 8975     }
 8976     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8977   %}
 8978   ins_pipe( pipe_slow );
 8979 %}
 8980 
 8981 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8982   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8983             VM_Version::supports_avx512bw());
 8984   match(Set dst (VectorRearrange src shuffle));
 8985   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8986   ins_encode %{
 8987     int vlen_enc = vector_length_encoding(this);
 8988     if (!VM_Version::supports_avx512vl()) {
 8989       vlen_enc = Assembler::AVX_512bit;
 8990     }
 8991     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8992   %}
 8993   ins_pipe( pipe_slow );
 8994 %}
 8995 
 8996 // LoadShuffle/Rearrange for Integer and Float
 8997 
 8998 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8999   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9000             Matcher::vector_length(n) == 4 && UseAVX == 0);
 9001   match(Set dst (VectorLoadShuffle src));
 9002   effect(TEMP dst, TEMP vtmp);
 9003   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9004   ins_encode %{
 9005     assert(UseSSE >= 4, "required");
 9006 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 9009 
 9010     // Duplicate and multiply each shuffle by 4
 9011     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 9012     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9013     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9014     __ psllw($vtmp$$XMMRegister, 2);
 9015 
 9016     // Duplicate again to create 4 copies of byte index
 9017     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 9018     __ psllw($dst$$XMMRegister, 8);
 9019     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 9020 
 9021     // Add 3,2,1,0 to get alternate byte index
 9022     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 9023     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 9024   %}
 9025   ins_pipe( pipe_slow );
 9026 %}
 9027 
 9028 instruct rearrangeI(vec dst, vec shuffle) %{
 9029   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9030             UseAVX == 0);
 9031   match(Set dst (VectorRearrange dst shuffle));
 9032   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9033   ins_encode %{
 9034     assert(UseSSE >= 4, "required");
 9035     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9036   %}
 9037   ins_pipe( pipe_slow );
 9038 %}
 9039 
 9040 instruct loadShuffleI_avx(vec dst, vec src) %{
 9041   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9042             UseAVX > 0);
 9043   match(Set dst (VectorLoadShuffle src));
 9044   format %{ "vector_load_shuffle $dst, $src" %}
 9045   ins_encode %{
 9046     int vlen_enc = vector_length_encoding(this);
 9047     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9048   %}
 9049   ins_pipe( pipe_slow );
 9050 %}
 9051 
 9052 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 9053   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9054             UseAVX > 0);
 9055   match(Set dst (VectorRearrange src shuffle));
 9056   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9057   ins_encode %{
 9058     int vlen_enc = vector_length_encoding(this);
 9059     BasicType bt = Matcher::vector_element_basic_type(this);
 9060     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9061   %}
 9062   ins_pipe( pipe_slow );
 9063 %}
 9064 
 9065 // LoadShuffle/Rearrange for Long and Double
 9066 
 9067 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9068   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9069             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9070   match(Set dst (VectorLoadShuffle src));
 9071   effect(TEMP dst, TEMP vtmp);
 9072   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9073   ins_encode %{
 9074     assert(UseAVX >= 2, "required");
 9075 
 9076     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
 9079 
 9080     // Multiply each shuffle by two to get double word index
 9081     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9082     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 9083 
 9084     // Duplicate each double word shuffle
 9085     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9086     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9087 
 9088     // Add one to get alternate double word index
 9089     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9090   %}
 9091   ins_pipe( pipe_slow );
 9092 %}
 9093 
 9094 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9095   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9096             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9097   match(Set dst (VectorRearrange src shuffle));
 9098   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9099   ins_encode %{
 9100     assert(UseAVX >= 2, "required");
 9101 
 9102     int vlen_enc = vector_length_encoding(this);
 9103     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9104   %}
 9105   ins_pipe( pipe_slow );
 9106 %}
 9107 
 9108 instruct loadShuffleL_evex(vec dst, vec src) %{
 9109   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9110             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9111   match(Set dst (VectorLoadShuffle src));
 9112   format %{ "vector_load_shuffle $dst, $src" %}
 9113   ins_encode %{
 9114     assert(UseAVX > 2, "required");
 9115 
 9116     int vlen_enc = vector_length_encoding(this);
 9117     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9118   %}
 9119   ins_pipe( pipe_slow );
 9120 %}
 9121 
 9122 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9123   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9124             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9125   match(Set dst (VectorRearrange src shuffle));
 9126   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9127   ins_encode %{
 9128     assert(UseAVX > 2, "required");
 9129 
 9130     int vlen_enc = vector_length_encoding(this);
 9131     if (vlen_enc == Assembler::AVX_128bit) {
 9132       vlen_enc = Assembler::AVX_256bit;
 9133     }
 9134     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9135   %}
 9136   ins_pipe( pipe_slow );
 9137 %}
 9138 
 9139 // --------------------------------- FMA --------------------------------------
 9140 // a * b + c
 9141 
 9142 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9143   match(Set c (FmaVF  c (Binary a b)));
 9144   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9145   ins_cost(150);
 9146   ins_encode %{
 9147     assert(UseFMA, "not enabled");
 9148     int vlen_enc = vector_length_encoding(this);
 9149     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9150   %}
 9151   ins_pipe( pipe_slow );
 9152 %}
 9153 
 9154 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9155   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9156   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9157   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9158   ins_cost(150);
 9159   ins_encode %{
 9160     assert(UseFMA, "not enabled");
 9161     int vlen_enc = vector_length_encoding(this);
 9162     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9163   %}
 9164   ins_pipe( pipe_slow );
 9165 %}
 9166 
 9167 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9168   match(Set c (FmaVD  c (Binary a b)));
 9169   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9170   ins_cost(150);
 9171   ins_encode %{
 9172     assert(UseFMA, "not enabled");
 9173     int vlen_enc = vector_length_encoding(this);
 9174     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9175   %}
 9176   ins_pipe( pipe_slow );
 9177 %}
 9178 
 9179 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9180   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9181   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9182   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9183   ins_cost(150);
 9184   ins_encode %{
 9185     assert(UseFMA, "not enabled");
 9186     int vlen_enc = vector_length_encoding(this);
 9187     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 // --------------------------------- Vector Multiply Add --------------------------------------
 9193 
 9194 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9195   predicate(UseAVX == 0);
 9196   match(Set dst (MulAddVS2VI dst src1));
 9197   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9198   ins_encode %{
 9199     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9200   %}
 9201   ins_pipe( pipe_slow );
 9202 %}
 9203 
 9204 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9205   predicate(UseAVX > 0);
 9206   match(Set dst (MulAddVS2VI src1 src2));
 9207   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9208   ins_encode %{
 9209     int vlen_enc = vector_length_encoding(this);
 9210     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9211   %}
 9212   ins_pipe( pipe_slow );
 9213 %}
 9214 
 9215 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9216 
 9217 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9218   predicate(VM_Version::supports_avx512_vnni());
 9219   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9220   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9221   ins_encode %{
 9222     assert(UseAVX > 2, "required");
 9223     int vlen_enc = vector_length_encoding(this);
 9224     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9225   %}
 9226   ins_pipe( pipe_slow );
 9227   ins_cost(10);
 9228 %}
 9229 
 9230 // --------------------------------- PopCount --------------------------------------
 9231 
 9232 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9233   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9234   match(Set dst (PopCountVI src));
 9235   match(Set dst (PopCountVL src));
 9236   format %{ "vector_popcount_integral $dst, $src" %}
 9237   ins_encode %{
 9238     int opcode = this->ideal_Opcode();
 9239     int vlen_enc = vector_length_encoding(this, $src);
 9240     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9241     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9242   %}
 9243   ins_pipe( pipe_slow );
 9244 %}
 9245 
 9246 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9247   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9248   match(Set dst (PopCountVI src mask));
 9249   match(Set dst (PopCountVL src mask));
 9250   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9251   ins_encode %{
 9252     int vlen_enc = vector_length_encoding(this, $src);
 9253     BasicType bt = Matcher::vector_element_basic_type(this, $src);
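    // Copy src into dst first so that lanes not selected by $mask keep their original values under merge masking.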
 9254     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9255     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9256   %}
 9257   ins_pipe( pipe_slow );
 9258 %}
 9259 
 9260 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9261   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9262   match(Set dst (PopCountVI src));
 9263   match(Set dst (PopCountVL src));
 9264   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9265   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9266   ins_encode %{
 9267     int opcode = this->ideal_Opcode();
 9268     int vlen_enc = vector_length_encoding(this, $src);
 9269     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9270     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9271                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9272   %}
 9273   ins_pipe( pipe_slow );
 9274 %}
 9275 
 9276 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9277 
 9278 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9279   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9280                                               Matcher::vector_length_in_bytes(n->in(1))));
 9281   match(Set dst (CountTrailingZerosV src));
 9282   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9283   ins_cost(400);
 9284   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9285   ins_encode %{
 9286     int vlen_enc = vector_length_encoding(this, $src);
 9287     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9288     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9289                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9290   %}
 9291   ins_pipe( pipe_slow );
 9292 %}
 9293 
 9294 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9295   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9296             VM_Version::supports_avx512cd() &&
 9297             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9298   match(Set dst (CountTrailingZerosV src));
 9299   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9300   ins_cost(400);
 9301   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9302   ins_encode %{
 9303     int vlen_enc = vector_length_encoding(this, $src);
 9304     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9305     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9306                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9307   %}
 9308   ins_pipe( pipe_slow );
 9309 %}
 9310 
 9311 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9312   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9313   match(Set dst (CountTrailingZerosV src));
 9314   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9315   ins_cost(400);
 9316   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9317   ins_encode %{
 9318     int vlen_enc = vector_length_encoding(this, $src);
 9319     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9320     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9321                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9322                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9323   %}
 9324   ins_pipe( pipe_slow );
 9325 %}
 9326 
 9327 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9328   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9329   match(Set dst (CountTrailingZerosV src));
 9330   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9331   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9332   ins_encode %{
 9333     int vlen_enc = vector_length_encoding(this, $src);
 9334     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9335     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9336                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9337   %}
 9338   ins_pipe( pipe_slow );
 9339 %}
 9340 
 9341 
 9342 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9343 
 9344 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9345   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9346   effect(TEMP dst);
 9347   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9348   ins_encode %{
 9349     int vector_len = vector_length_encoding(this);
 9350     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9351   %}
 9352   ins_pipe( pipe_slow );
 9353 %}
 9354 
 9355 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9356   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9357   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9358   effect(TEMP dst);
 9359   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9360   ins_encode %{
 9361     int vector_len = vector_length_encoding(this);
 9362     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9363   %}
 9364   ins_pipe( pipe_slow );
 9365 %}
 9366 
 9367 // --------------------------------- Rotation Operations ----------------------------------
 9368 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9369   match(Set dst (RotateLeftV src shift));
 9370   match(Set dst (RotateRightV src shift));
 9371   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9372   ins_encode %{
 9373     int opcode      = this->ideal_Opcode();
 9374     int vector_len  = vector_length_encoding(this);
 9375     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9376     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9377   %}
 9378   ins_pipe( pipe_slow );
 9379 %}
 9380 
instruct vprotate_var(vec dst, vec src, vec shift) %{
 9382   match(Set dst (RotateLeftV src shift));
 9383   match(Set dst (RotateRightV src shift));
 9384   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9385   ins_encode %{
 9386     int opcode      = this->ideal_Opcode();
 9387     int vector_len  = vector_length_encoding(this);
 9388     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9389     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9390   %}
 9391   ins_pipe( pipe_slow );
 9392 %}
 9393 
 9394 // ---------------------------------- Masked Operations ------------------------------------
 9395 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9396   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9397   match(Set dst (LoadVectorMasked mem mask));
 9398   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9399   ins_encode %{
 9400     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9401     int vlen_enc = vector_length_encoding(this);
 9402     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9403   %}
 9404   ins_pipe( pipe_slow );
 9405 %}
 9406 
 9407 
 9408 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9409   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9410   match(Set dst (LoadVectorMasked mem mask));
 9411   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9412   ins_encode %{
 9413     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9414     int vector_len = vector_length_encoding(this);
 9415     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9416   %}
 9417   ins_pipe( pipe_slow );
 9418 %}
 9419 
 9420 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9421   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9422   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9423   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9424   ins_encode %{
 9425     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9426     int vlen_enc = vector_length_encoding(src_node);
 9427     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9428     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9429   %}
 9430   ins_pipe( pipe_slow );
 9431 %}
 9432 
 9433 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9434   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9435   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9436   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9437   ins_encode %{
 9438     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9439     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9440     int vlen_enc = vector_length_encoding(src_node);
 9441     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9442   %}
 9443   ins_pipe( pipe_slow );
 9444 %}
 9445 
 9446 #ifdef _LP64
 9447 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9448   match(Set addr (VerifyVectorAlignment addr mask));
 9449   effect(KILL cr);
 9450   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9451   ins_encode %{
 9452     Label Lskip;
 9453     // check if masked bits of addr are zero
 9454     __ testq($addr$$Register, $mask$$constant);
 9455     __ jccb(Assembler::equal, Lskip);
 9456     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9457     __ bind(Lskip);
 9458   %}
 9459   ins_pipe(pipe_slow);
 9460 %}
 9461 
 9462 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9463   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9464   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9465   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9466   ins_encode %{
 9467     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9468     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9469 
 9470     Label DONE;
 9471     int vlen_enc = vector_length_encoding(this, $src1);
 9472     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
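    // ktmp1 = lanes under $mask where src1 == src2, ktmp2 = lanes outside $mask.
    // Their OR is all ones (kortest sets CF) iff every masked lane matched, in which
    // case the result stays -1; otherwise return the index of the first mismatch.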
 9473 
 9474     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9475     __ mov64($dst$$Register, -1L);
 9476     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9477     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9478     __ jccb(Assembler::carrySet, DONE);
 9479     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9480     __ notq($dst$$Register);
 9481     __ tzcntq($dst$$Register, $dst$$Register);
 9482     __ bind(DONE);
 9483   %}
 9484   ins_pipe( pipe_slow );
 9485 %}
 9486 
 9487 
 9488 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9489   match(Set dst (VectorMaskGen len));
 9490   effect(TEMP temp, KILL cr);
 9491   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9492   ins_encode %{
 9493     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9494   %}
 9495   ins_pipe( pipe_slow );
 9496 %}
 9497 
 9498 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9499   match(Set dst (VectorMaskGen len));
 9500   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9501   effect(TEMP temp);
 9502   ins_encode %{
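    // Set the low $len bits in a GPR and move them into the mask register.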
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9504     __ kmovql($dst$$KRegister, $temp$$Register);
 9505   %}
 9506   ins_pipe( pipe_slow );
 9507 %}
 9508 
 9509 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9510   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9511   match(Set dst (VectorMaskToLong mask));
 9512   effect(TEMP dst, KILL cr);
 9513   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9514   ins_encode %{
 9515     int opcode = this->ideal_Opcode();
 9516     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9517     int mask_len = Matcher::vector_length(this, $mask);
 9518     int mask_size = mask_len * type2aelembytes(mbt);
 9519     int vlen_enc = vector_length_encoding(this, $mask);
 9520     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9521                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9522   %}
 9523   ins_pipe( pipe_slow );
 9524 %}
 9525 
 9526 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9527   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9528   match(Set dst (VectorMaskToLong mask));
 9529   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9530   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9531   ins_encode %{
 9532     int opcode = this->ideal_Opcode();
 9533     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9534     int mask_len = Matcher::vector_length(this, $mask);
 9535     int vlen_enc = vector_length_encoding(this, $mask);
 9536     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9537                              $dst$$Register, mask_len, mbt, vlen_enc);
 9538   %}
 9539   ins_pipe( pipe_slow );
 9540 %}
 9541 
 9542 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9543   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9544   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9545   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9546   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9547   ins_encode %{
 9548     int opcode = this->ideal_Opcode();
 9549     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9550     int mask_len = Matcher::vector_length(this, $mask);
 9551     int vlen_enc = vector_length_encoding(this, $mask);
 9552     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9553                              $dst$$Register, mask_len, mbt, vlen_enc);
 9554   %}
 9555   ins_pipe( pipe_slow );
 9556 %}
 9557 
 9558 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9559   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9560   match(Set dst (VectorMaskTrueCount mask));
 9561   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9562   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9563   ins_encode %{
 9564     int opcode = this->ideal_Opcode();
 9565     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9566     int mask_len = Matcher::vector_length(this, $mask);
 9567     int mask_size = mask_len * type2aelembytes(mbt);
 9568     int vlen_enc = vector_length_encoding(this, $mask);
 9569     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9570                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9571   %}
 9572   ins_pipe( pipe_slow );
 9573 %}
 9574 
 9575 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9576   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9577   match(Set dst (VectorMaskTrueCount mask));
 9578   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9579   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9580   ins_encode %{
 9581     int opcode = this->ideal_Opcode();
 9582     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9583     int mask_len = Matcher::vector_length(this, $mask);
 9584     int vlen_enc = vector_length_encoding(this, $mask);
 9585     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9586                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9587   %}
 9588   ins_pipe( pipe_slow );
 9589 %}
 9590 
 9591 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9592   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9593   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9594   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9595   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9596   ins_encode %{
 9597     int opcode = this->ideal_Opcode();
 9598     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9599     int mask_len = Matcher::vector_length(this, $mask);
 9600     int vlen_enc = vector_length_encoding(this, $mask);
 9601     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9602                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9603   %}
 9604   ins_pipe( pipe_slow );
 9605 %}
 9606 
 9607 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9608   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9609   match(Set dst (VectorMaskFirstTrue mask));
 9610   match(Set dst (VectorMaskLastTrue mask));
 9611   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9612   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9613   ins_encode %{
 9614     int opcode = this->ideal_Opcode();
 9615     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9616     int mask_len = Matcher::vector_length(this, $mask);
 9617     int mask_size = mask_len * type2aelembytes(mbt);
 9618     int vlen_enc = vector_length_encoding(this, $mask);
 9619     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9620                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9621   %}
 9622   ins_pipe( pipe_slow );
 9623 %}
 9624 
 9625 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9626   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9627   match(Set dst (VectorMaskFirstTrue mask));
 9628   match(Set dst (VectorMaskLastTrue mask));
 9629   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9630   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9631   ins_encode %{
 9632     int opcode = this->ideal_Opcode();
 9633     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9634     int mask_len = Matcher::vector_length(this, $mask);
 9635     int vlen_enc = vector_length_encoding(this, $mask);
 9636     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9637                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9638   %}
 9639   ins_pipe( pipe_slow );
 9640 %}
 9641 
 9642 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9643   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9644   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9645   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9646   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9647   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9648   ins_encode %{
 9649     int opcode = this->ideal_Opcode();
 9650     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9651     int mask_len = Matcher::vector_length(this, $mask);
 9652     int vlen_enc = vector_length_encoding(this, $mask);
 9653     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9654                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9655   %}
 9656   ins_pipe( pipe_slow );
 9657 %}
 9658 
 9659 // --------------------------------- Compress/Expand Operations ---------------------------
 9660 #ifdef _LP64
 9661 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9662   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9663   match(Set dst (CompressV src mask));
 9664   match(Set dst (ExpandV src mask));
 9665   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9666   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9667   ins_encode %{
 9668     int opcode = this->ideal_Opcode();
 9669     int vlen_enc = vector_length_encoding(this);
 9670     BasicType bt  = Matcher::vector_element_basic_type(this);
 9671     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9672                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9673   %}
 9674   ins_pipe( pipe_slow );
 9675 %}
 9676 #endif
 9677 
 9678 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9679   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9680   match(Set dst (CompressV src mask));
 9681   match(Set dst (ExpandV src mask));
 9682   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9683   ins_encode %{
 9684     int opcode = this->ideal_Opcode();
 9685     int vector_len = vector_length_encoding(this);
 9686     BasicType bt  = Matcher::vector_element_basic_type(this);
 9687     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9688   %}
 9689   ins_pipe( pipe_slow );
 9690 %}
 9691 
 9692 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9693   match(Set dst (CompressM mask));
 9694   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9695   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9696   ins_encode %{
 9697     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9698     int mask_len = Matcher::vector_length(this);
 9699     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9700   %}
 9701   ins_pipe( pipe_slow );
 9702 %}
 9703 
 9704 #endif // _LP64
 9705 
 9706 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9707 
 9708 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9709   predicate(!VM_Version::supports_gfni());
 9710   match(Set dst (ReverseV src));
 9711   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9712   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9713   ins_encode %{
 9714     int vec_enc = vector_length_encoding(this);
 9715     BasicType bt = Matcher::vector_element_basic_type(this);
 9716     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9717                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9718   %}
 9719   ins_pipe( pipe_slow );
 9720 %}
 9721 
 9722 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9723   predicate(VM_Version::supports_gfni());
 9724   match(Set dst (ReverseV src));
 9725   effect(TEMP dst, TEMP xtmp);
 9726   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9727   ins_encode %{
 9728     int vec_enc = vector_length_encoding(this);
 9729     BasicType bt  = Matcher::vector_element_basic_type(this);
 9730     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9731     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9732                                $xtmp$$XMMRegister);
 9733   %}
 9734   ins_pipe( pipe_slow );
 9735 %}
 9736 
 9737 instruct vreverse_byte_reg(vec dst, vec src) %{
 9738   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9739   match(Set dst (ReverseBytesV src));
 9740   effect(TEMP dst);
 9741   format %{ "vector_reverse_byte $dst, $src" %}
 9742   ins_encode %{
 9743     int vec_enc = vector_length_encoding(this);
 9744     BasicType bt = Matcher::vector_element_basic_type(this);
 9745     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9746   %}
 9747   ins_pipe( pipe_slow );
 9748 %}
 9749 
 9750 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9751   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9752   match(Set dst (ReverseBytesV src));
 9753   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9754   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9755   ins_encode %{
 9756     int vec_enc = vector_length_encoding(this);
 9757     BasicType bt = Matcher::vector_element_basic_type(this);
 9758     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9759                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9760   %}
 9761   ins_pipe( pipe_slow );
 9762 %}
 9763 
 9764 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9765 
 9766 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9767   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9768                                               Matcher::vector_length_in_bytes(n->in(1))));
 9769   match(Set dst (CountLeadingZerosV src));
 9770   format %{ "vector_count_leading_zeros $dst, $src" %}
 9771   ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                       xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9776   %}
 9777   ins_pipe( pipe_slow );
 9778 %}
 9779 
 9780 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9781   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9782                                               Matcher::vector_length_in_bytes(n->in(1))));
 9783   match(Set dst (CountLeadingZerosV src mask));
 9784   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9785   ins_encode %{
 9786     int vlen_enc = vector_length_encoding(this, $src);
 9787     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9788     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9789     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9790                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9791   %}
 9792   ins_pipe( pipe_slow );
 9793 %}
 9794 
 9795 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9796   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9797             VM_Version::supports_avx512cd() &&
 9798             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9799   match(Set dst (CountLeadingZerosV src));
 9800   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9801   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9802   ins_encode %{
 9803     int vlen_enc = vector_length_encoding(this, $src);
 9804     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9805     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9806                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9807   %}
 9808   ins_pipe( pipe_slow );
 9809 %}
 9810 
 9811 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9812   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9813   match(Set dst (CountLeadingZerosV src));
 9814   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9815   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9816   ins_encode %{
 9817     int vlen_enc = vector_length_encoding(this, $src);
 9818     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9819     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9820                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9821                                        $rtmp$$Register, true, vlen_enc);
 9822   %}
 9823   ins_pipe( pipe_slow );
 9824 %}
 9825 
 9826 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9827   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9828             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9829   match(Set dst (CountLeadingZerosV src));
 9830   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9831   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9832   ins_encode %{
 9833     int vlen_enc = vector_length_encoding(this, $src);
 9834     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9835     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9836                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9837   %}
 9838   ins_pipe( pipe_slow );
 9839 %}
 9840 
 9841 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9842   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9843             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9844   match(Set dst (CountLeadingZerosV src));
 9845   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9846   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9847   ins_encode %{
 9848     int vlen_enc = vector_length_encoding(this, $src);
 9849     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9850     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9851                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9852   %}
 9853   ins_pipe( pipe_slow );
 9854 %}
 9855 
 9856 // ---------------------------------- Vector Masked Operations ------------------------------------
 9857 
 9858 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9859   match(Set dst (AddVB (Binary dst src2) mask));
 9860   match(Set dst (AddVS (Binary dst src2) mask));
 9861   match(Set dst (AddVI (Binary dst src2) mask));
 9862   match(Set dst (AddVL (Binary dst src2) mask));
 9863   match(Set dst (AddVF (Binary dst src2) mask));
 9864   match(Set dst (AddVD (Binary dst src2) mask));
 9865   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9866   ins_encode %{
 9867     int vlen_enc = vector_length_encoding(this);
 9868     BasicType bt = Matcher::vector_element_basic_type(this);
 9869     int opc = this->ideal_Opcode();
 9870     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9871                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9872   %}
 9873   ins_pipe( pipe_slow );
 9874 %}
 9875 
 9876 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9877   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9878   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9879   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9880   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9881   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9882   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9883   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9884   ins_encode %{
 9885     int vlen_enc = vector_length_encoding(this);
 9886     BasicType bt = Matcher::vector_element_basic_type(this);
 9887     int opc = this->ideal_Opcode();
 9888     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9889                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9890   %}
 9891   ins_pipe( pipe_slow );
 9892 %}
 9893 
 9894 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9895   match(Set dst (XorV (Binary dst src2) mask));
 9896   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9897   ins_encode %{
 9898     int vlen_enc = vector_length_encoding(this);
 9899     BasicType bt = Matcher::vector_element_basic_type(this);
 9900     int opc = this->ideal_Opcode();
 9901     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9902                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9903   %}
 9904   ins_pipe( pipe_slow );
 9905 %}
 9906 
 9907 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9908   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9909   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9910   ins_encode %{
 9911     int vlen_enc = vector_length_encoding(this);
 9912     BasicType bt = Matcher::vector_element_basic_type(this);
 9913     int opc = this->ideal_Opcode();
 9914     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9915                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9916   %}
 9917   ins_pipe( pipe_slow );
 9918 %}
 9919 
 9920 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9921   match(Set dst (OrV (Binary dst src2) mask));
 9922   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9923   ins_encode %{
 9924     int vlen_enc = vector_length_encoding(this);
 9925     BasicType bt = Matcher::vector_element_basic_type(this);
 9926     int opc = this->ideal_Opcode();
 9927     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9928                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9929   %}
 9930   ins_pipe( pipe_slow );
 9931 %}
 9932 
 9933 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9934   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9935   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9936   ins_encode %{
 9937     int vlen_enc = vector_length_encoding(this);
 9938     BasicType bt = Matcher::vector_element_basic_type(this);
 9939     int opc = this->ideal_Opcode();
 9940     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9941                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9942   %}
 9943   ins_pipe( pipe_slow );
 9944 %}
 9945 
 9946 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9947   match(Set dst (AndV (Binary dst src2) mask));
 9948   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9949   ins_encode %{
 9950     int vlen_enc = vector_length_encoding(this);
 9951     BasicType bt = Matcher::vector_element_basic_type(this);
 9952     int opc = this->ideal_Opcode();
 9953     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9954                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9955   %}
 9956   ins_pipe( pipe_slow );
 9957 %}
 9958 
 9959 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9960   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9961   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9962   ins_encode %{
 9963     int vlen_enc = vector_length_encoding(this);
 9964     BasicType bt = Matcher::vector_element_basic_type(this);
 9965     int opc = this->ideal_Opcode();
 9966     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9967                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9968   %}
 9969   ins_pipe( pipe_slow );
 9970 %}
 9971 
 9972 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9973   match(Set dst (SubVB (Binary dst src2) mask));
 9974   match(Set dst (SubVS (Binary dst src2) mask));
 9975   match(Set dst (SubVI (Binary dst src2) mask));
 9976   match(Set dst (SubVL (Binary dst src2) mask));
 9977   match(Set dst (SubVF (Binary dst src2) mask));
 9978   match(Set dst (SubVD (Binary dst src2) mask));
 9979   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9980   ins_encode %{
 9981     int vlen_enc = vector_length_encoding(this);
 9982     BasicType bt = Matcher::vector_element_basic_type(this);
 9983     int opc = this->ideal_Opcode();
 9984     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9985                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9986   %}
 9987   ins_pipe( pipe_slow );
 9988 %}
 9989 
 9990 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9991   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9992   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9993   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9994   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9995   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9996   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9997   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9998   ins_encode %{
 9999     int vlen_enc = vector_length_encoding(this);
10000     BasicType bt = Matcher::vector_element_basic_type(this);
10001     int opc = this->ideal_Opcode();
10002     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10003                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10004   %}
10005   ins_pipe( pipe_slow );
10006 %}
10007 
10008 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
10009   match(Set dst (MulVS (Binary dst src2) mask));
10010   match(Set dst (MulVI (Binary dst src2) mask));
10011   match(Set dst (MulVL (Binary dst src2) mask));
10012   match(Set dst (MulVF (Binary dst src2) mask));
10013   match(Set dst (MulVD (Binary dst src2) mask));
10014   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
10015   ins_encode %{
10016     int vlen_enc = vector_length_encoding(this);
10017     BasicType bt = Matcher::vector_element_basic_type(this);
10018     int opc = this->ideal_Opcode();
10019     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10020                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10021   %}
10022   ins_pipe( pipe_slow );
10023 %}
10024 
10025 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
10026   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
10027   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
10028   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
10029   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
10030   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
10031   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
10032   ins_encode %{
10033     int vlen_enc = vector_length_encoding(this);
10034     BasicType bt = Matcher::vector_element_basic_type(this);
10035     int opc = this->ideal_Opcode();
10036     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10037                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10038   %}
10039   ins_pipe( pipe_slow );
10040 %}
10041 
10042 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
10043   match(Set dst (SqrtVF dst mask));
10044   match(Set dst (SqrtVD dst mask));
10045   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
10046   ins_encode %{
10047     int vlen_enc = vector_length_encoding(this);
10048     BasicType bt = Matcher::vector_element_basic_type(this);
10049     int opc = this->ideal_Opcode();
10050     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10051                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10052   %}
10053   ins_pipe( pipe_slow );
10054 %}
10055 
10056 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
10057   match(Set dst (DivVF (Binary dst src2) mask));
10058   match(Set dst (DivVD (Binary dst src2) mask));
10059   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10060   ins_encode %{
10061     int vlen_enc = vector_length_encoding(this);
10062     BasicType bt = Matcher::vector_element_basic_type(this);
10063     int opc = this->ideal_Opcode();
10064     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10065                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
10071   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
10072   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
10073   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10074   ins_encode %{
10075     int vlen_enc = vector_length_encoding(this);
10076     BasicType bt = Matcher::vector_element_basic_type(this);
10077     int opc = this->ideal_Opcode();
10078     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10079                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10080   %}
10081   ins_pipe( pipe_slow );
10082 %}
10083 
10084 
10085 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10086   match(Set dst (RotateLeftV (Binary dst shift) mask));
10087   match(Set dst (RotateRightV (Binary dst shift) mask));
10088   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10089   ins_encode %{
10090     int vlen_enc = vector_length_encoding(this);
10091     BasicType bt = Matcher::vector_element_basic_type(this);
10092     int opc = this->ideal_Opcode();
10093     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10094                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10095   %}
10096   ins_pipe( pipe_slow );
10097 %}
10098 
10099 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10100   match(Set dst (RotateLeftV (Binary dst src2) mask));
10101   match(Set dst (RotateRightV (Binary dst src2) mask));
10102   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10103   ins_encode %{
10104     int vlen_enc = vector_length_encoding(this);
10105     BasicType bt = Matcher::vector_element_basic_type(this);
10106     int opc = this->ideal_Opcode();
10107     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10108                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10109   %}
10110   ins_pipe( pipe_slow );
10111 %}
10112 
10113 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10114   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10115   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10116   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10117   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10118   ins_encode %{
10119     int vlen_enc = vector_length_encoding(this);
10120     BasicType bt = Matcher::vector_element_basic_type(this);
10121     int opc = this->ideal_Opcode();
10122     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10123                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
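// Masked shifts with a register count come in two flavours: is_var_shift() separates a genuinely
// per-lane count vector (lowered to the VPSLLV/VPSRLV/VPSRAV family) from a broadcast scalar count,
// and the extra boolean forwarded to evmasked_op() is what selects between the two encodings (the
// helper's signature is authoritative for its exact meaning). Immediate counts are handled by the
// *_imm_masked rules.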
10128 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10129   predicate(!n->as_ShiftV()->is_var_shift());
10130   match(Set dst (LShiftVS (Binary dst src2) mask));
10131   match(Set dst (LShiftVI (Binary dst src2) mask));
10132   match(Set dst (LShiftVL (Binary dst src2) mask));
10133   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10134   ins_encode %{
10135     int vlen_enc = vector_length_encoding(this);
10136     BasicType bt = Matcher::vector_element_basic_type(this);
10137     int opc = this->ideal_Opcode();
10138     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10139                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10140   %}
10141   ins_pipe( pipe_slow );
10142 %}
10143 
10144 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10145   predicate(n->as_ShiftV()->is_var_shift());
10146   match(Set dst (LShiftVS (Binary dst src2) mask));
10147   match(Set dst (LShiftVI (Binary dst src2) mask));
10148   match(Set dst (LShiftVL (Binary dst src2) mask));
10149   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10150   ins_encode %{
10151     int vlen_enc = vector_length_encoding(this);
10152     BasicType bt = Matcher::vector_element_basic_type(this);
10153     int opc = this->ideal_Opcode();
10154     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10155                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10156   %}
10157   ins_pipe( pipe_slow );
10158 %}
10159 
10160 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10161   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10162   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10163   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10164   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10165   ins_encode %{
10166     int vlen_enc = vector_length_encoding(this);
10167     BasicType bt = Matcher::vector_element_basic_type(this);
10168     int opc = this->ideal_Opcode();
10169     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10170                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10171   %}
10172   ins_pipe( pipe_slow );
10173 %}
10174 
10175 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10176   predicate(!n->as_ShiftV()->is_var_shift());
10177   match(Set dst (RShiftVS (Binary dst src2) mask));
10178   match(Set dst (RShiftVI (Binary dst src2) mask));
10179   match(Set dst (RShiftVL (Binary dst src2) mask));
10180   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10181   ins_encode %{
10182     int vlen_enc = vector_length_encoding(this);
10183     BasicType bt = Matcher::vector_element_basic_type(this);
10184     int opc = this->ideal_Opcode();
10185     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10186                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10187   %}
10188   ins_pipe( pipe_slow );
10189 %}
10190 
10191 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10192   predicate(n->as_ShiftV()->is_var_shift());
10193   match(Set dst (RShiftVS (Binary dst src2) mask));
10194   match(Set dst (RShiftVI (Binary dst src2) mask));
10195   match(Set dst (RShiftVL (Binary dst src2) mask));
10196   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10197   ins_encode %{
10198     int vlen_enc = vector_length_encoding(this);
10199     BasicType bt = Matcher::vector_element_basic_type(this);
10200     int opc = this->ideal_Opcode();
10201     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10202                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10203   %}
10204   ins_pipe( pipe_slow );
10205 %}
10206 
10207 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10208   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10209   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10210   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10211   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10212   ins_encode %{
10213     int vlen_enc = vector_length_encoding(this);
10214     BasicType bt = Matcher::vector_element_basic_type(this);
10215     int opc = this->ideal_Opcode();
10216     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10217                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10218   %}
10219   ins_pipe( pipe_slow );
10220 %}
10221 
10222 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10223   predicate(!n->as_ShiftV()->is_var_shift());
10224   match(Set dst (URShiftVS (Binary dst src2) mask));
10225   match(Set dst (URShiftVI (Binary dst src2) mask));
10226   match(Set dst (URShiftVL (Binary dst src2) mask));
10227   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10228   ins_encode %{
10229     int vlen_enc = vector_length_encoding(this);
10230     BasicType bt = Matcher::vector_element_basic_type(this);
10231     int opc = this->ideal_Opcode();
10232     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10233                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 
10238 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10239   predicate(n->as_ShiftV()->is_var_shift());
10240   match(Set dst (URShiftVS (Binary dst src2) mask));
10241   match(Set dst (URShiftVI (Binary dst src2) mask));
10242   match(Set dst (URShiftVL (Binary dst src2) mask));
10243   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10244   ins_encode %{
10245     int vlen_enc = vector_length_encoding(this);
10246     BasicType bt = Matcher::vector_element_basic_type(this);
10247     int opc = this->ideal_Opcode();
10248     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10249                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10250   %}
10251   ins_pipe( pipe_slow );
10252 %}
10253 
10254 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10255   match(Set dst (MaxV (Binary dst src2) mask));
10256   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10257   ins_encode %{
10258     int vlen_enc = vector_length_encoding(this);
10259     BasicType bt = Matcher::vector_element_basic_type(this);
10260     int opc = this->ideal_Opcode();
10261     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10262                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10263   %}
10264   ins_pipe( pipe_slow );
10265 %}
10266 
10267 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10268   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10269   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10270   ins_encode %{
10271     int vlen_enc = vector_length_encoding(this);
10272     BasicType bt = Matcher::vector_element_basic_type(this);
10273     int opc = this->ideal_Opcode();
10274     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10275                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10276   %}
10277   ins_pipe( pipe_slow );
10278 %}
10279 
10280 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10281   match(Set dst (MinV (Binary dst src2) mask));
10282   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10283   ins_encode %{
10284     int vlen_enc = vector_length_encoding(this);
10285     BasicType bt = Matcher::vector_element_basic_type(this);
10286     int opc = this->ideal_Opcode();
10287     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10288                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10289   %}
10290   ins_pipe( pipe_slow );
10291 %}
10292 
10293 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10294   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10295   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10296   ins_encode %{
10297     int vlen_enc = vector_length_encoding(this);
10298     BasicType bt = Matcher::vector_element_basic_type(this);
10299     int opc = this->ideal_Opcode();
10300     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10301                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10302   %}
10303   ins_pipe( pipe_slow );
10304 %}
10305 
10306 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10307   match(Set dst (VectorRearrange (Binary dst src2) mask));
10308   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10309   ins_encode %{
10310     int vlen_enc = vector_length_encoding(this);
10311     BasicType bt = Matcher::vector_element_basic_type(this);
10312     int opc = this->ideal_Opcode();
10313     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10314                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10315   %}
10316   ins_pipe( pipe_slow );
10317 %}
10318 
10319 instruct vabs_masked(vec dst, kReg mask) %{
10320   match(Set dst (AbsVB dst mask));
10321   match(Set dst (AbsVS dst mask));
10322   match(Set dst (AbsVI dst mask));
10323   match(Set dst (AbsVL dst mask));
10324   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10325   ins_encode %{
10326     int vlen_enc = vector_length_encoding(this);
10327     BasicType bt = Matcher::vector_element_basic_type(this);
10328     int opc = this->ideal_Opcode();
10329     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10330                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10331   %}
10332   ins_pipe( pipe_slow );
10333 %}
10334 
10335 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10336   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10337   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10338   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10339   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
10341     int vlen_enc = vector_length_encoding(this);
10342     BasicType bt = Matcher::vector_element_basic_type(this);
10343     int opc = this->ideal_Opcode();
10344     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10345                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10346   %}
10347   ins_pipe( pipe_slow );
10348 %}
10349 
10350 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10351   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10352   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10353   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10354   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
10356     int vlen_enc = vector_length_encoding(this);
10357     BasicType bt = Matcher::vector_element_basic_type(this);
10358     int opc = this->ideal_Opcode();
10359     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10360                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10361   %}
10362   ins_pipe( pipe_slow );
10363 %}
10364 
10365 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10366   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10367   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10368   ins_encode %{
10369     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10370     int vlen_enc = vector_length_encoding(this, $src1);
10371     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10372 
    // Dispatch the comparison on the element type of src1.
10374     switch (src1_elem_bt) {
10375       case T_BYTE: {
10376         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10377         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10378         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10379         break;
10380       }
10381       case T_SHORT: {
10382         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10383         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10384         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10385         break;
10386       }
10387       case T_INT: {
10388         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10389         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10390         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10391         break;
10392       }
10393       case T_LONG: {
10394         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10395         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10396         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10397         break;
10398       }
10399       case T_FLOAT: {
10400         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10401         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10402         break;
10403       }
10404       case T_DOUBLE: {
10405         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10406         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10407         break;
10408       }
10409       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10410     }
10411   %}
10412   ins_pipe( pipe_slow );
10413 %}
10414 
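// MaskAll replicates a scalar boolean across every lane of a predicate; vector_maskall_operation
// produces the corresponding all-set or all-clear bits in the destination kReg for the given mask
// length (see its definition for details).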
10415 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10416   predicate(Matcher::vector_length(n) <= 32);
10417   match(Set dst (MaskAll src));
10418   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10419   ins_encode %{
10420     int mask_len = Matcher::vector_length(this);
10421     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10422   %}
10423   ins_pipe( pipe_slow );
10424 %}
10425 
10426 #ifdef _LP64
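// A mask NOT has no dedicated ideal node; it reaches the matcher as (XorVMask src (MaskAll -1)).
// For mask lengths below 8 the rule routes through knot() with a scratch kReg and GPR, presumably
// so that only the live mask bits are flipped; the second rule covers the widths (8 with AVX512DQ,
// 16, or wider with AVX512BW) where a single KNOT of the matching size is sufficient.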
10427 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10428   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10429   match(Set dst (XorVMask src (MaskAll cnt)));
10430   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10431   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10432   ins_encode %{
10433     uint masklen = Matcher::vector_length(this);
10434     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10435   %}
10436   ins_pipe( pipe_slow );
10437 %}
10438 
10439 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10440   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10441             (Matcher::vector_length(n) == 16) ||
10442             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10443   match(Set dst (XorVMask src (MaskAll cnt)));
10444   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10445   ins_encode %{
10446     uint masklen = Matcher::vector_length(this);
10447     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10448   %}
10449   ins_pipe( pipe_slow );
10450 %}
10451 
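// VectorLongToMask transfers a lane mask held in a 64-bit GPR into vector form: when the mask type
// is not a true predicate (isa_vectmask() == nullptr) the bits are expanded into a boolean vector
// via vector_long_to_maskvec, otherwise the value is simply kmov'ed into an opmask register.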
10452 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10453   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10454   match(Set dst (VectorLongToMask src));
10455   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10456   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10457   ins_encode %{
10458     int mask_len = Matcher::vector_length(this);
10459     int vec_enc  = vector_length_encoding(mask_len);
10460     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10461                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10462   %}
10463   ins_pipe( pipe_slow );
10464 %}
10465 
10466 
10467 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10468   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10469   match(Set dst (VectorLongToMask src));
10470   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10471   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10472   ins_encode %{
10473     int mask_len = Matcher::vector_length(this);
10474     assert(mask_len <= 32, "invalid mask length");
10475     int vec_enc  = vector_length_encoding(mask_len);
10476     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10477                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10478   %}
10479   ins_pipe( pipe_slow );
10480 %}
10481 
10482 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10483   predicate(n->bottom_type()->isa_vectmask());
10484   match(Set dst (VectorLongToMask src));
10485   format %{ "long_to_mask_evex $dst, $src\t!" %}
10486   ins_encode %{
10487     __ kmov($dst$$KRegister, $src$$Register);
10488   %}
10489   ins_pipe( pipe_slow );
10490 %}
10491 #endif
10492 
10493 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10494   match(Set dst (AndVMask src1 src2));
10495   match(Set dst (OrVMask src1 src2));
10496   match(Set dst (XorVMask src1 src2));
10497   effect(TEMP kscratch);
10498   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10499   ins_encode %{
10500     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10501     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10502     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10503     uint masklen = Matcher::vector_length(this);
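    // kandb/korb/kxorb require AVX512DQ; without it, sub-16-bit masks are widened so the
    // word-sized k-register forms can be used instead.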
10504     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10505     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10506   %}
10507   ins_pipe( pipe_slow );
10508 %}
10509 
10510 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10511   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10512   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10513   ins_encode %{
10514     int vlen_enc = vector_length_encoding(this);
10515     BasicType bt = Matcher::vector_element_basic_type(this);
10516     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10517                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10518   %}
10519   ins_pipe( pipe_slow );
10520 %}
10521 
10522 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10523   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10524   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10525   ins_encode %{
10526     int vlen_enc = vector_length_encoding(this);
10527     BasicType bt = Matcher::vector_element_basic_type(this);
10528     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10529                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10530   %}
10531   ins_pipe( pipe_slow );
10532 %}
10533 
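// The CastVV rules below are type-only casts: size(0) means no instruction is emitted, the match
// simply keeps the mask or vector flavour of the value visible to the register allocator.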
10534 instruct castMM(kReg dst)
10535 %{
10536   match(Set dst (CastVV dst));
10537 
10538   size(0);
10539   format %{ "# castVV of $dst" %}
10540   ins_encode(/* empty encoding */);
10541   ins_cost(0);
10542   ins_pipe(empty);
10543 %}
10544 
10545 instruct castVV(vec dst)
10546 %{
10547   match(Set dst (CastVV dst));
10548 
10549   size(0);
10550   format %{ "# castVV of $dst" %}
10551   ins_encode(/* empty encoding */);
10552   ins_cost(0);
10553   ins_pipe(empty);
10554 %}
10555 
10556 instruct castVVLeg(legVec dst)
10557 %{
10558   match(Set dst (CastVV dst));
10559 
10560   size(0);
10561   format %{ "# castVV of $dst" %}
10562   ins_encode(/* empty encoding */);
10563   ins_cost(0);
10564   ins_pipe(empty);
10565 %}
10566 
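// IsInfiniteF/IsInfiniteD are lowered with VFPCLASSSS/VFPCLASSSD. The 0x18 immediate sets the
// positive-infinity (bit 3) and negative-infinity (bit 4) class bits, so the resulting kReg holds 1
// exactly when the input is +/-Inf; kmovbl then materializes that bit as an integer 0 or 1.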
10567 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10568 %{
10569   match(Set dst (IsInfiniteF src));
10570   effect(TEMP ktmp, KILL cr);
10571   format %{ "float_class_check $dst, $src" %}
10572   ins_encode %{
10573     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10574     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10575   %}
10576   ins_pipe(pipe_slow);
10577 %}
10578 
10579 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10580 %{
10581   match(Set dst (IsInfiniteD src));
10582   effect(TEMP ktmp, KILL cr);
10583   format %{ "double_class_check $dst, $src" %}
10584   ins_encode %{
10585     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10586     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10587   %}
10588   ins_pipe(pipe_slow);
10589 %}
10590 
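// ----------------------------- Vector Saturating Add/Sub Operations ------------------------------
//
// SaturatingAddV/SaturatingSubV clamp to the element range instead of wrapping. The subword
// (byte/short) rules map onto the packed saturating instructions (the VPADDS*/VPADDUS* and
// corresponding subtract forms), while int/long have no direct hardware equivalent, so the *_dq_*
// helpers reconstruct the saturation from overflow checks, using opmask registers on EVEX targets
// and additional XMM temporaries on the AVX path.
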
10591 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10592 %{
10593   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10594             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10595   match(Set dst (SaturatingAddV src1 src2));
10596   match(Set dst (SaturatingSubV src1 src2));
10597   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10598   ins_encode %{
10599     int vlen_enc = vector_length_encoding(this);
10600     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10601     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10602                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10603   %}
10604   ins_pipe(pipe_slow);
10605 %}
10606 
10607 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10608 %{
10609   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10610             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10611   match(Set dst (SaturatingAddV src1 src2));
10612   match(Set dst (SaturatingSubV src1 src2));
10613   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10614   ins_encode %{
10615     int vlen_enc = vector_length_encoding(this);
10616     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10617     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10618                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10619   %}
10620   ins_pipe(pipe_slow);
10621 %}
10622 
10623 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10624 %{
10625   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10626             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10627             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10628   match(Set dst (SaturatingAddV src1 src2));
10629   match(Set dst (SaturatingSubV src1 src2));
10630   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10631   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10632   ins_encode %{
10633     int vlen_enc = vector_length_encoding(this);
10634     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10635     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10636                                         $src1$$XMMRegister, $src2$$XMMRegister,
10637                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10638                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10639   %}
10640   ins_pipe(pipe_slow);
10641 %}
10642 
10643 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10644 %{
10645   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10646             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10647             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10648   match(Set dst (SaturatingAddV src1 src2));
10649   match(Set dst (SaturatingSubV src1 src2));
10650   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10651   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10652   ins_encode %{
10653     int vlen_enc = vector_length_encoding(this);
10654     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10655     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10656                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10657                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10658   %}
10659   ins_pipe(pipe_slow);
10660 %}
10661 
10662 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10663 %{
10664   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10665             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10666             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10667   match(Set dst (SaturatingAddV src1 src2));
10668   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10669   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10670   ins_encode %{
10671     int vlen_enc = vector_length_encoding(this);
10672     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10673     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10674                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10675   %}
10676   ins_pipe(pipe_slow);
10677 %}
10678 
10679 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10680 %{
10681   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10682             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10683             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10684   match(Set dst (SaturatingAddV src1 src2));
10685   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10686   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10687   ins_encode %{
10688     int vlen_enc = vector_length_encoding(this);
10689     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10690     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10691                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10692   %}
10693   ins_pipe(pipe_slow);
10694 %}
10695 
10696 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10697 %{
10698   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10699             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10700             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10701   match(Set dst (SaturatingSubV src1 src2));
10702   effect(TEMP ktmp);
10703   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10704   ins_encode %{
10705     int vlen_enc = vector_length_encoding(this);
10706     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10707     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10708                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10709   %}
10710   ins_pipe(pipe_slow);
10711 %}
10712 
10713 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10714 %{
10715   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10716             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10717             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10718   match(Set dst (SaturatingSubV src1 src2));
10719   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10720   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10721   ins_encode %{
10722     int vlen_enc = vector_length_encoding(this);
10723     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10724     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10725                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10726   %}
10727   ins_pipe(pipe_slow);
10728 %}
10729 
10730 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10731 %{
10732   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10733             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10734   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10735   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10736   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10737   ins_encode %{
10738     int vlen_enc = vector_length_encoding(this);
10739     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10740     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10741                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10742   %}
10743   ins_pipe(pipe_slow);
10744 %}
10745 
10746 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10747 %{
10748   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10749             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10750   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10751   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10752   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10753   ins_encode %{
10754     int vlen_enc = vector_length_encoding(this);
10755     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10756     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10757                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10758   %}
10759   ins_pipe(pipe_slow);
10760 %}
10761 
10762 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10763   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10764             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10765   match(Set dst (SaturatingAddV (Binary dst src) mask));
10766   match(Set dst (SaturatingSubV (Binary dst src) mask));
10767   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10768   ins_encode %{
10769     int vlen_enc = vector_length_encoding(this);
10770     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10771     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10772                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10773   %}
10774   ins_pipe( pipe_slow );
10775 %}
10776 
10777 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10778   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10779             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10780   match(Set dst (SaturatingAddV (Binary dst src) mask));
10781   match(Set dst (SaturatingSubV (Binary dst src) mask));
10782   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10783   ins_encode %{
10784     int vlen_enc = vector_length_encoding(this);
10785     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10786     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10787                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10788   %}
10789   ins_pipe( pipe_slow );
10790 %}
10791 
10792 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10793   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10794             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10795   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10796   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10797   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10798   ins_encode %{
10799     int vlen_enc = vector_length_encoding(this);
10800     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10801     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10802                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10803   %}
10804   ins_pipe( pipe_slow );
10805 %}
10806 
10807 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10808   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10809             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10810   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10811   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10812   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10813   ins_encode %{
10814     int vlen_enc = vector_length_encoding(this);
10815     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10816     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10817                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10818   %}
10819   ins_pipe( pipe_slow );
10820 %}
10821 
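// SelectFromTwoVector picks each destination lane from one of the two source vectors according to
// the index vector; select_from_two_vectors_evex lowers this with the AVX-512 two-table permutes
// (the VPERMI2/VPERMT2 family), choosing the element-width-specific form.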
10822 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10823 %{
10824   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10825   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10826   ins_encode %{
10827     int vlen_enc = vector_length_encoding(this);
10828     BasicType bt = Matcher::vector_element_basic_type(this);
10829     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10830   %}
10831   ins_pipe(pipe_slow);
10832 %}