//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters

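// For example, the first entry below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the lowest 32-bit slot (word a) of xmm0 as save-on-call for both
// the register allocator and the C calling convention, spilled and restored
// as an Op_RegF, with encoding 0.  The XMM0b-XMM0p entries name the remaining
// fifteen 32-bit slots of the same 512-bit register via ->next(1)..->next(15).
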
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 (AVX-512) opmask registers.  Each 64-bit mask register k1-k7 is
// described as two 32-bit halves, Kn and Kn_H.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
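
// As used here, reg_class_dynamic selects between the two statically defined
// classes at runtime: the first (EVEX, including XMM16-XMM31) applies when the
// %{ ... %} predicate evaluates to true, otherwise the second (legacy) class
// is used.  The same pattern recurs for the double and vector classes below.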

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  930 // Class for all 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
  948 // Class for evex 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for evex 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ %} and source_hpp %{ %} freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
 1142     // exception handler starts out as jump and can be patched to
 1143     // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5 byte instructions plus one move for unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
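      // These predicate helpers are referenced from instruct predicates later in this
      // file; an illustrative (not exhaustive) use would be:
      //   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));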
 1189 
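      // Platform-dependent (PD) node flags. They extend the ideal Node flag set with
      // x86-specific bits: one tags branches affected by the Intel JCC erratum (see
      // pd_perform_mach_node_analysis in the source block below) and the others record
      // which EFLAGS condition bits a machnode sets or clears.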
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
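      // Mitigation for the Intel JCC erratum: affected branch machnodes are tagged up
      // front, worst-case padding is added to the code buffer size estimate, and the
      // exact per-node padding is computed during code emission (see compute_padding).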
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // push the value of the_pc on the stack without destroying any registers,
 1276   // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AddReductionVL:
 1372       if (UseSSE < 2) { // requires at least SSE2
 1373         return false;
 1374       }
 1375       break;
 1376     case Op_AbsVB:
 1377     case Op_AbsVS:
 1378     case Op_AbsVI:
 1379     case Op_AddReductionVI:
 1380     case Op_AndReductionV:
 1381     case Op_OrReductionV:
 1382     case Op_XorReductionV:
 1383       if (UseSSE < 3) { // requires at least SSSE3
 1384         return false;
 1385       }
 1386       break;
 1387     case Op_MaxHF:
 1388     case Op_MinHF:
 1389       if (!VM_Version::supports_avx512vlbw()) {
 1390         return false;
 1391       }  // fallthrough
 1392     case Op_AddHF:
 1393     case Op_DivHF:
 1394     case Op_FmaHF:
 1395     case Op_MulHF:
 1396     case Op_ReinterpretS2HF:
 1397     case Op_ReinterpretHF2S:
 1398     case Op_SubHF:
 1399     case Op_SqrtHF:
 1400       if (!VM_Version::supports_avx512_fp16()) {
 1401         return false;
 1402       }
 1403       break;
 1404     case Op_VectorLoadShuffle:
 1405     case Op_VectorRearrange:
 1406     case Op_MulReductionVI:
 1407       if (UseSSE < 4) { // requires at least SSE4
 1408         return false;
 1409       }
 1410       break;
 1411     case Op_IsInfiniteF:
 1412     case Op_IsInfiniteD:
 1413       if (!VM_Version::supports_avx512dq()) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_SqrtVD:
 1418     case Op_SqrtVF:
 1419     case Op_VectorMaskCmp:
 1420     case Op_VectorCastB2X:
 1421     case Op_VectorCastS2X:
 1422     case Op_VectorCastI2X:
 1423     case Op_VectorCastL2X:
 1424     case Op_VectorCastF2X:
 1425     case Op_VectorCastD2X:
 1426     case Op_VectorUCastB2X:
 1427     case Op_VectorUCastS2X:
 1428     case Op_VectorUCastI2X:
 1429     case Op_VectorMaskCast:
 1430       if (UseAVX < 1) { // enabled for AVX only
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_PopulateIndex:
 1435       if (UseAVX < 2) {
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVF:
 1440       if (UseAVX < 2) { // enabled for AVX2 only
 1441         return false;
 1442       }
 1443       break;
 1444     case Op_RoundVD:
 1445       if (UseAVX < 3) {
 1446         return false;  // enabled for AVX3 only
 1447       }
 1448       break;
 1449     case Op_CompareAndSwapL:
 1450     case Op_CompareAndSwapP:
 1451       break;
 1452     case Op_StrIndexOf:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_StrIndexOfChar:
 1458       if (!UseSSE42Intrinsics) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_OnSpinWait:
 1463       if (VM_Version::supports_on_spin_wait() == false) {
 1464         return false;
 1465       }
 1466       break;
 1467     case Op_MulVB:
 1468     case Op_LShiftVB:
 1469     case Op_RShiftVB:
 1470     case Op_URShiftVB:
 1471     case Op_VectorInsert:
 1472     case Op_VectorLoadMask:
 1473     case Op_VectorStoreMask:
 1474     case Op_VectorBlend:
 1475       if (UseSSE < 4) {
 1476         return false;
 1477       }
 1478       break;
 1479     case Op_MaxD:
 1480     case Op_MaxF:
 1481     case Op_MinD:
 1482     case Op_MinF:
 1483       if (UseAVX < 1) { // enabled for AVX only
 1484         return false;
 1485       }
 1486       break;
 1487     case Op_CacheWB:
 1488     case Op_CacheWBPreSync:
 1489     case Op_CacheWBPostSync:
 1490       if (!VM_Version::supports_data_cache_line_flush()) {
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_ExtractB:
 1495     case Op_ExtractL:
 1496     case Op_ExtractI:
 1497     case Op_RoundDoubleMode:
 1498       if (UseSSE < 4) {
 1499         return false;
 1500       }
 1501       break;
 1502     case Op_RoundDoubleModeV:
 1503       if (VM_Version::supports_avx() == false) {
 1504         return false; // 128bit vroundpd is not available
 1505       }
 1506       break;
 1507     case Op_LoadVectorGather:
 1508     case Op_LoadVectorGatherMasked:
 1509       if (UseAVX < 2) {
 1510         return false;
 1511       }
 1512       break;
 1513     case Op_FmaF:
 1514     case Op_FmaD:
 1515     case Op_FmaVD:
 1516     case Op_FmaVF:
 1517       if (!UseFMA) {
 1518         return false;
 1519       }
 1520       break;
 1521     case Op_MacroLogicV:
 1522       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1523         return false;
 1524       }
 1525       break;
 1526 
 1527     case Op_VectorCmpMasked:
 1528     case Op_VectorMaskGen:
 1529       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1530         return false;
 1531       }
 1532       break;
 1533     case Op_VectorMaskFirstTrue:
 1534     case Op_VectorMaskLastTrue:
 1535     case Op_VectorMaskTrueCount:
 1536     case Op_VectorMaskToLong:
 1537       if (UseAVX < 1) {
 1538          return false;
 1539       }
 1540       break;
 1541     case Op_RoundF:
 1542     case Op_RoundD:
 1543       break;
 1544     case Op_CopySignD:
 1545     case Op_CopySignF:
 1546       if (UseAVX < 3)  {
 1547         return false;
 1548       }
 1549       if (!VM_Version::supports_avx512vl()) {
 1550         return false;
 1551       }
 1552       break;
 1553     case Op_CompressBits:
 1554     case Op_ExpandBits:
 1555       if (!VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_SignumF:
 1560       if (UseSSE < 1) {
 1561         return false;
 1562       }
 1563       break;
 1564     case Op_SignumD:
 1565       if (UseSSE < 2) {
 1566         return false;
 1567       }
 1568       break;
 1569     case Op_CompressM:
 1570       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1571         return false;
 1572       }
 1573       break;
 1574     case Op_SqrtF:
 1575       if (UseSSE < 1) {
 1576         return false;
 1577       }
 1578       break;
 1579     case Op_SqrtD:
 1580       if (UseSSE < 2) {
 1581         return false;
 1582       }
 1583       break;
 1584     case Op_ConvF2HF:
 1585     case Op_ConvHF2F:
 1586       if (!VM_Version::supports_float16()) {
 1587         return false;
 1588       }
 1589       break;
 1590     case Op_VectorCastF2HF:
 1591     case Op_VectorCastHF2F:
 1592       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1593         return false;
 1594       }
 1595       break;
 1596   }
 1597   return true;  // Match rules are supported by default.
 1598 }
 1599 
 1600 //------------------------------------------------------------------------
 1601 
 1602 static inline bool is_pop_count_instr_target(BasicType bt) {
 1603   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1604          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1605 }
 1606 
 1607 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1608   return match_rule_supported_vector(opcode, vlen, bt);
 1609 }
 1610 
 1611 // Identify extra cases in which we might want to provide match rules for vector nodes and
 1612 // other intrinsics guarded by vector length (vlen) and element type (bt).
 1613 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1614   if (!match_rule_supported(opcode)) {
 1615     return false;
 1616   }
 1617   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1618   //   * SSE2 supports 128bit vectors for all types;
 1619   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1620   //   * AVX2 supports 256bit vectors for all types;
 1621   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1622   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1623   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1624   // And MaxVectorSize is taken into account as well.
 1625   if (!vector_size_supported(bt, vlen)) {
 1626     return false;
 1627   }
 1628   // Special cases which require vector length follow:
 1629   //   * implementation limitations
 1630   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1631   //   * 128bit vroundpd instruction is present only in AVX1
 1632   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
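        // e.g. vlen == 16 with bt == T_INT gives size_in_bits == 16 * 4 * 8 == 512.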
 1633   switch (opcode) {
 1634     case Op_AbsVF:
 1635     case Op_NegVF:
 1636       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1637         return false; // 512bit vandps and vxorps are not available
 1638       }
 1639       break;
 1640     case Op_AbsVD:
 1641     case Op_NegVD:
 1642       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1643         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1644       }
 1645       break;
 1646     case Op_RotateRightV:
 1647     case Op_RotateLeftV:
 1648       if (bt != T_INT && bt != T_LONG) {
 1649         return false;
 1650       } // fallthrough
 1651     case Op_MacroLogicV:
 1652       if (!VM_Version::supports_evex() ||
 1653           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1654         return false;
 1655       }
 1656       break;
 1657     case Op_ClearArray:
 1658     case Op_VectorMaskGen:
 1659     case Op_VectorCmpMasked:
 1660       if (!VM_Version::supports_avx512bw()) {
 1661         return false;
 1662       }
 1663       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1664         return false;
 1665       }
 1666       break;
 1667     case Op_LoadVectorMasked:
 1668     case Op_StoreVectorMasked:
 1669       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1670         return false;
 1671       }
 1672       break;
 1673     case Op_UMinV:
 1674     case Op_UMaxV:
 1675       if (UseAVX == 0) {
 1676         return false;
 1677       }
 1678       break;
 1679     case Op_MaxV:
 1680     case Op_MinV:
 1681       if (UseSSE < 4 && is_integral_type(bt)) {
 1682         return false;
 1683       }
 1684       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1685           // Float/Double intrinsics are enabled for AVX family currently.
 1686           if (UseAVX == 0) {
 1687             return false;
 1688           }
 1689           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1690             return false;
 1691           }
 1692       }
 1693       break;
 1694     case Op_CallLeafVector:
 1695       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1696         return false;
 1697       }
 1698       break;
 1699     case Op_AddReductionVI:
 1700       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1701         return false;
 1702       }
 1703       // fallthrough
 1704     case Op_AndReductionV:
 1705     case Op_OrReductionV:
 1706     case Op_XorReductionV:
 1707       if (is_subword_type(bt) && (UseSSE < 4)) {
 1708         return false;
 1709       }
 1710       break;
 1711     case Op_MinReductionV:
 1712     case Op_MaxReductionV:
 1713       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1714         return false;
 1715       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1716         return false;
 1717       }
 1718       // Float/Double intrinsics enabled for AVX family.
 1719       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1720         return false;
 1721       }
 1722       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1723         return false;
 1724       }
 1725       break;
 1726     case Op_VectorTest:
 1727       if (UseSSE < 4) {
 1728         return false; // Implementation limitation
 1729       } else if (size_in_bits < 32) {
 1730         return false; // Implementation limitation
 1731       }
 1732       break;
 1733     case Op_VectorLoadShuffle:
 1734     case Op_VectorRearrange:
 1735       if (vlen == 2) {
 1736         return false; // Implementation limitation due to how shuffle is loaded
 1737       } else if (size_in_bits == 256 && UseAVX < 2) {
 1738         return false; // Implementation limitation
 1739       }
 1740       break;
 1741     case Op_VectorLoadMask:
 1742     case Op_VectorMaskCast:
 1743       if (size_in_bits == 256 && UseAVX < 2) {
 1744         return false; // Implementation limitation
 1745       }
 1746       // fallthrough
 1747     case Op_VectorStoreMask:
 1748       if (vlen == 2) {
 1749         return false; // Implementation limitation
 1750       }
 1751       break;
 1752     case Op_PopulateIndex:
 1753       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastB2X:
 1758     case Op_VectorCastS2X:
 1759     case Op_VectorCastI2X:
 1760       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastL2X:
 1765       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1766         return false;
 1767       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1768         return false;
 1769       }
 1770       break;
 1771     case Op_VectorCastF2X: {
 1772         // As per JLS section 5.1.3 narrowing conversion to sub-word types
 1773         // happen after intermediate conversion to integer and special handling
 1774         // code needs AVX2 vpcmpeqd instruction for 256 bit vectors.
 1775         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1776         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1777           return false;
 1778         }
 1779       }
 1780       // fallthrough
 1781     case Op_VectorCastD2X:
 1782       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1783         return false;
 1784       }
 1785       break;
 1786     case Op_VectorCastF2HF:
 1787     case Op_VectorCastHF2F:
 1788       if (!VM_Version::supports_f16c() &&
 1789          ((!VM_Version::supports_evex() ||
 1790          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1791         return false;
 1792       }
 1793       break;
 1794     case Op_RoundVD:
 1795       if (!VM_Version::supports_avx512dq()) {
 1796         return false;
 1797       }
 1798       break;
 1799     case Op_MulReductionVI:
 1800       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1801         return false;
 1802       }
 1803       break;
 1804     case Op_LoadVectorGatherMasked:
 1805       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1806         return false;
 1807       }
 1808       if (is_subword_type(bt) &&
 1809          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1810           (size_in_bits < 64)                                      ||
 1811           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1812         return false;
 1813       }
 1814       break;
 1815     case Op_StoreVectorScatterMasked:
 1816     case Op_StoreVectorScatter:
 1817       if (is_subword_type(bt)) {
 1818         return false;
 1819       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1820         return false;
 1821       }
 1822       // fallthrough
 1823     case Op_LoadVectorGather:
 1824       if (!is_subword_type(bt) && size_in_bits == 64) {
 1825         return false;
 1826       }
 1827       if (is_subword_type(bt) && size_in_bits < 64) {
 1828         return false;
 1829       }
 1830       break;
 1831     case Op_SaturatingAddV:
 1832     case Op_SaturatingSubV:
 1833       if (UseAVX < 1) {
 1834         return false; // Implementation limitation
 1835       }
 1836       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1837         return false;
 1838       }
 1839       break;
 1840     case Op_SelectFromTwoVector:
 1841        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1842          return false;
 1843        }
 1844        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1845          return false;
 1846        }
 1847        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1848          return false;
 1849        }
 1850        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1851          return false;
 1852        }
 1853        break;
 1854     case Op_MaskAll:
 1855       if (!VM_Version::supports_evex()) {
 1856         return false;
 1857       }
 1858       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1859         return false;
 1860       }
 1861       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1862         return false;
 1863       }
 1864       break;
 1865     case Op_VectorMaskCmp:
 1866       if (vlen < 2 || size_in_bits < 32) {
 1867         return false;
 1868       }
 1869       break;
 1870     case Op_CompressM:
 1871       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1872         return false;
 1873       }
 1874       break;
 1875     case Op_CompressV:
 1876     case Op_ExpandV:
 1877       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1878         return false;
 1879       }
 1880       if (size_in_bits < 128) {
 1881         return false;
 1882       }
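            // fallthrough - the UseAVX/BMI2 checks for VectorLongToMask below apply here as well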
 1883     case Op_VectorLongToMask:
 1884       if (UseAVX < 1) {
 1885         return false;
 1886       }
 1887       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1888         return false;
 1889       }
 1890       break;
 1891     case Op_SignumVD:
 1892     case Op_SignumVF:
 1893       if (UseAVX < 1) {
 1894         return false;
 1895       }
 1896       break;
 1897     case Op_PopCountVI:
 1898     case Op_PopCountVL: {
 1899         if (!is_pop_count_instr_target(bt) &&
 1900             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1901           return false;
 1902         }
 1903       }
 1904       break;
 1905     case Op_ReverseV:
 1906     case Op_ReverseBytesV:
 1907       if (UseAVX < 2) {
 1908         return false;
 1909       }
 1910       break;
 1911     case Op_CountTrailingZerosV:
 1912     case Op_CountLeadingZerosV:
 1913       if (UseAVX < 2) {
 1914         return false;
 1915       }
 1916       break;
 1917   }
 1918   return true;  // Match rules are supported by default.
 1919 }
 1920 
 1921 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 1922   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
 1923   // based on the IR opcode. Most of the unary/binary/ternary masked operations share the
 1924   // IR nodes of their non-masked counterparts, with the mask edge being the differentiator.
 1925   // This routine therefore does a strict check on the existence of masked operation
 1926   // patterns by returning false for all opcodes apart from the ones whose masked
 1927   // instruction patterns are defined in this file.
 1928   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1929     return false;
 1930   }
 1931 
 1932   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
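        // Masked operations on sub-512-bit (XMM/YMM) vectors additionally require AVX512VL.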
 1933   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1934     return false;
 1935   }
 1936   switch(opcode) {
 1937     // Unary masked operations
 1938     case Op_AbsVB:
 1939     case Op_AbsVS:
 1940       if (!VM_Version::supports_avx512bw()) {
 1941         return false;  // Implementation limitation
 1942       }
 1943     case Op_AbsVI:
 1944     case Op_AbsVL:
 1945       return true;
 1946 
 1947     // Ternary masked operations
 1948     case Op_FmaVF:
 1949     case Op_FmaVD:
 1950       return true;
 1951 
 1952     case Op_MacroLogicV:
 1953       if (bt != T_INT && bt != T_LONG) {
 1954         return false;
 1955       }
 1956       return true;
 1957 
 1958     // Binary masked operations
 1959     case Op_AddVB:
 1960     case Op_AddVS:
 1961     case Op_SubVB:
 1962     case Op_SubVS:
 1963     case Op_MulVS:
 1964     case Op_LShiftVS:
 1965     case Op_RShiftVS:
 1966     case Op_URShiftVS:
 1967       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1968       if (!VM_Version::supports_avx512bw()) {
 1969         return false;  // Implementation limitation
 1970       }
 1971       return true;
 1972 
 1973     case Op_MulVL:
 1974       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1975       if (!VM_Version::supports_avx512dq()) {
 1976         return false;  // Implementation limitation
 1977       }
 1978       return true;
 1979 
 1980     case Op_AndV:
 1981     case Op_OrV:
 1982     case Op_XorV:
 1983     case Op_RotateRightV:
 1984     case Op_RotateLeftV:
 1985       if (bt != T_INT && bt != T_LONG) {
 1986         return false; // Implementation limitation
 1987       }
 1988       return true;
 1989 
 1990     case Op_VectorLoadMask:
 1991       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1992       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1993         return false;
 1994       }
 1995       return true;
 1996 
 1997     case Op_AddVI:
 1998     case Op_AddVL:
 1999     case Op_AddVF:
 2000     case Op_AddVD:
 2001     case Op_SubVI:
 2002     case Op_SubVL:
 2003     case Op_SubVF:
 2004     case Op_SubVD:
 2005     case Op_MulVI:
 2006     case Op_MulVF:
 2007     case Op_MulVD:
 2008     case Op_DivVF:
 2009     case Op_DivVD:
 2010     case Op_SqrtVF:
 2011     case Op_SqrtVD:
 2012     case Op_LShiftVI:
 2013     case Op_LShiftVL:
 2014     case Op_RShiftVI:
 2015     case Op_RShiftVL:
 2016     case Op_URShiftVI:
 2017     case Op_URShiftVL:
 2018     case Op_LoadVectorMasked:
 2019     case Op_StoreVectorMasked:
 2020     case Op_LoadVectorGatherMasked:
 2021     case Op_StoreVectorScatterMasked:
 2022       return true;
 2023 
 2024     case Op_UMinV:
 2025     case Op_UMaxV:
 2026       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2027         return false;
 2028       } // fallthrough
 2029     case Op_MaxV:
 2030     case Op_MinV:
 2031       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2032         return false; // Implementation limitation
 2033       }
 2034       if (is_floating_point_type(bt)) {
 2035         return false; // Implementation limitation
 2036       }
 2037       return true;
 2038     case Op_SaturatingAddV:
 2039     case Op_SaturatingSubV:
 2040       if (!is_subword_type(bt)) {
 2041         return false;
 2042       }
 2043       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2044         return false; // Implementation limitation
 2045       }
 2046       return true;
 2047 
 2048     case Op_VectorMaskCmp:
 2049       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2050         return false; // Implementation limitation
 2051       }
 2052       return true;
 2053 
 2054     case Op_VectorRearrange:
 2055       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2056         return false; // Implementation limitation
 2057       }
 2058       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2059         return false; // Implementation limitation
 2060       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2061         return false; // Implementation limitation
 2062       }
 2063       return true;
 2064 
 2065     // Binary Logical operations
 2066     case Op_AndVMask:
 2067     case Op_OrVMask:
 2068     case Op_XorVMask:
 2069       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2070         return false; // Implementation limitation
 2071       }
 2072       return true;
 2073 
 2074     case Op_PopCountVI:
 2075     case Op_PopCountVL:
 2076       if (!is_pop_count_instr_target(bt)) {
 2077         return false;
 2078       }
 2079       return true;
 2080 
 2081     case Op_MaskAll:
 2082       return true;
 2083 
 2084     case Op_CountLeadingZerosV:
 2085       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2086         return true;
 2087       }
 2088     default:
 2089       return false;
 2090   }
 2091 }
 2092 
 2093 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2094   return false;
 2095 }
 2096 
 2097 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2098 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2099   switch (elem_bt) {
 2100     case T_BYTE:  return false;
 2101     case T_SHORT: return !VM_Version::supports_avx512bw();
 2102     case T_INT:   return !VM_Version::supports_avx();
 2103     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2104     default:
 2105       ShouldNotReachHere();
 2106       return false;
 2107   }
 2108 }
 2109 
 2110 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2111   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2112   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2113   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2114       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2115     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2116     return new legVecZOper();
 2117   }
 2118   if (legacy) {
 2119     switch (ideal_reg) {
 2120       case Op_VecS: return new legVecSOper();
 2121       case Op_VecD: return new legVecDOper();
 2122       case Op_VecX: return new legVecXOper();
 2123       case Op_VecY: return new legVecYOper();
 2124       case Op_VecZ: return new legVecZOper();
 2125     }
 2126   } else {
 2127     switch (ideal_reg) {
 2128       case Op_VecS: return new vecSOper();
 2129       case Op_VecD: return new vecDOper();
 2130       case Op_VecX: return new vecXOper();
 2131       case Op_VecY: return new vecYOper();
 2132       case Op_VecZ: return new vecZOper();
 2133     }
 2134   }
 2135   ShouldNotReachHere();
 2136   return nullptr;
 2137 }
 2138 
 2139 bool Matcher::is_reg2reg_move(MachNode* m) {
 2140   switch (m->rule()) {
 2141     case MoveVec2Leg_rule:
 2142     case MoveLeg2Vec_rule:
 2143     case MoveF2VL_rule:
 2144     case MoveF2LEG_rule:
 2145     case MoveVL2F_rule:
 2146     case MoveLEG2F_rule:
 2147     case MoveD2VL_rule:
 2148     case MoveD2LEG_rule:
 2149     case MoveVL2D_rule:
 2150     case MoveLEG2D_rule:
 2151       return true;
 2152     default:
 2153       return false;
 2154   }
 2155 }
 2156 
 2157 bool Matcher::is_generic_vector(MachOper* opnd) {
 2158   switch (opnd->opcode()) {
 2159     case VEC:
 2160     case LEGVEC:
 2161       return true;
 2162     default:
 2163       return false;
 2164   }
 2165 }
 2166 
 2167 //------------------------------------------------------------------------
 2168 
 2169 const RegMask* Matcher::predicate_reg_mask(void) {
 2170   return &_VECTMASK_REG_mask;
 2171 }
 2172 
 2173 // Max vector size in bytes. 0 if not supported.
 2174 int Matcher::vector_width_in_bytes(BasicType bt) {
 2175   assert(is_java_primitive(bt), "only primitive type vectors");
 2176   if (UseSSE < 2) return 0;
 2177   // SSE2 supports 128bit vectors for all types.
 2178   // AVX2 supports 256bit vectors for all types.
 2179   // EVEX (AVX512) supports 512bit vectors for all types.
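        // e.g. UseAVX == 2 gives (1 << 2) * 8 == 32 bytes, UseAVX == 3 gives 64 bytes.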
 2180   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2181   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2182   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2183     size = (UseAVX > 2) ? 64 : 32;
 2184   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2185     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2186   // Use flag to limit vector size.
 2187   size = MIN2(size,(int)MaxVectorSize);
 2188   // Minimum 2 values in vector (or 4 for bytes).
 2189   switch (bt) {
 2190   case T_DOUBLE:
 2191   case T_LONG:
 2192     if (size < 16) return 0;
 2193     break;
 2194   case T_FLOAT:
 2195   case T_INT:
 2196     if (size < 8) return 0;
 2197     break;
 2198   case T_BOOLEAN:
 2199     if (size < 4) return 0;
 2200     break;
 2201   case T_CHAR:
 2202     if (size < 4) return 0;
 2203     break;
 2204   case T_BYTE:
 2205     if (size < 4) return 0;
 2206     break;
 2207   case T_SHORT:
 2208     if (size < 4) return 0;
 2209     break;
 2210   default:
 2211     ShouldNotReachHere();
 2212   }
 2213   return size;
 2214 }
 2215 
 2216 // Limits on vector size (number of elements) loaded into vector.
 2217 int Matcher::max_vector_size(const BasicType bt) {
 2218   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2219 }
 2220 int Matcher::min_vector_size(const BasicType bt) {
 2221   int max_size = max_vector_size(bt);
 2222   // Min size which can be loaded into vector is 4 bytes.
 2223   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2224   // Support for calling svml double64 vectors
 2225   if (bt == T_DOUBLE) {
 2226     size = 1;
 2227   }
 2228   return MIN2(size,max_size);
 2229 }
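      // For illustration of min_vector_size() above: byte-sized types need at least
      // 4 elements (4 bytes) and other types at least 2 elements, except T_DOUBLE
      // which is allowed down to a single element so SVML double64 stubs remain
      // callable; the result is further capped by max_vector_size().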
 2230 
 2231 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2232   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2233   // by default on Cascade Lake
 2234   if (VM_Version::is_default_intel_cascade_lake()) {
 2235     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2236   }
 2237   return Matcher::max_vector_size(bt);
 2238 }
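      // E.g. in max_vector_size_auto_vectorization() above, a default-configured
      // Cascade Lake with MaxVectorSize == 64 caps auto vectorization of T_INT at
      // 32 / 4 = 8 elements, while other callers of max_vector_size() still see the
      // full 16-element (512-bit) size.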
 2239 
 2240 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2241   return -1;
 2242 }
 2243 
 2244 // Vector ideal reg corresponding to specified size in bytes
 2245 uint Matcher::vector_ideal_reg(int size) {
 2246   assert(MaxVectorSize >= size, "");
 2247   switch(size) {
 2248     case  4: return Op_VecS;
 2249     case  8: return Op_VecD;
 2250     case 16: return Op_VecX;
 2251     case 32: return Op_VecY;
 2252     case 64: return Op_VecZ;
 2253   }
 2254   ShouldNotReachHere();
 2255   return 0;
 2256 }
 2257 
 2258 // Check for shift by small constant as well
 2259 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2260   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2261       shift->in(2)->get_int() <= 3 &&
 2262       // Are there other uses besides address expressions?
 2263       !matcher->is_visited(shift)) {
 2264     address_visited.set(shift->_idx); // Flag as address_visited
 2265     mstack.push(shift->in(2), Matcher::Visit);
 2266     Node *conv = shift->in(1);
 2267     // Allow the Matcher to match the rule that bypasses the
 2268     // ConvI2L operation for an array index on LP64
 2269     // if the index value is non-negative.
 2270     if (conv->Opcode() == Op_ConvI2L &&
 2271         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2272         // Are there other uses besides address expressions?
 2273         !matcher->is_visited(conv)) {
 2274       address_visited.set(conv->_idx); // Flag as address_visited
 2275       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2276     } else {
 2277       mstack.push(conv, Matcher::Pre_Visit);
 2278     }
 2279     return true;
 2280   }
 2281   return false;
 2282 }
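      // Illustrative example for clone_shift() (assuming an LP64 access into a
      // long[] array): the index subtree (LShiftX (ConvI2L i) 3) is cloned here so
      // it can be subsumed into a single addressing mode such as
      // [base + idx*8 + disp] instead of being materialized into a separate register.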
 2283 
 2284 // This function identifies sub-graphs in which a 'load' node is
 2285 // input to two different nodes, such that the pair can be matched
 2286 // with BMI instructions like blsi, blsr, etc.
 2287 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2288 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2289 // refers to the same node.
 2290 //
 2291 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2292 // This is a temporary solution until we make DAGs expressible in ADL.
 2293 template<typename ConType>
 2294 class FusedPatternMatcher {
 2295   Node* _op1_node;
 2296   Node* _mop_node;
 2297   int _con_op;
 2298 
 2299   static int match_next(Node* n, int next_op, int next_op_idx) {
 2300     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2301       return -1;
 2302     }
 2303 
 2304     if (next_op_idx == -1) { // n is commutative, try rotations
 2305       if (n->in(1)->Opcode() == next_op) {
 2306         return 1;
 2307       } else if (n->in(2)->Opcode() == next_op) {
 2308         return 2;
 2309       }
 2310     } else {
 2311       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2312       if (n->in(next_op_idx)->Opcode() == next_op) {
 2313         return next_op_idx;
 2314       }
 2315     }
 2316     return -1;
 2317   }
 2318 
 2319  public:
 2320   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2321     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2322 
 2323   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2324              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2325              typename ConType::NativeType con_value) {
 2326     if (_op1_node->Opcode() != op1) {
 2327       return false;
 2328     }
 2329     if (_mop_node->outcnt() > 2) {
 2330       return false;
 2331     }
 2332     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2333     if (op1_op2_idx == -1) {
 2334       return false;
 2335     }
 2336     // Memory operation must be the other edge
 2337     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2338 
 2339     // Check that the mop node is really what we want
 2340     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2341       Node* op2_node = _op1_node->in(op1_op2_idx);
 2342       if (op2_node->outcnt() > 1) {
 2343         return false;
 2344       }
 2345       assert(op2_node->Opcode() == op2, "Should be");
 2346       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2347       if (op2_con_idx == -1) {
 2348         return false;
 2349       }
 2350       // Memory operation must be the other edge
 2351       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2352       // Check that the memory operation is the same node
 2353       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2354         // Now check the constant
 2355         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2356         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2357           return true;
 2358         }
 2359       }
 2360     }
 2361     return false;
 2362   }
 2363 };
 2364 
 2365 static bool is_bmi_pattern(Node* n, Node* m) {
 2366   assert(UseBMI1Instructions, "sanity");
 2367   if (n != nullptr && m != nullptr) {
 2368     if (m->Opcode() == Op_LoadI) {
 2369       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2370       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2371              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2372              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2373     } else if (m->Opcode() == Op_LoadL) {
 2374       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2375       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2376              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2377              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2378     }
 2379   }
 2380   return false;
 2381 }
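      // For reference, the shapes matched by is_bmi_pattern() above correspond to the
      // BMI1 idioms (illustrative mapping, 32-bit forms shown; the long forms are analogous):
      //   x & -x       -> blsi   (isolate lowest set bit),    (AndI (SubI 0 load) load)
      //   x & (x - 1)  -> blsr   (reset lowest set bit),      (AndI (AddI load -1) load)
      //   x ^ (x - 1)  -> blsmsk (mask up to lowest set bit), (XorI (AddI load -1) load)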
 2382 
 2383 // Should the matcher clone input 'm' of node 'n'?
 2384 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2385   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2386   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2387     mstack.push(m, Visit);
 2388     return true;
 2389   }
 2390   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2391     mstack.push(m, Visit);           // m = ShiftCntV
 2392     return true;
 2393   }
 2394   if (is_encode_and_store_pattern(n, m)) {
 2395     mstack.push(m, Visit);
 2396     return true;
 2397   }
 2398   return false;
 2399 }
 2400 
 2401 // Should the Matcher clone shifts on addressing modes, expecting them
 2402 // to be subsumed into complex addressing expressions or compute them
 2403 // into registers?
 2404 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2405   Node *off = m->in(AddPNode::Offset);
 2406   if (off->is_Con()) {
 2407     address_visited.test_set(m->_idx); // Flag as address_visited
 2408     Node *adr = m->in(AddPNode::Address);
 2409 
 2410     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2411     // AtomicAdd is not an addressing expression.
 2412     // Cheap to find it by looking for screwy base.
 2413     if (adr->is_AddP() &&
 2414         !adr->in(AddPNode::Base)->is_top() &&
 2415         !adr->in(AddPNode::Offset)->is_Con() &&
 2416         off->get_long() == (int) (off->get_long()) && // immL32
 2417         // Are there other uses besides address expressions?
 2418         !is_visited(adr)) {
 2419       address_visited.set(adr->_idx); // Flag as address_visited
 2420       Node *shift = adr->in(AddPNode::Offset);
 2421       if (!clone_shift(shift, this, mstack, address_visited)) {
 2422         mstack.push(shift, Pre_Visit);
 2423       }
 2424       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2425       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2426     } else {
 2427       mstack.push(adr, Pre_Visit);
 2428     }
 2429 
 2430     // Clone X+offset as it also folds into most addressing expressions
 2431     mstack.push(off, Visit);
 2432     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2433     return true;
 2434   } else if (clone_shift(off, this, mstack, address_visited)) {
 2435     address_visited.test_set(m->_idx); // Flag as address_visited
 2436     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2437     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2438     return true;
 2439   }
 2440   return false;
 2441 }
 2442 
 2443 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2444   switch (bt) {
 2445     case BoolTest::eq:
 2446       return Assembler::eq;
 2447     case BoolTest::ne:
 2448       return Assembler::neq;
 2449     case BoolTest::le:
 2450     case BoolTest::ule:
 2451       return Assembler::le;
 2452     case BoolTest::ge:
 2453     case BoolTest::uge:
 2454       return Assembler::nlt;
 2455     case BoolTest::lt:
 2456     case BoolTest::ult:
 2457       return Assembler::lt;
 2458     case BoolTest::gt:
 2459     case BoolTest::ugt:
 2460       return Assembler::nle;
 2461     default : ShouldNotReachHere(); return Assembler::_false;
 2462   }
 2463 }
 2464 
 2465 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2466   switch (bt) {
 2467   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2468   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2469   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2470   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2471   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2472   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2473   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2474   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2475   }
 2476 }
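      // For example, NEQ_UQ above makes a lane holding NaN compare "not equal" to any
      // value (including another NaN), matching Java's NaN != NaN == true, while the
      // ordered predicates used for <, <=, > and >= yield false whenever either
      // operand is NaN, as the JLS requires for numerical comparisons.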
 2477 
 2478 // Helper methods for MachSpillCopyNode::implementation().
 2479 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2480                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2481   assert(ireg == Op_VecS || // 32bit vector
 2482          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2483           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2484          "no non-adjacent vector moves" );
 2485   if (masm) {
 2486     switch (ireg) {
 2487     case Op_VecS: // copy whole register
 2488     case Op_VecD:
 2489     case Op_VecX:
 2490       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2491         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2492       } else {
 2493         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2494       }
 2495       break;
 2496     case Op_VecY:
 2497       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2498         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2499       } else {
 2500         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2501       }
 2502       break;
 2503     case Op_VecZ:
 2504       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2505       break;
 2506     default:
 2507       ShouldNotReachHere();
 2508     }
 2509 #ifndef PRODUCT
 2510   } else {
 2511     switch (ireg) {
 2512     case Op_VecS:
 2513     case Op_VecD:
 2514     case Op_VecX:
 2515       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2516       break;
 2517     case Op_VecY:
 2518     case Op_VecZ:
 2519       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2520       break;
 2521     default:
 2522       ShouldNotReachHere();
 2523     }
 2524 #endif
 2525   }
 2526 }
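      // Note for the spill helpers above and below: when UseAVX > 2 but the CPU
      // lacks AVX512VL, VEX-encoded 128/256-bit moves cannot address the extended
      // registers xmm16-xmm31, which is presumably why the EVEX-encoded
      // vextract/vinsert forms with index 0 are used as a fallback.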
 2527 
 2528 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2529                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2530   if (masm) {
 2531     if (is_load) {
 2532       switch (ireg) {
 2533       case Op_VecS:
 2534         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2535         break;
 2536       case Op_VecD:
 2537         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2538         break;
 2539       case Op_VecX:
 2540         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2541           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2542         } else {
 2543           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2544           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2545         }
 2546         break;
 2547       case Op_VecY:
 2548         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2549           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2550         } else {
 2551           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2552           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2553         }
 2554         break;
 2555       case Op_VecZ:
 2556         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2557         break;
 2558       default:
 2559         ShouldNotReachHere();
 2560       }
 2561     } else { // store
 2562       switch (ireg) {
 2563       case Op_VecS:
 2564         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2565         break;
 2566       case Op_VecD:
 2567         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2568         break;
 2569       case Op_VecX:
 2570         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2571           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2572         }
 2573         else {
 2574           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2575         }
 2576         break;
 2577       case Op_VecY:
 2578         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2579           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2580         }
 2581         else {
 2582           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2583         }
 2584         break;
 2585       case Op_VecZ:
 2586         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2587         break;
 2588       default:
 2589         ShouldNotReachHere();
 2590       }
 2591     }
 2592 #ifndef PRODUCT
 2593   } else {
 2594     if (is_load) {
 2595       switch (ireg) {
 2596       case Op_VecS:
 2597         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2598         break;
 2599       case Op_VecD:
 2600         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2601         break;
 2602        case Op_VecX:
 2603         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2604         break;
 2605       case Op_VecY:
 2606       case Op_VecZ:
 2607         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2608         break;
 2609       default:
 2610         ShouldNotReachHere();
 2611       }
 2612     } else { // store
 2613       switch (ireg) {
 2614       case Op_VecS:
 2615         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2616         break;
 2617       case Op_VecD:
 2618         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2619         break;
 2620        case Op_VecX:
 2621         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2622         break;
 2623       case Op_VecY:
 2624       case Op_VecZ:
 2625         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2626         break;
 2627       default:
 2628         ShouldNotReachHere();
 2629       }
 2630     }
 2631 #endif
 2632   }
 2633 }
 2634 
 2635 template <class T>
 2636 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2637   int size = type2aelembytes(bt) * len;
 2638   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2639   for (int i = 0; i < len; i++) {
 2640     int offset = i * type2aelembytes(bt);
 2641     switch (bt) {
 2642       case T_BYTE: val->at(i) = con; break;
 2643       case T_SHORT: {
 2644         jshort c = con;
 2645         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2646         break;
 2647       }
 2648       case T_INT: {
 2649         jint c = con;
 2650         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2651         break;
 2652       }
 2653       case T_LONG: {
 2654         jlong c = con;
 2655         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2656         break;
 2657       }
 2658       case T_FLOAT: {
 2659         jfloat c = con;
 2660         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2661         break;
 2662       }
 2663       case T_DOUBLE: {
 2664         jdouble c = con;
 2665         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2666         break;
 2667       }
 2668       default: assert(false, "%s", type2name(bt));
 2669     }
 2670   }
 2671   return val;
 2672 }
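      // Illustration (assumed usage) for vreplicate_imm() above:
      // vreplicate_imm(T_INT, 1, 4) yields a 16-byte array holding four little-endian
      // copies of 0x00000001, presumably emitted into the constant table so the
      // replicated vector constant can be loaded with a single memory operand.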
 2673 
 2674 static inline jlong high_bit_set(BasicType bt) {
 2675   switch (bt) {
 2676     case T_BYTE:  return 0x8080808080808080;
 2677     case T_SHORT: return 0x8000800080008000;
 2678     case T_INT:   return 0x8000000080000000;
 2679     case T_LONG:  return 0x8000000000000000;
 2680     default:
 2681       ShouldNotReachHere();
 2682       return 0;
 2683   }
 2684 }
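      // Each constant in high_bit_set() above replicates the per-lane sign bit across
      // a 64-bit word, e.g. 0x8080808080808080 sets bit 7 of every byte lane and
      // 0x8000800080008000 sets bit 15 of every 16-bit lane.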
 2685 
 2686 #ifndef PRODUCT
 2687   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2688     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2689   }
 2690 #endif
 2691 
 2692   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2693     __ nop(_count);
 2694   }
 2695 
 2696   uint MachNopNode::size(PhaseRegAlloc*) const {
 2697     return _count;
 2698   }
 2699 
 2700 #ifndef PRODUCT
 2701   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2702     st->print("# breakpoint");
 2703   }
 2704 #endif
 2705 
 2706   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2707     __ int3();
 2708   }
 2709 
 2710   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2711     return MachNode::size(ra_);
 2712   }
 2713 
 2714 %}
 2715 
 2716 encode %{
 2717 
 2718   enc_class call_epilog %{
 2719     if (VerifyStackAtCalls) {
 2720       // Check that stack depth is unchanged: find majik cookie on stack
 2721       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2722       Label L;
 2723       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2724       __ jccb(Assembler::equal, L);
 2725       // Die if stack mismatch
 2726       __ int3();
 2727       __ bind(L);
 2728     }
 2729     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2730       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
 2731       // Search for the corresponding projection, get the register and emit code that initializes it.
 2732       uint con = (tf()->range_cc()->cnt() - 1);
 2733       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2734         ProjNode* proj = fast_out(i)->as_Proj();
 2735         if (proj->_con == con) {
 2736           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2737           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2738           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2739           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2740           __ testq(rax, rax);
 2741           __ setb(Assembler::notZero, toReg);
 2742           __ movzbl(toReg, toReg);
 2743           if (reg->is_stack()) {
 2744             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2745             __ movq(Address(rsp, st_off), toReg);
 2746           }
 2747           break;
 2748         }
 2749       }
 2750       if (return_value_is_used()) {
 2751         // An inline type is returned as fields in multiple registers.
 2752         // rax contains either an oop (if the inline type is buffered) or a pointer
 2753         // to the corresponding InlineKlass with its lowest bit set to 1. Zero rax
 2754         // when the lowest bit is set, so that C2 can use the oop after null checking.
 2755         // rax &= (rax & 1) - 1
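              // Illustration: if rax holds a tagged InlineKlass pointer (lowest bit 1),
              // (rax & 1) - 1 == 0 and the AND below clears rax; if rax holds a buffered
              // oop (lowest bit 0), (rax & 1) - 1 == -1 and rax is left unchanged.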
 2756         __ movptr(rscratch1, rax);
 2757         __ andptr(rscratch1, 0x1);
 2758         __ subptr(rscratch1, 0x1);
 2759         __ andptr(rax, rscratch1);
 2760       }
 2761     }
 2762   %}
 2763 
 2764 %}
 2765 
 2766 // Operands for bound floating-point register arguments
 2767 operand rxmm0() %{
 2768   constraint(ALLOC_IN_RC(xmm0_reg));
 2769   match(VecX);
 2770   format %{ %}
 2771   interface(REG_INTER);
 2772 %}
 2773 
 2774 //----------OPERANDS-----------------------------------------------------------
 2775 // Operand definitions must precede instruction definitions for correct parsing
 2776 // in the ADLC because operands constitute user defined types which are used in
 2777 // instruction definitions.
 2778 
 2779 // Vectors
 2780 
 2781 // Dummy generic vector class. Should be used for all vector operands.
 2782 // Replaced with vec[SDXYZ] during post-selection pass.
 2783 operand vec() %{
 2784   constraint(ALLOC_IN_RC(dynamic));
 2785   match(VecX);
 2786   match(VecY);
 2787   match(VecZ);
 2788   match(VecS);
 2789   match(VecD);
 2790 
 2791   format %{ %}
 2792   interface(REG_INTER);
 2793 %}
 2794 
 2795 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2796 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2797 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2798 // runtime code generation via reg_class_dynamic.
 2799 operand legVec() %{
 2800   constraint(ALLOC_IN_RC(dynamic));
 2801   match(VecX);
 2802   match(VecY);
 2803   match(VecZ);
 2804   match(VecS);
 2805   match(VecD);
 2806 
 2807   format %{ %}
 2808   interface(REG_INTER);
 2809 %}
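      // Post-selection specialization (illustrative): an instruction matched with a
      // generic vec or legVec operand has that operand rewritten to one of the
      // size-specific classes below (vecS..vecZ or legVecS..legVecZ), chosen from the
      // node's ideal vector register as in the specialization switch near the top of
      // this file.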
 2810 
 2811 // Replaces vec during post-selection cleanup. See above.
 2812 operand vecS() %{
 2813   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2814   match(VecS);
 2815 
 2816   format %{ %}
 2817   interface(REG_INTER);
 2818 %}
 2819 
 2820 // Replaces legVec during post-selection cleanup. See above.
 2821 operand legVecS() %{
 2822   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2823   match(VecS);
 2824 
 2825   format %{ %}
 2826   interface(REG_INTER);
 2827 %}
 2828 
 2829 // Replaces vec during post-selection cleanup. See above.
 2830 operand vecD() %{
 2831   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2832   match(VecD);
 2833 
 2834   format %{ %}
 2835   interface(REG_INTER);
 2836 %}
 2837 
 2838 // Replaces legVec during post-selection cleanup. See above.
 2839 operand legVecD() %{
 2840   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2841   match(VecD);
 2842 
 2843   format %{ %}
 2844   interface(REG_INTER);
 2845 %}
 2846 
 2847 // Replaces vec during post-selection cleanup. See above.
 2848 operand vecX() %{
 2849   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2850   match(VecX);
 2851 
 2852   format %{ %}
 2853   interface(REG_INTER);
 2854 %}
 2855 
 2856 // Replaces legVec during post-selection cleanup. See above.
 2857 operand legVecX() %{
 2858   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2859   match(VecX);
 2860 
 2861   format %{ %}
 2862   interface(REG_INTER);
 2863 %}
 2864 
 2865 // Replaces vec during post-selection cleanup. See above.
 2866 operand vecY() %{
 2867   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2868   match(VecY);
 2869 
 2870   format %{ %}
 2871   interface(REG_INTER);
 2872 %}
 2873 
 2874 // Replaces legVec during post-selection cleanup. See above.
 2875 operand legVecY() %{
 2876   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2877   match(VecY);
 2878 
 2879   format %{ %}
 2880   interface(REG_INTER);
 2881 %}
 2882 
 2883 // Replaces vec during post-selection cleanup. See above.
 2884 operand vecZ() %{
 2885   constraint(ALLOC_IN_RC(vectorz_reg));
 2886   match(VecZ);
 2887 
 2888   format %{ %}
 2889   interface(REG_INTER);
 2890 %}
 2891 
 2892 // Replaces legVec during post-selection cleanup. See above.
 2893 operand legVecZ() %{
 2894   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2895   match(VecZ);
 2896 
 2897   format %{ %}
 2898   interface(REG_INTER);
 2899 %}
 2900 
 2901 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2902 
 2903 // ============================================================================
 2904 
 2905 instruct ShouldNotReachHere() %{
 2906   match(Halt);
 2907   format %{ "stop\t# ShouldNotReachHere" %}
 2908   ins_encode %{
 2909     if (is_reachable()) {
 2910       const char* str = __ code_string(_halt_reason);
 2911       __ stop(str);
 2912     }
 2913   %}
 2914   ins_pipe(pipe_slow);
 2915 %}
 2916 
 2917 // ============================================================================
 2918 
 2919 instruct addF_reg(regF dst, regF src) %{
 2920   predicate((UseSSE>=1) && (UseAVX == 0));
 2921   match(Set dst (AddF dst src));
 2922 
 2923   format %{ "addss   $dst, $src" %}
 2924   ins_cost(150);
 2925   ins_encode %{
 2926     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2927   %}
 2928   ins_pipe(pipe_slow);
 2929 %}
 2930 
 2931 instruct addF_mem(regF dst, memory src) %{
 2932   predicate((UseSSE>=1) && (UseAVX == 0));
 2933   match(Set dst (AddF dst (LoadF src)));
 2934 
 2935   format %{ "addss   $dst, $src" %}
 2936   ins_cost(150);
 2937   ins_encode %{
 2938     __ addss($dst$$XMMRegister, $src$$Address);
 2939   %}
 2940   ins_pipe(pipe_slow);
 2941 %}
 2942 
 2943 instruct addF_imm(regF dst, immF con) %{
 2944   predicate((UseSSE>=1) && (UseAVX == 0));
 2945   match(Set dst (AddF dst con));
 2946   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2947   ins_cost(150);
 2948   ins_encode %{
 2949     __ addss($dst$$XMMRegister, $constantaddress($con));
 2950   %}
 2951   ins_pipe(pipe_slow);
 2952 %}
 2953 
 2954 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2955   predicate(UseAVX > 0);
 2956   match(Set dst (AddF src1 src2));
 2957 
 2958   format %{ "vaddss  $dst, $src1, $src2" %}
 2959   ins_cost(150);
 2960   ins_encode %{
 2961     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2962   %}
 2963   ins_pipe(pipe_slow);
 2964 %}
 2965 
 2966 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2967   predicate(UseAVX > 0);
 2968   match(Set dst (AddF src1 (LoadF src2)));
 2969 
 2970   format %{ "vaddss  $dst, $src1, $src2" %}
 2971   ins_cost(150);
 2972   ins_encode %{
 2973     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2974   %}
 2975   ins_pipe(pipe_slow);
 2976 %}
 2977 
 2978 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2979   predicate(UseAVX > 0);
 2980   match(Set dst (AddF src con));
 2981 
 2982   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2983   ins_cost(150);
 2984   ins_encode %{
 2985     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2986   %}
 2987   ins_pipe(pipe_slow);
 2988 %}
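      // Note for the scalar FP rules in this section: the SSE forms are destructive
      // two-operand instructions, hence the "Set dst (AddF dst src)" shape, while the
      // AVX forms use the non-destructive three-operand VEX encoding and therefore
      // take separate src1/src2 operands.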
 2989 
 2990 instruct addD_reg(regD dst, regD src) %{
 2991   predicate((UseSSE>=2) && (UseAVX == 0));
 2992   match(Set dst (AddD dst src));
 2993 
 2994   format %{ "addsd   $dst, $src" %}
 2995   ins_cost(150);
 2996   ins_encode %{
 2997     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2998   %}
 2999   ins_pipe(pipe_slow);
 3000 %}
 3001 
 3002 instruct addD_mem(regD dst, memory src) %{
 3003   predicate((UseSSE>=2) && (UseAVX == 0));
 3004   match(Set dst (AddD dst (LoadD src)));
 3005 
 3006   format %{ "addsd   $dst, $src" %}
 3007   ins_cost(150);
 3008   ins_encode %{
 3009     __ addsd($dst$$XMMRegister, $src$$Address);
 3010   %}
 3011   ins_pipe(pipe_slow);
 3012 %}
 3013 
 3014 instruct addD_imm(regD dst, immD con) %{
 3015   predicate((UseSSE>=2) && (UseAVX == 0));
 3016   match(Set dst (AddD dst con));
 3017   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3018   ins_cost(150);
 3019   ins_encode %{
 3020     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3021   %}
 3022   ins_pipe(pipe_slow);
 3023 %}
 3024 
 3025 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3026   predicate(UseAVX > 0);
 3027   match(Set dst (AddD src1 src2));
 3028 
 3029   format %{ "vaddsd  $dst, $src1, $src2" %}
 3030   ins_cost(150);
 3031   ins_encode %{
 3032     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3033   %}
 3034   ins_pipe(pipe_slow);
 3035 %}
 3036 
 3037 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3038   predicate(UseAVX > 0);
 3039   match(Set dst (AddD src1 (LoadD src2)));
 3040 
 3041   format %{ "vaddsd  $dst, $src1, $src2" %}
 3042   ins_cost(150);
 3043   ins_encode %{
 3044     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3045   %}
 3046   ins_pipe(pipe_slow);
 3047 %}
 3048 
 3049 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3050   predicate(UseAVX > 0);
 3051   match(Set dst (AddD src con));
 3052 
 3053   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3054   ins_cost(150);
 3055   ins_encode %{
 3056     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3057   %}
 3058   ins_pipe(pipe_slow);
 3059 %}
 3060 
 3061 instruct subF_reg(regF dst, regF src) %{
 3062   predicate((UseSSE>=1) && (UseAVX == 0));
 3063   match(Set dst (SubF dst src));
 3064 
 3065   format %{ "subss   $dst, $src" %}
 3066   ins_cost(150);
 3067   ins_encode %{
 3068     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3069   %}
 3070   ins_pipe(pipe_slow);
 3071 %}
 3072 
 3073 instruct subF_mem(regF dst, memory src) %{
 3074   predicate((UseSSE>=1) && (UseAVX == 0));
 3075   match(Set dst (SubF dst (LoadF src)));
 3076 
 3077   format %{ "subss   $dst, $src" %}
 3078   ins_cost(150);
 3079   ins_encode %{
 3080     __ subss($dst$$XMMRegister, $src$$Address);
 3081   %}
 3082   ins_pipe(pipe_slow);
 3083 %}
 3084 
 3085 instruct subF_imm(regF dst, immF con) %{
 3086   predicate((UseSSE>=1) && (UseAVX == 0));
 3087   match(Set dst (SubF dst con));
 3088   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3089   ins_cost(150);
 3090   ins_encode %{
 3091     __ subss($dst$$XMMRegister, $constantaddress($con));
 3092   %}
 3093   ins_pipe(pipe_slow);
 3094 %}
 3095 
 3096 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3097   predicate(UseAVX > 0);
 3098   match(Set dst (SubF src1 src2));
 3099 
 3100   format %{ "vsubss  $dst, $src1, $src2" %}
 3101   ins_cost(150);
 3102   ins_encode %{
 3103     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3104   %}
 3105   ins_pipe(pipe_slow);
 3106 %}
 3107 
 3108 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3109   predicate(UseAVX > 0);
 3110   match(Set dst (SubF src1 (LoadF src2)));
 3111 
 3112   format %{ "vsubss  $dst, $src1, $src2" %}
 3113   ins_cost(150);
 3114   ins_encode %{
 3115     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3116   %}
 3117   ins_pipe(pipe_slow);
 3118 %}
 3119 
 3120 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3121   predicate(UseAVX > 0);
 3122   match(Set dst (SubF src con));
 3123 
 3124   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3125   ins_cost(150);
 3126   ins_encode %{
 3127     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3128   %}
 3129   ins_pipe(pipe_slow);
 3130 %}
 3131 
 3132 instruct subD_reg(regD dst, regD src) %{
 3133   predicate((UseSSE>=2) && (UseAVX == 0));
 3134   match(Set dst (SubD dst src));
 3135 
 3136   format %{ "subsd   $dst, $src" %}
 3137   ins_cost(150);
 3138   ins_encode %{
 3139     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3140   %}
 3141   ins_pipe(pipe_slow);
 3142 %}
 3143 
 3144 instruct subD_mem(regD dst, memory src) %{
 3145   predicate((UseSSE>=2) && (UseAVX == 0));
 3146   match(Set dst (SubD dst (LoadD src)));
 3147 
 3148   format %{ "subsd   $dst, $src" %}
 3149   ins_cost(150);
 3150   ins_encode %{
 3151     __ subsd($dst$$XMMRegister, $src$$Address);
 3152   %}
 3153   ins_pipe(pipe_slow);
 3154 %}
 3155 
 3156 instruct subD_imm(regD dst, immD con) %{
 3157   predicate((UseSSE>=2) && (UseAVX == 0));
 3158   match(Set dst (SubD dst con));
 3159   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3160   ins_cost(150);
 3161   ins_encode %{
 3162     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3163   %}
 3164   ins_pipe(pipe_slow);
 3165 %}
 3166 
 3167 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3168   predicate(UseAVX > 0);
 3169   match(Set dst (SubD src1 src2));
 3170 
 3171   format %{ "vsubsd  $dst, $src1, $src2" %}
 3172   ins_cost(150);
 3173   ins_encode %{
 3174     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3175   %}
 3176   ins_pipe(pipe_slow);
 3177 %}
 3178 
 3179 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3180   predicate(UseAVX > 0);
 3181   match(Set dst (SubD src1 (LoadD src2)));
 3182 
 3183   format %{ "vsubsd  $dst, $src1, $src2" %}
 3184   ins_cost(150);
 3185   ins_encode %{
 3186     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3187   %}
 3188   ins_pipe(pipe_slow);
 3189 %}
 3190 
 3191 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3192   predicate(UseAVX > 0);
 3193   match(Set dst (SubD src con));
 3194 
 3195   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3196   ins_cost(150);
 3197   ins_encode %{
 3198     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3199   %}
 3200   ins_pipe(pipe_slow);
 3201 %}
 3202 
 3203 instruct mulF_reg(regF dst, regF src) %{
 3204   predicate((UseSSE>=1) && (UseAVX == 0));
 3205   match(Set dst (MulF dst src));
 3206 
 3207   format %{ "mulss   $dst, $src" %}
 3208   ins_cost(150);
 3209   ins_encode %{
 3210     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3211   %}
 3212   ins_pipe(pipe_slow);
 3213 %}
 3214 
 3215 instruct mulF_mem(regF dst, memory src) %{
 3216   predicate((UseSSE>=1) && (UseAVX == 0));
 3217   match(Set dst (MulF dst (LoadF src)));
 3218 
 3219   format %{ "mulss   $dst, $src" %}
 3220   ins_cost(150);
 3221   ins_encode %{
 3222     __ mulss($dst$$XMMRegister, $src$$Address);
 3223   %}
 3224   ins_pipe(pipe_slow);
 3225 %}
 3226 
 3227 instruct mulF_imm(regF dst, immF con) %{
 3228   predicate((UseSSE>=1) && (UseAVX == 0));
 3229   match(Set dst (MulF dst con));
 3230   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3231   ins_cost(150);
 3232   ins_encode %{
 3233     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3234   %}
 3235   ins_pipe(pipe_slow);
 3236 %}
 3237 
 3238 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3239   predicate(UseAVX > 0);
 3240   match(Set dst (MulF src1 src2));
 3241 
 3242   format %{ "vmulss  $dst, $src1, $src2" %}
 3243   ins_cost(150);
 3244   ins_encode %{
 3245     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3246   %}
 3247   ins_pipe(pipe_slow);
 3248 %}
 3249 
 3250 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3251   predicate(UseAVX > 0);
 3252   match(Set dst (MulF src1 (LoadF src2)));
 3253 
 3254   format %{ "vmulss  $dst, $src1, $src2" %}
 3255   ins_cost(150);
 3256   ins_encode %{
 3257     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3258   %}
 3259   ins_pipe(pipe_slow);
 3260 %}
 3261 
 3262 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3263   predicate(UseAVX > 0);
 3264   match(Set dst (MulF src con));
 3265 
 3266   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3267   ins_cost(150);
 3268   ins_encode %{
 3269     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3270   %}
 3271   ins_pipe(pipe_slow);
 3272 %}
 3273 
 3274 instruct mulD_reg(regD dst, regD src) %{
 3275   predicate((UseSSE>=2) && (UseAVX == 0));
 3276   match(Set dst (MulD dst src));
 3277 
 3278   format %{ "mulsd   $dst, $src" %}
 3279   ins_cost(150);
 3280   ins_encode %{
 3281     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3282   %}
 3283   ins_pipe(pipe_slow);
 3284 %}
 3285 
 3286 instruct mulD_mem(regD dst, memory src) %{
 3287   predicate((UseSSE>=2) && (UseAVX == 0));
 3288   match(Set dst (MulD dst (LoadD src)));
 3289 
 3290   format %{ "mulsd   $dst, $src" %}
 3291   ins_cost(150);
 3292   ins_encode %{
 3293     __ mulsd($dst$$XMMRegister, $src$$Address);
 3294   %}
 3295   ins_pipe(pipe_slow);
 3296 %}
 3297 
 3298 instruct mulD_imm(regD dst, immD con) %{
 3299   predicate((UseSSE>=2) && (UseAVX == 0));
 3300   match(Set dst (MulD dst con));
 3301   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3302   ins_cost(150);
 3303   ins_encode %{
 3304     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3305   %}
 3306   ins_pipe(pipe_slow);
 3307 %}
 3308 
 3309 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3310   predicate(UseAVX > 0);
 3311   match(Set dst (MulD src1 src2));
 3312 
 3313   format %{ "vmulsd  $dst, $src1, $src2" %}
 3314   ins_cost(150);
 3315   ins_encode %{
 3316     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3317   %}
 3318   ins_pipe(pipe_slow);
 3319 %}
 3320 
 3321 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3322   predicate(UseAVX > 0);
 3323   match(Set dst (MulD src1 (LoadD src2)));
 3324 
 3325   format %{ "vmulsd  $dst, $src1, $src2" %}
 3326   ins_cost(150);
 3327   ins_encode %{
 3328     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3329   %}
 3330   ins_pipe(pipe_slow);
 3331 %}
 3332 
 3333 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3334   predicate(UseAVX > 0);
 3335   match(Set dst (MulD src con));
 3336 
 3337   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3338   ins_cost(150);
 3339   ins_encode %{
 3340     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3341   %}
 3342   ins_pipe(pipe_slow);
 3343 %}
 3344 
 3345 instruct divF_reg(regF dst, regF src) %{
 3346   predicate((UseSSE>=1) && (UseAVX == 0));
 3347   match(Set dst (DivF dst src));
 3348 
 3349   format %{ "divss   $dst, $src" %}
 3350   ins_cost(150);
 3351   ins_encode %{
 3352     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3353   %}
 3354   ins_pipe(pipe_slow);
 3355 %}
 3356 
 3357 instruct divF_mem(regF dst, memory src) %{
 3358   predicate((UseSSE>=1) && (UseAVX == 0));
 3359   match(Set dst (DivF dst (LoadF src)));
 3360 
 3361   format %{ "divss   $dst, $src" %}
 3362   ins_cost(150);
 3363   ins_encode %{
 3364     __ divss($dst$$XMMRegister, $src$$Address);
 3365   %}
 3366   ins_pipe(pipe_slow);
 3367 %}
 3368 
 3369 instruct divF_imm(regF dst, immF con) %{
 3370   predicate((UseSSE>=1) && (UseAVX == 0));
 3371   match(Set dst (DivF dst con));
 3372   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3373   ins_cost(150);
 3374   ins_encode %{
 3375     __ divss($dst$$XMMRegister, $constantaddress($con));
 3376   %}
 3377   ins_pipe(pipe_slow);
 3378 %}
 3379 
 3380 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3381   predicate(UseAVX > 0);
 3382   match(Set dst (DivF src1 src2));
 3383 
 3384   format %{ "vdivss  $dst, $src1, $src2" %}
 3385   ins_cost(150);
 3386   ins_encode %{
 3387     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3388   %}
 3389   ins_pipe(pipe_slow);
 3390 %}
 3391 
 3392 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3393   predicate(UseAVX > 0);
 3394   match(Set dst (DivF src1 (LoadF src2)));
 3395 
 3396   format %{ "vdivss  $dst, $src1, $src2" %}
 3397   ins_cost(150);
 3398   ins_encode %{
 3399     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3400   %}
 3401   ins_pipe(pipe_slow);
 3402 %}
 3403 
 3404 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3405   predicate(UseAVX > 0);
 3406   match(Set dst (DivF src con));
 3407 
 3408   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3409   ins_cost(150);
 3410   ins_encode %{
 3411     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3412   %}
 3413   ins_pipe(pipe_slow);
 3414 %}
 3415 
 3416 instruct divD_reg(regD dst, regD src) %{
 3417   predicate((UseSSE>=2) && (UseAVX == 0));
 3418   match(Set dst (DivD dst src));
 3419 
 3420   format %{ "divsd   $dst, $src" %}
 3421   ins_cost(150);
 3422   ins_encode %{
 3423     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3424   %}
 3425   ins_pipe(pipe_slow);
 3426 %}
 3427 
 3428 instruct divD_mem(regD dst, memory src) %{
 3429   predicate((UseSSE>=2) && (UseAVX == 0));
 3430   match(Set dst (DivD dst (LoadD src)));
 3431 
 3432   format %{ "divsd   $dst, $src" %}
 3433   ins_cost(150);
 3434   ins_encode %{
 3435     __ divsd($dst$$XMMRegister, $src$$Address);
 3436   %}
 3437   ins_pipe(pipe_slow);
 3438 %}
 3439 
 3440 instruct divD_imm(regD dst, immD con) %{
 3441   predicate((UseSSE>=2) && (UseAVX == 0));
 3442   match(Set dst (DivD dst con));
 3443   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3444   ins_cost(150);
 3445   ins_encode %{
 3446     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3447   %}
 3448   ins_pipe(pipe_slow);
 3449 %}
 3450 
 3451 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3452   predicate(UseAVX > 0);
 3453   match(Set dst (DivD src1 src2));
 3454 
 3455   format %{ "vdivsd  $dst, $src1, $src2" %}
 3456   ins_cost(150);
 3457   ins_encode %{
 3458     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3459   %}
 3460   ins_pipe(pipe_slow);
 3461 %}
 3462 
 3463 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3464   predicate(UseAVX > 0);
 3465   match(Set dst (DivD src1 (LoadD src2)));
 3466 
 3467   format %{ "vdivsd  $dst, $src1, $src2" %}
 3468   ins_cost(150);
 3469   ins_encode %{
 3470     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3471   %}
 3472   ins_pipe(pipe_slow);
 3473 %}
 3474 
 3475 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3476   predicate(UseAVX > 0);
 3477   match(Set dst (DivD src con));
 3478 
 3479   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3480   ins_cost(150);
 3481   ins_encode %{
 3482     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3483   %}
 3484   ins_pipe(pipe_slow);
 3485 %}
 3486 
 3487 instruct absF_reg(regF dst) %{
 3488   predicate((UseSSE>=1) && (UseAVX == 0));
 3489   match(Set dst (AbsF dst));
 3490   ins_cost(150);
 3491   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3492   ins_encode %{
 3493     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3494   %}
 3495   ins_pipe(pipe_slow);
 3496 %}
 3497 
 3498 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3499   predicate(UseAVX > 0);
 3500   match(Set dst (AbsF src));
 3501   ins_cost(150);
 3502   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3503   ins_encode %{
 3504     int vlen_enc = Assembler::AVX_128bit;
 3505     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3506               ExternalAddress(float_signmask()), vlen_enc);
 3507   %}
 3508   ins_pipe(pipe_slow);
 3509 %}
 3510 
 3511 instruct absD_reg(regD dst) %{
 3512   predicate((UseSSE>=2) && (UseAVX == 0));
 3513   match(Set dst (AbsD dst));
 3514   ins_cost(150);
 3515   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3516             "# abs double by sign masking" %}
 3517   ins_encode %{
 3518     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3519   %}
 3520   ins_pipe(pipe_slow);
 3521 %}
 3522 
 3523 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3524   predicate(UseAVX > 0);
 3525   match(Set dst (AbsD src));
 3526   ins_cost(150);
 3527   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3528             "# abs double by sign masking" %}
 3529   ins_encode %{
 3530     int vlen_enc = Assembler::AVX_128bit;
 3531     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3532               ExternalAddress(double_signmask()), vlen_enc);
 3533   %}
 3534   ins_pipe(pipe_slow);
 3535 %}
 3536 
 3537 instruct negF_reg(regF dst) %{
 3538   predicate((UseSSE>=1) && (UseAVX == 0));
 3539   match(Set dst (NegF dst));
 3540   ins_cost(150);
 3541   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3542   ins_encode %{
 3543     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3544   %}
 3545   ins_pipe(pipe_slow);
 3546 %}
 3547 
 3548 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3549   predicate(UseAVX > 0);
 3550   match(Set dst (NegF src));
 3551   ins_cost(150);
 3552   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3553   ins_encode %{
 3554     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3555                  ExternalAddress(float_signflip()));
 3556   %}
 3557   ins_pipe(pipe_slow);
 3558 %}
 3559 
 3560 instruct negD_reg(regD dst) %{
 3561   predicate((UseSSE>=2) && (UseAVX == 0));
 3562   match(Set dst (NegD dst));
 3563   ins_cost(150);
 3564   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3565             "# neg double by sign flipping" %}
 3566   ins_encode %{
 3567     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3568   %}
 3569   ins_pipe(pipe_slow);
 3570 %}
 3571 
 3572 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3573   predicate(UseAVX > 0);
 3574   match(Set dst (NegD src));
 3575   ins_cost(150);
 3576   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3577             "# neg double by sign flipping" %}
 3578   ins_encode %{
 3579     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3580                  ExternalAddress(double_signflip()));
 3581   %}
 3582   ins_pipe(pipe_slow);
 3583 %}
 3584 
 3585 // The sqrtss instruction writes only the low element and merges the rest of dst, so dst needs to be
 3586 // pre-initialized for best performance. Therefore only the rule where the input is pre-loaded into dst is defined below.
 3587 instruct sqrtF_reg(regF dst) %{
 3588   predicate(UseSSE>=1);
 3589   match(Set dst (SqrtF dst));
 3590   format %{ "sqrtss  $dst, $dst" %}
 3591   ins_encode %{
 3592     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3593   %}
 3594   ins_pipe(pipe_slow);
 3595 %}
 3596 
 3597 // The sqrtsd instruction writes only the low element and merges the rest of dst, so dst needs to be
 3598 // pre-initialized for best performance. Therefore only the rule where the input is pre-loaded into dst is defined below.
 3599 instruct sqrtD_reg(regD dst) %{
 3600   predicate(UseSSE>=2);
 3601   match(Set dst (SqrtD dst));
 3602   format %{ "sqrtsd  $dst, $dst" %}
 3603   ins_encode %{
 3604     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3605   %}
 3606   ins_pipe(pipe_slow);
 3607 %}
 3608 
 3609 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3610   effect(TEMP tmp);
 3611   match(Set dst (ConvF2HF src));
 3612   ins_cost(125);
 3613   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3614   ins_encode %{
 3615     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3616   %}
 3617   ins_pipe( pipe_slow );
 3618 %}
 3619 
 3620 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3621   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3622   effect(TEMP ktmp, TEMP rtmp);
 3623   match(Set mem (StoreC mem (ConvF2HF src)));
 3624   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3625   ins_encode %{
 3626     __ movl($rtmp$$Register, 0x1);
 3627     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
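          // The single-bit mask in $ktmp restricts the EVEX store below to lane 0,
          // so only one 16-bit result is written to $mem.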
 3628     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3629   %}
 3630   ins_pipe( pipe_slow );
 3631 %}
 3632 
 3633 instruct vconvF2HF(vec dst, vec src) %{
 3634   match(Set dst (VectorCastF2HF src));
 3635   format %{ "vector_conv_F2HF $dst $src" %}
 3636   ins_encode %{
 3637     int vlen_enc = vector_length_encoding(this, $src);
 3638     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3639   %}
 3640   ins_pipe( pipe_slow );
 3641 %}
 3642 
 3643 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3644   predicate(n->as_StoreVector()->memory_size() >= 16);
 3645   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3646   format %{ "vcvtps2ph $mem,$src" %}
 3647   ins_encode %{
 3648     int vlen_enc = vector_length_encoding(this, $src);
 3649     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3650   %}
 3651   ins_pipe( pipe_slow );
 3652 %}
 3653 
 3654 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3655   match(Set dst (ConvHF2F src));
 3656   format %{ "vcvtph2ps $dst,$src" %}
 3657   ins_encode %{
 3658     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3659   %}
 3660   ins_pipe( pipe_slow );
 3661 %}
 3662 
 3663 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3664   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3665   format %{ "vcvtph2ps $dst,$mem" %}
 3666   ins_encode %{
 3667     int vlen_enc = vector_length_encoding(this);
 3668     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3669   %}
 3670   ins_pipe( pipe_slow );
 3671 %}
 3672 
 3673 instruct vconvHF2F(vec dst, vec src) %{
 3674   match(Set dst (VectorCastHF2F src));
 3675   ins_cost(125);
 3676   format %{ "vector_conv_HF2F $dst,$src" %}
 3677   ins_encode %{
 3678     int vlen_enc = vector_length_encoding(this);
 3679     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3680   %}
 3681   ins_pipe( pipe_slow );
 3682 %}
 3683 
 3684 // ---------------------------------------- VectorReinterpret ------------------------------------
 3685 instruct reinterpret_mask(kReg dst) %{
 3686   predicate(n->bottom_type()->isa_vectmask() &&
 3687             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3688   match(Set dst (VectorReinterpret dst));
 3689   ins_cost(125);
 3690   format %{ "vector_reinterpret $dst\t!" %}
 3691   ins_encode %{
 3692     // empty
 3693   %}
 3694   ins_pipe( pipe_slow );
 3695 %}
 3696 
 3697 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3698   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3699             n->bottom_type()->isa_vectmask() &&
 3700             n->in(1)->bottom_type()->isa_vectmask() &&
 3701             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3702             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same total size
 3703   match(Set dst (VectorReinterpret src));
 3704   effect(TEMP xtmp);
 3705   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3706   ins_encode %{
 3707      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3708      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3709      assert(src_sz == dst_sz , "src and dst size mismatch");
 3710      int vlen_enc = vector_length_encoding(src_sz);
 3711      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3712      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3713   %}
 3714   ins_pipe( pipe_slow );
 3715 %}
 3716 
 3717 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3718   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3719             n->bottom_type()->isa_vectmask() &&
 3720             n->in(1)->bottom_type()->isa_vectmask() &&
 3721             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3722              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3723             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same total size
 3724   match(Set dst (VectorReinterpret src));
 3725   effect(TEMP xtmp);
 3726   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3727   ins_encode %{
 3728      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3729      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3730      assert(src_sz == dst_sz , "src and dst size mismatch");
 3731      int vlen_enc = vector_length_encoding(src_sz);
 3732      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3733      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3734   %}
 3735   ins_pipe( pipe_slow );
 3736 %}
 3737 
 3738 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3739   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3740             n->bottom_type()->isa_vectmask() &&
 3741             n->in(1)->bottom_type()->isa_vectmask() &&
 3742             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3743              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3744             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have the same total size
 3745   match(Set dst (VectorReinterpret src));
 3746   effect(TEMP xtmp);
 3747   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3748   ins_encode %{
 3749      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3750      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3751      assert(src_sz == dst_sz , "src and dst size mismatch");
 3752      int vlen_enc = vector_length_encoding(src_sz);
 3753      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3754      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3755   %}
 3756   ins_pipe( pipe_slow );
 3757 %}
 3758 
 3759 instruct reinterpret(vec dst) %{
 3760   predicate(!n->bottom_type()->isa_vectmask() &&
 3761             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3762   match(Set dst (VectorReinterpret dst));
 3763   ins_cost(125);
 3764   format %{ "vector_reinterpret $dst\t!" %}
 3765   ins_encode %{
 3766     // empty
 3767   %}
 3768   ins_pipe( pipe_slow );
 3769 %}
 3770 
 3771 instruct reinterpret_expand(vec dst, vec src) %{
 3772   predicate(UseAVX == 0 &&
 3773             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3774   match(Set dst (VectorReinterpret src));
 3775   ins_cost(125);
 3776   effect(TEMP dst);
 3777   format %{ "vector_reinterpret_expand $dst,$src" %}
 3778   ins_encode %{
 3779     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3780     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3781 
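    // Widen by masking: AND src with a constant that keeps only the low src-sized bytes,
    // so the upper part of dst is left zero.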
 3782     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3783     if (src_vlen_in_bytes == 4) {
 3784       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3785     } else {
 3786       assert(src_vlen_in_bytes == 8, "");
 3787       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3788     }
 3789     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3790   %}
 3791   ins_pipe( pipe_slow );
 3792 %}
 3793 
 3794 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3795   predicate(UseAVX > 0 &&
 3796             !n->bottom_type()->isa_vectmask() &&
 3797             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3798             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3799   match(Set dst (VectorReinterpret src));
 3800   ins_cost(125);
 3801   format %{ "vector_reinterpret_expand $dst,$src" %}
 3802   ins_encode %{
 3803     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3804   %}
 3805   ins_pipe( pipe_slow );
 3806 %}
 3807 
 3808 
 3809 instruct vreinterpret_expand(legVec dst, vec src) %{
 3810   predicate(UseAVX > 0 &&
 3811             !n->bottom_type()->isa_vectmask() &&
 3812             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3813             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3814   match(Set dst (VectorReinterpret src));
 3815   ins_cost(125);
 3816   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3817   ins_encode %{
 3818     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3819       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3820       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3821       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3822       default: ShouldNotReachHere();
 3823     }
 3824   %}
 3825   ins_pipe( pipe_slow );
 3826 %}
 3827 
 3828 instruct reinterpret_shrink(vec dst, legVec src) %{
 3829   predicate(!n->bottom_type()->isa_vectmask() &&
 3830             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3831   match(Set dst (VectorReinterpret src));
 3832   ins_cost(125);
 3833   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3834   ins_encode %{
 3835     switch (Matcher::vector_length_in_bytes(this)) {
 3836       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3837       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3838       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3839       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3840       default: ShouldNotReachHere();
 3841     }
 3842   %}
 3843   ins_pipe( pipe_slow );
 3844 %}
 3845 
 3846 // ----------------------------------------------------------------------------------------------------
 3847 
 3848 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3849   match(Set dst (RoundDoubleMode src rmode));
 3850   format %{ "roundsd $dst,$src" %}
 3851   ins_cost(150);
 3852   ins_encode %{
 3853     assert(UseSSE >= 4, "required");
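    // roundsd writes only the low 64 bits of dst and preserves the rest; when dst != src
    // on plain SSE, clear dst first so the result does not depend on its stale contents.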
 3854     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3855       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3856     }
 3857     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3858   %}
 3859   ins_pipe(pipe_slow);
 3860 %}
 3861 
 3862 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3863   match(Set dst (RoundDoubleMode con rmode));
 3864   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3865   ins_cost(150);
 3866   ins_encode %{
 3867     assert(UseSSE >= 4, "required");
 3868     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3869   %}
 3870   ins_pipe(pipe_slow);
 3871 %}
 3872 
 3873 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3874   predicate(Matcher::vector_length(n) < 8);
 3875   match(Set dst (RoundDoubleModeV src rmode));
 3876   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3877   ins_encode %{
 3878     assert(UseAVX > 0, "required");
 3879     int vlen_enc = vector_length_encoding(this);
 3880     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3881   %}
 3882   ins_pipe( pipe_slow );
 3883 %}
 3884 
 3885 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3886   predicate(Matcher::vector_length(n) == 8);
 3887   match(Set dst (RoundDoubleModeV src rmode));
 3888   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3889   ins_encode %{
 3890     assert(UseAVX > 2, "required");
 3891     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3892   %}
 3893   ins_pipe( pipe_slow );
 3894 %}
 3895 
 3896 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3897   predicate(Matcher::vector_length(n) < 8);
 3898   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3899   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3900   ins_encode %{
 3901     assert(UseAVX > 0, "required");
 3902     int vlen_enc = vector_length_encoding(this);
 3903     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3904   %}
 3905   ins_pipe( pipe_slow );
 3906 %}
 3907 
 3908 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3909   predicate(Matcher::vector_length(n) == 8);
 3910   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3911   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3912   ins_encode %{
 3913     assert(UseAVX > 2, "required");
 3914     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3915   %}
 3916   ins_pipe( pipe_slow );
 3917 %}
 3918 
 3919 instruct onspinwait() %{
 3920   match(OnSpinWait);
 3921   ins_cost(200);
 3922 
 3923   format %{
 3924     $$template
 3925     $$emit$$"pause\t! membar_onspinwait"
 3926   %}
 3927   ins_encode %{
 3928     __ pause();
 3929   %}
 3930   ins_pipe(pipe_slow);
 3931 %}
 3932 
 3933 // a * b + c
 3934 instruct fmaD_reg(regD a, regD b, regD c) %{
 3935   match(Set c (FmaD  c (Binary a b)));
 3936   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3937   ins_cost(150);
 3938   ins_encode %{
 3939     assert(UseFMA, "Needs FMA instructions support.");
 3940     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3941   %}
 3942   ins_pipe( pipe_slow );
 3943 %}
 3944 
 3945 // a * b + c
 3946 instruct fmaF_reg(regF a, regF b, regF c) %{
 3947   match(Set c (FmaF  c (Binary a b)));
 3948   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3949   ins_cost(150);
 3950   ins_encode %{
 3951     assert(UseFMA, "Needs FMA instructions support.");
 3952     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3953   %}
 3954   ins_pipe( pipe_slow );
 3955 %}
 3956 
 3957 // ====================VECTOR INSTRUCTIONS=====================================
 3958 
 3959 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3960 instruct MoveVec2Leg(legVec dst, vec src) %{
 3961   match(Set dst src);
 3962   format %{ "" %}
 3963   ins_encode %{
 3964     ShouldNotReachHere();
 3965   %}
 3966   ins_pipe( fpu_reg_reg );
 3967 %}
 3968 
 3969 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3970   match(Set dst src);
 3971   format %{ "" %}
 3972   ins_encode %{
 3973     ShouldNotReachHere();
 3974   %}
 3975   ins_pipe( fpu_reg_reg );
 3976 %}
 3977 
 3978 // ============================================================================
 3979 
 3980 // Load vectors generic operand pattern
 3981 instruct loadV(vec dst, memory mem) %{
 3982   match(Set dst (LoadVector mem));
 3983   ins_cost(125);
 3984   format %{ "load_vector $dst,$mem" %}
 3985   ins_encode %{
 3986     BasicType bt = Matcher::vector_element_basic_type(this);
 3987     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3988   %}
 3989   ins_pipe( pipe_slow );
 3990 %}
 3991 
 3992 // Store vectors generic operand pattern.
 3993 instruct storeV(memory mem, vec src) %{
 3994   match(Set mem (StoreVector mem src));
 3995   ins_cost(145);
 3996   format %{ "store_vector $mem,$src\n\t" %}
 3997   ins_encode %{
 3998     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3999       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4000       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4001       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4002       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4003       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4004       default: ShouldNotReachHere();
 4005     }
 4006   %}
 4007   ins_pipe( pipe_slow );
 4008 %}
 4009 
 4010 // ---------------------------------------- Gather ------------------------------------
 4011 
 4012 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4013 
 4014 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4015   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4016             Matcher::vector_length_in_bytes(n) <= 32);
 4017   match(Set dst (LoadVectorGather mem idx));
 4018   effect(TEMP dst, TEMP tmp, TEMP mask);
 4019   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4020   ins_encode %{
 4021     int vlen_enc = vector_length_encoding(this);
 4022     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4023     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
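    // AVX2 gathers require an explicit mask vector; set all bits so every element is
    // loaded (the gather instruction consumes the mask as it completes).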
 4024     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4025     __ lea($tmp$$Register, $mem$$Address);
 4026     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4027   %}
 4028   ins_pipe( pipe_slow );
 4029 %}
 4030 
 4031 
 4032 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4033   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4034             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4035   match(Set dst (LoadVectorGather mem idx));
 4036   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4038   ins_encode %{
 4039     int vlen_enc = vector_length_encoding(this);
 4040     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4041     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4042     __ lea($tmp$$Register, $mem$$Address);
 4043     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4044   %}
 4045   ins_pipe( pipe_slow );
 4046 %}
 4047 
 4048 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4049   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4050             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4051   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4052   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4054   ins_encode %{
 4055     assert(UseAVX > 2, "sanity");
 4056     int vlen_enc = vector_length_encoding(this);
 4057     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4058     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary opmask register.
 4061     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4062     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4063     __ lea($tmp$$Register, $mem$$Address);
 4064     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4065   %}
 4066   ins_pipe( pipe_slow );
 4067 %}
 4068 
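// Subword (byte/short) gathers: x86 gather instructions only support 32- and 64-bit
// elements, so these variants assemble the vector from scalar loads instead.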
 4069 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4070   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4071   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4072   effect(TEMP tmp, TEMP rtmp);
 4073   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4074   ins_encode %{
 4075     int vlen_enc = vector_length_encoding(this);
 4076     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4077     __ lea($tmp$$Register, $mem$$Address);
 4078     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4079   %}
 4080   ins_pipe( pipe_slow );
 4081 %}
 4082 
 4083 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4084                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4085   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4086   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4087   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4088   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4089   ins_encode %{
 4090     int vlen_enc = vector_length_encoding(this);
 4091     int vector_len = Matcher::vector_length(this);
 4092     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4093     __ lea($tmp$$Register, $mem$$Address);
 4094     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4095     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4096                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4097   %}
 4098   ins_pipe( pipe_slow );
 4099 %}
 4100 
 4101 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4102   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4103   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4104   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4105   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4106   ins_encode %{
 4107     int vlen_enc = vector_length_encoding(this);
 4108     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4109     __ lea($tmp$$Register, $mem$$Address);
 4110     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4111   %}
 4112   ins_pipe( pipe_slow );
 4113 %}
 4114 
 4115 
 4116 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4117                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4118   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4119   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4120   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4121   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4122   ins_encode %{
 4123     int vlen_enc = vector_length_encoding(this);
 4124     int vector_len = Matcher::vector_length(this);
 4125     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4126     __ lea($tmp$$Register, $mem$$Address);
 4127     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4128     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4129                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4130   %}
 4131   ins_pipe( pipe_slow );
 4132 %}
 4133 
 4134 
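// Masked subword gathers, AVX512BW flavor: the mask arrives in an opmask register and is
// moved to a GPR with kmovql before the scalar emulation. The AVX2 flavors further below
// take the mask in a vector register and compress it to a GPR with vpmovmskb instead.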
 4135 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4136   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4137   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4138   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4139   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4140   ins_encode %{
 4141     int vlen_enc = vector_length_encoding(this);
 4142     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4143     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4144     __ lea($tmp$$Register, $mem$$Address);
 4145     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4146     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4147   %}
 4148   ins_pipe( pipe_slow );
 4149 %}
 4150 
 4151 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4152                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4153   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4154   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4155   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4156   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4157   ins_encode %{
 4158     int vlen_enc = vector_length_encoding(this);
 4159     int vector_len = Matcher::vector_length(this);
 4160     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4161     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4162     __ lea($tmp$$Register, $mem$$Address);
 4163     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4164     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4165     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4166                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4167   %}
 4168   ins_pipe( pipe_slow );
 4169 %}
 4170 
 4171 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4172   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4173   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4174   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4175   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4176   ins_encode %{
 4177     int vlen_enc = vector_length_encoding(this);
 4178     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4179     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4180     __ lea($tmp$$Register, $mem$$Address);
 4181     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4182     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4183                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4184   %}
 4185   ins_pipe( pipe_slow );
 4186 %}
 4187 
 4188 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4189                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4190   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4191   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4192   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4193   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4194   ins_encode %{
 4195     int vlen_enc = vector_length_encoding(this);
 4196     int vector_len = Matcher::vector_length(this);
 4197     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4198     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4199     __ lea($tmp$$Register, $mem$$Address);
 4200     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4201     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4202     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4203                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4204   %}
 4205   ins_pipe( pipe_slow );
 4206 %}
 4207 
 4208 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4209   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4210   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4211   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4212   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4213   ins_encode %{
 4214     int vlen_enc = vector_length_encoding(this);
 4215     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4216     __ lea($tmp$$Register, $mem$$Address);
 4217     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
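    // vpmovmskb yields one mask bit per byte; for short elements keep every other bit
    // so the mask has one bit per element.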
 4218     if (elem_bt == T_SHORT) {
 4219       __ movl($mask_idx$$Register, 0x55555555);
 4220       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4221     }
 4222     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4223     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4224   %}
 4225   ins_pipe( pipe_slow );
 4226 %}
 4227 
 4228 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4229                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4230   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4231   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4232   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4233   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4234   ins_encode %{
 4235     int vlen_enc = vector_length_encoding(this);
 4236     int vector_len = Matcher::vector_length(this);
 4237     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4238     __ lea($tmp$$Register, $mem$$Address);
 4239     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4240     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4241     if (elem_bt == T_SHORT) {
 4242       __ movl($mask_idx$$Register, 0x55555555);
 4243       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4244     }
 4245     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4246     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4247                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4248   %}
 4249   ins_pipe( pipe_slow );
 4250 %}
 4251 
 4252 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4253   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4254   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4255   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4256   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4257   ins_encode %{
 4258     int vlen_enc = vector_length_encoding(this);
 4259     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4260     __ lea($tmp$$Register, $mem$$Address);
 4261     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4262     if (elem_bt == T_SHORT) {
 4263       __ movl($mask_idx$$Register, 0x55555555);
 4264       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4265     }
 4266     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4267     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4268                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4269   %}
 4270   ins_pipe( pipe_slow );
 4271 %}
 4272 
 4273 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4274                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4275   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4276   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4277   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4278   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4279   ins_encode %{
 4280     int vlen_enc = vector_length_encoding(this);
 4281     int vector_len = Matcher::vector_length(this);
 4282     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4283     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4284     __ lea($tmp$$Register, $mem$$Address);
 4285     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4286     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4287     if (elem_bt == T_SHORT) {
 4288       __ movl($mask_idx$$Register, 0x55555555);
 4289       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4290     }
 4291     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4292     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4293                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4294   %}
 4295   ins_pipe( pipe_slow );
 4296 %}
 4297 
 4298 // ====================Scatter=======================================
 4299 
 4300 // Scatter INT, LONG, FLOAT, DOUBLE
 4301 
 4302 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4303   predicate(UseAVX > 2);
 4304   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4305   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4307   ins_encode %{
 4308     int vlen_enc = vector_length_encoding(this, $src);
 4309     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4310 
 4311     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4312     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4313 
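    // AVX-512 scatters are always predicated; load an all-ones mask so every element is stored.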
 4314     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4315     __ lea($tmp$$Register, $mem$$Address);
 4316     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4317   %}
 4318   ins_pipe( pipe_slow );
 4319 %}
 4320 
 4321 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4322   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4323   effect(TEMP tmp, TEMP ktmp);
 4324   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4325   ins_encode %{
 4326     int vlen_enc = vector_length_encoding(this, $src);
 4327     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4328     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4329     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary opmask register.
 4332     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4333     __ lea($tmp$$Register, $mem$$Address);
 4334     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4335   %}
 4336   ins_pipe( pipe_slow );
 4337 %}
 4338 
 4339 // ====================REPLICATE=======================================
 4340 
 4341 // Replicate byte scalar to be vector
 4342 instruct vReplB_reg(vec dst, rRegI src) %{
 4343   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4344   match(Set dst (Replicate src));
 4345   format %{ "replicateB $dst,$src" %}
 4346   ins_encode %{
 4347     uint vlen = Matcher::vector_length(this);
 4348     if (UseAVX >= 2) {
 4349       int vlen_enc = vector_length_encoding(this);
 4350       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4351         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4352         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4353       } else {
 4354         __ movdl($dst$$XMMRegister, $src$$Register);
 4355         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4356       }
 4357     } else {
      assert(UseAVX < 2, "");
 4359       __ movdl($dst$$XMMRegister, $src$$Register);
 4360       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4361       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4362       if (vlen >= 16) {
 4363         assert(vlen == 16, "");
 4364         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4365       }
 4366     }
 4367   %}
 4368   ins_pipe( pipe_slow );
 4369 %}
 4370 
 4371 instruct ReplB_mem(vec dst, memory mem) %{
 4372   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4373   match(Set dst (Replicate (LoadB mem)));
 4374   format %{ "replicateB $dst,$mem" %}
 4375   ins_encode %{
 4376     int vlen_enc = vector_length_encoding(this);
 4377     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4378   %}
 4379   ins_pipe( pipe_slow );
 4380 %}
 4381 
 4382 // ====================ReplicateS=======================================
 4383 
 4384 instruct vReplS_reg(vec dst, rRegI src) %{
 4385   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4386   match(Set dst (Replicate src));
 4387   format %{ "replicateS $dst,$src" %}
 4388   ins_encode %{
 4389     uint vlen = Matcher::vector_length(this);
 4390     int vlen_enc = vector_length_encoding(this);
 4391     if (UseAVX >= 2) {
 4392       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4393         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4394         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4395       } else {
 4396         __ movdl($dst$$XMMRegister, $src$$Register);
 4397         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4398       }
 4399     } else {
 4400       assert(UseAVX < 2, "");
 4401       __ movdl($dst$$XMMRegister, $src$$Register);
 4402       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4403       if (vlen >= 8) {
 4404         assert(vlen == 8, "");
 4405         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4406       }
 4407     }
 4408   %}
 4409   ins_pipe( pipe_slow );
 4410 %}
 4411 
 4412 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4413   match(Set dst (Replicate con));
 4414   effect(TEMP rtmp);
 4415   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4416   ins_encode %{
 4417     int vlen_enc = vector_length_encoding(this);
 4418     BasicType bt = Matcher::vector_element_basic_type(this);
 4419     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4420     __ movl($rtmp$$Register, $con$$constant);
 4421     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4422   %}
 4423   ins_pipe( pipe_slow );
 4424 %}
 4425 
 4426 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4427   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4428   match(Set dst (Replicate src));
 4429   effect(TEMP rtmp);
 4430   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4431   ins_encode %{
 4432     int vlen_enc = vector_length_encoding(this);
 4433     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4434     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4435   %}
 4436   ins_pipe( pipe_slow );
 4437 %}
 4438 
 4439 instruct ReplS_mem(vec dst, memory mem) %{
 4440   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4441   match(Set dst (Replicate (LoadS mem)));
 4442   format %{ "replicateS $dst,$mem" %}
 4443   ins_encode %{
 4444     int vlen_enc = vector_length_encoding(this);
 4445     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4446   %}
 4447   ins_pipe( pipe_slow );
 4448 %}
 4449 
 4450 // ====================ReplicateI=======================================
 4451 
 4452 instruct ReplI_reg(vec dst, rRegI src) %{
 4453   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4454   match(Set dst (Replicate src));
 4455   format %{ "replicateI $dst,$src" %}
 4456   ins_encode %{
 4457     uint vlen = Matcher::vector_length(this);
 4458     int vlen_enc = vector_length_encoding(this);
 4459     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4460       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4461     } else if (VM_Version::supports_avx2()) {
 4462       __ movdl($dst$$XMMRegister, $src$$Register);
 4463       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4464     } else {
 4465       __ movdl($dst$$XMMRegister, $src$$Register);
 4466       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4467     }
 4468   %}
 4469   ins_pipe( pipe_slow );
 4470 %}
 4471 
 4472 instruct ReplI_mem(vec dst, memory mem) %{
 4473   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4474   match(Set dst (Replicate (LoadI mem)));
 4475   format %{ "replicateI $dst,$mem" %}
 4476   ins_encode %{
 4477     int vlen_enc = vector_length_encoding(this);
 4478     if (VM_Version::supports_avx2()) {
 4479       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4480     } else if (VM_Version::supports_avx()) {
 4481       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4482     } else {
 4483       __ movdl($dst$$XMMRegister, $mem$$Address);
 4484       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4485     }
 4486   %}
 4487   ins_pipe( pipe_slow );
 4488 %}
 4489 
 4490 instruct ReplI_imm(vec dst, immI con) %{
 4491   predicate(Matcher::is_non_long_integral_vector(n));
 4492   match(Set dst (Replicate con));
 4493   format %{ "replicateI $dst,$con" %}
 4494   ins_encode %{
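    // Materialize only the smallest broadcastable chunk of the constant in the table
    // (4 bytes with AVX, 8 with SSE3, 16 otherwise); load_constant_vector then
    // broadcasts it to the full vector width.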
 4495     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4496                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4497                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4498     BasicType bt = Matcher::vector_element_basic_type(this);
 4499     int vlen = Matcher::vector_length_in_bytes(this);
 4500     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4501   %}
 4502   ins_pipe( pipe_slow );
 4503 %}
 4504 
 4505 // Replicate scalar zero to be vector
 4506 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4507   predicate(Matcher::is_non_long_integral_vector(n));
 4508   match(Set dst (Replicate zero));
 4509   format %{ "replicateI $dst,$zero" %}
 4510   ins_encode %{
 4511     int vlen_enc = vector_length_encoding(this);
 4512     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4513       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4514     } else {
 4515       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4516     }
 4517   %}
 4518   ins_pipe( fpu_reg_reg );
 4519 %}
 4520 
 4521 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4522   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4523   match(Set dst (Replicate con));
 4524   format %{ "vallones $dst" %}
 4525   ins_encode %{
 4526     int vector_len = vector_length_encoding(this);
 4527     __ vallones($dst$$XMMRegister, vector_len);
 4528   %}
 4529   ins_pipe( pipe_slow );
 4530 %}
 4531 
 4532 // ====================ReplicateL=======================================
 4533 
 4534 // Replicate long (8 byte) scalar to be vector
 4535 instruct ReplL_reg(vec dst, rRegL src) %{
 4536   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4537   match(Set dst (Replicate src));
 4538   format %{ "replicateL $dst,$src" %}
 4539   ins_encode %{
 4540     int vlen = Matcher::vector_length(this);
 4541     int vlen_enc = vector_length_encoding(this);
 4542     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4543       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4544     } else if (VM_Version::supports_avx2()) {
 4545       __ movdq($dst$$XMMRegister, $src$$Register);
 4546       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4547     } else {
 4548       __ movdq($dst$$XMMRegister, $src$$Register);
 4549       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4550     }
 4551   %}
 4552   ins_pipe( pipe_slow );
 4553 %}
 4554 
 4555 instruct ReplL_mem(vec dst, memory mem) %{
 4556   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4557   match(Set dst (Replicate (LoadL mem)));
 4558   format %{ "replicateL $dst,$mem" %}
 4559   ins_encode %{
 4560     int vlen_enc = vector_length_encoding(this);
 4561     if (VM_Version::supports_avx2()) {
 4562       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4563     } else if (VM_Version::supports_sse3()) {
 4564       __ movddup($dst$$XMMRegister, $mem$$Address);
 4565     } else {
 4566       __ movq($dst$$XMMRegister, $mem$$Address);
 4567       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4568     }
 4569   %}
 4570   ins_pipe( pipe_slow );
 4571 %}
 4572 
 4573 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4574 instruct ReplL_imm(vec dst, immL con) %{
 4575   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4576   match(Set dst (Replicate con));
 4577   format %{ "replicateL $dst,$con" %}
 4578   ins_encode %{
 4579     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4580     int vlen = Matcher::vector_length_in_bytes(this);
 4581     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4582   %}
 4583   ins_pipe( pipe_slow );
 4584 %}
 4585 
 4586 instruct ReplL_zero(vec dst, immL0 zero) %{
 4587   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4588   match(Set dst (Replicate zero));
 4589   format %{ "replicateL $dst,$zero" %}
 4590   ins_encode %{
 4591     int vlen_enc = vector_length_encoding(this);
 4592     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4593       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4594     } else {
 4595       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4596     }
 4597   %}
 4598   ins_pipe( fpu_reg_reg );
 4599 %}
 4600 
 4601 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4602   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4603   match(Set dst (Replicate con));
 4604   format %{ "vallones $dst" %}
 4605   ins_encode %{
 4606     int vector_len = vector_length_encoding(this);
 4607     __ vallones($dst$$XMMRegister, vector_len);
 4608   %}
 4609   ins_pipe( pipe_slow );
 4610 %}
 4611 
 4612 // ====================ReplicateF=======================================
 4613 
 4614 instruct vReplF_reg(vec dst, vlRegF src) %{
 4615   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4616   match(Set dst (Replicate src));
 4617   format %{ "replicateF $dst,$src" %}
 4618   ins_encode %{
 4619     uint vlen = Matcher::vector_length(this);
 4620     int vlen_enc = vector_length_encoding(this);
 4621     if (vlen <= 4) {
 4622       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4623     } else if (VM_Version::supports_avx2()) {
 4624       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4625     } else {
 4626       assert(vlen == 8, "sanity");
 4627       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4628       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4629     }
 4630   %}
 4631   ins_pipe( pipe_slow );
 4632 %}
 4633 
 4634 instruct ReplF_reg(vec dst, vlRegF src) %{
 4635   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4636   match(Set dst (Replicate src));
 4637   format %{ "replicateF $dst,$src" %}
 4638   ins_encode %{
 4639     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4640   %}
 4641   ins_pipe( pipe_slow );
 4642 %}
 4643 
 4644 instruct ReplF_mem(vec dst, memory mem) %{
 4645   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4646   match(Set dst (Replicate (LoadF mem)));
 4647   format %{ "replicateF $dst,$mem" %}
 4648   ins_encode %{
 4649     int vlen_enc = vector_length_encoding(this);
 4650     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4651   %}
 4652   ins_pipe( pipe_slow );
 4653 %}
 4654 
 4655 // Replicate float scalar immediate to be vector by loading from const table.
 4656 instruct ReplF_imm(vec dst, immF con) %{
 4657   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4658   match(Set dst (Replicate con));
 4659   format %{ "replicateF $dst,$con" %}
 4660   ins_encode %{
 4661     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4662                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4663     int vlen = Matcher::vector_length_in_bytes(this);
 4664     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4665   %}
 4666   ins_pipe( pipe_slow );
 4667 %}
 4668 
 4669 instruct ReplF_zero(vec dst, immF0 zero) %{
 4670   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4671   match(Set dst (Replicate zero));
 4672   format %{ "replicateF $dst,$zero" %}
 4673   ins_encode %{
 4674     int vlen_enc = vector_length_encoding(this);
 4675     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4676       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4677     } else {
 4678       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4679     }
 4680   %}
 4681   ins_pipe( fpu_reg_reg );
 4682 %}
 4683 
 4684 // ====================ReplicateD=======================================
 4685 
 4686 // Replicate double (8 bytes) scalar to be vector
 4687 instruct vReplD_reg(vec dst, vlRegD src) %{
 4688   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4689   match(Set dst (Replicate src));
 4690   format %{ "replicateD $dst,$src" %}
 4691   ins_encode %{
 4692     uint vlen = Matcher::vector_length(this);
 4693     int vlen_enc = vector_length_encoding(this);
 4694     if (vlen <= 2) {
 4695       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4696     } else if (VM_Version::supports_avx2()) {
 4697       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4698     } else {
 4699       assert(vlen == 4, "sanity");
 4700       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4701       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4702     }
 4703   %}
 4704   ins_pipe( pipe_slow );
 4705 %}
 4706 
 4707 instruct ReplD_reg(vec dst, vlRegD src) %{
 4708   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4709   match(Set dst (Replicate src));
 4710   format %{ "replicateD $dst,$src" %}
 4711   ins_encode %{
 4712     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4713   %}
 4714   ins_pipe( pipe_slow );
 4715 %}
 4716 
 4717 instruct ReplD_mem(vec dst, memory mem) %{
 4718   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4719   match(Set dst (Replicate (LoadD mem)));
 4720   format %{ "replicateD $dst,$mem" %}
 4721   ins_encode %{
 4722     if (Matcher::vector_length(this) >= 4) {
 4723       int vlen_enc = vector_length_encoding(this);
 4724       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4725     } else {
 4726       __ movddup($dst$$XMMRegister, $mem$$Address);
 4727     }
 4728   %}
 4729   ins_pipe( pipe_slow );
 4730 %}
 4731 
 4732 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4733 instruct ReplD_imm(vec dst, immD con) %{
 4734   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4735   match(Set dst (Replicate con));
 4736   format %{ "replicateD $dst,$con" %}
 4737   ins_encode %{
 4738     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4739     int vlen = Matcher::vector_length_in_bytes(this);
 4740     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4741   %}
 4742   ins_pipe( pipe_slow );
 4743 %}
 4744 
 4745 instruct ReplD_zero(vec dst, immD0 zero) %{
 4746   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4747   match(Set dst (Replicate zero));
 4748   format %{ "replicateD $dst,$zero" %}
 4749   ins_encode %{
 4750     int vlen_enc = vector_length_encoding(this);
 4751     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4752       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4753     } else {
 4754       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4755     }
 4756   %}
 4757   ins_pipe( fpu_reg_reg );
 4758 %}
 4759 
 4760 // ====================VECTOR INSERT=======================================
 4761 
 4762 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4763   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4764   match(Set dst (VectorInsert (Binary dst val) idx));
 4765   format %{ "vector_insert $dst,$val,$idx" %}
 4766   ins_encode %{
 4767     assert(UseSSE >= 4, "required");
 4768     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4769 
 4770     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4771 
 4772     assert(is_integral_type(elem_bt), "");
 4773     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4774 
 4775     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4776   %}
 4777   ins_pipe( pipe_slow );
 4778 %}
 4779 
 4780 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4781   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4782   match(Set dst (VectorInsert (Binary src val) idx));
 4783   effect(TEMP vtmp);
 4784   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4785   ins_encode %{
 4786     int vlen_enc = Assembler::AVX_256bit;
 4787     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4788     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4789     int log2epr = log2(elem_per_lane);
 4790 
 4791     assert(is_integral_type(elem_bt), "sanity");
 4792     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4793 
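    // Split the flat element index into a position within a 128-bit lane (x_idx) and the
    // lane number (y_idx); the affected lane is extracted, updated, and re-inserted.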
 4794     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4795     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4796     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4797     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4798     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4799   %}
 4800   ins_pipe( pipe_slow );
 4801 %}
 4802 
 4803 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4804   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4805   match(Set dst (VectorInsert (Binary src val) idx));
 4806   effect(TEMP vtmp);
 4807   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4808   ins_encode %{
 4809     assert(UseAVX > 2, "sanity");
 4810 
 4811     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4812     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4813     int log2epr = log2(elem_per_lane);
 4814 
 4815     assert(is_integral_type(elem_bt), "");
 4816     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4817 
 4818     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4819     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4820     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4821     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4822     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4823   %}
 4824   ins_pipe( pipe_slow );
 4825 %}
 4826 
 4827 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4828   predicate(Matcher::vector_length(n) == 2);
 4829   match(Set dst (VectorInsert (Binary dst val) idx));
 4830   format %{ "vector_insert $dst,$val,$idx" %}
 4831   ins_encode %{
 4832     assert(UseSSE >= 4, "required");
 4833     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4834     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4835 
 4836     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4837   %}
 4838   ins_pipe( pipe_slow );
 4839 %}
 4840 
 4841 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4842   predicate(Matcher::vector_length(n) == 4);
 4843   match(Set dst (VectorInsert (Binary src val) idx));
 4844   effect(TEMP vtmp);
 4845   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4846   ins_encode %{
 4847     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4848     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4849 
 4850     uint x_idx = $idx$$constant & right_n_bits(1);
 4851     uint y_idx = ($idx$$constant >> 1) & 1;
 4852     int vlen_enc = Assembler::AVX_256bit;
 4853     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4854     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4855     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4856   %}
 4857   ins_pipe( pipe_slow );
 4858 %}
 4859 
 4860 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4861   predicate(Matcher::vector_length(n) == 8);
 4862   match(Set dst (VectorInsert (Binary src val) idx));
 4863   effect(TEMP vtmp);
 4864   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4865   ins_encode %{
 4866     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4867     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4868 
 4869     uint x_idx = $idx$$constant & right_n_bits(1);
 4870     uint y_idx = ($idx$$constant >> 1) & 3;
 4871     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4872     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4873     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4874   %}
 4875   ins_pipe( pipe_slow );
 4876 %}
 4877 
 4878 instruct insertF(vec dst, regF val, immU8 idx) %{
 4879   predicate(Matcher::vector_length(n) < 8);
 4880   match(Set dst (VectorInsert (Binary dst val) idx));
 4881   format %{ "vector_insert $dst,$val,$idx" %}
 4882   ins_encode %{
 4883     assert(UseSSE >= 4, "sanity");
 4884 
 4885     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4886     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4887 
 4888     uint x_idx = $idx$$constant & right_n_bits(2);
 4889     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4890   %}
 4891   ins_pipe( pipe_slow );
 4892 %}
 4893 
 4894 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4895   predicate(Matcher::vector_length(n) >= 8);
 4896   match(Set dst (VectorInsert (Binary src val) idx));
 4897   effect(TEMP vtmp);
 4898   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4899   ins_encode %{
 4900     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4901     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4902 
 4903     int vlen = Matcher::vector_length(this);
 4904     uint x_idx = $idx$$constant & right_n_bits(2);
 4905     if (vlen == 8) {
 4906       uint y_idx = ($idx$$constant >> 2) & 1;
 4907       int vlen_enc = Assembler::AVX_256bit;
 4908       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4909       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4910       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4911     } else {
 4912       assert(vlen == 16, "sanity");
 4913       uint y_idx = ($idx$$constant >> 2) & 3;
 4914       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4915       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4916       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4917     }
 4918   %}
 4919   ins_pipe( pipe_slow );
 4920 %}
 4921 
 4922 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4923   predicate(Matcher::vector_length(n) == 2);
 4924   match(Set dst (VectorInsert (Binary dst val) idx));
 4925   effect(TEMP tmp);
 4926   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4927   ins_encode %{
 4928     assert(UseSSE >= 4, "sanity");
 4929     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4930     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4931 
 4932     __ movq($tmp$$Register, $val$$XMMRegister);
 4933     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4934   %}
 4935   ins_pipe( pipe_slow );
 4936 %}
 4937 
 4938 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4939   predicate(Matcher::vector_length(n) == 4);
 4940   match(Set dst (VectorInsert (Binary src val) idx));
 4941   effect(TEMP vtmp, TEMP tmp);
 4942   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4943   ins_encode %{
 4944     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4945     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4946 
 4947     uint x_idx = $idx$$constant & right_n_bits(1);
 4948     uint y_idx = ($idx$$constant >> 1) & 1;
 4950     __ movq($tmp$$Register, $val$$XMMRegister);
 4951     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4952     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4953     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4954   %}
 4955   ins_pipe( pipe_slow );
 4956 %}
 4957 
 4958 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 4959   predicate(Matcher::vector_length(n) == 8);
 4960   match(Set dst (VectorInsert (Binary src val) idx));
 4961   effect(TEMP tmp, TEMP vtmp);
 4962   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4963   ins_encode %{
 4964     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4965     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4966 
 4967     uint x_idx = $idx$$constant & right_n_bits(1);
 4968     uint y_idx = ($idx$$constant >> 1) & 3;
 4969     __ movq($tmp$$Register, $val$$XMMRegister);
 4970     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4971     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4972     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4973   %}
 4974   ins_pipe( pipe_slow );
 4975 %}
 4976 
 4977 // ====================REDUCTION ARITHMETIC=======================================
 4978 
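      // Note on operands: in the reduction rules below, in(1) ($src1) is the scalar value carried
      // into the reduction (the running total from previous iterations, or the identity supplied
      // by the Vector API), and in(2) ($src2) is the vector being reduced. Each rule therefore
      // computes dst = src1 OP (src2[0] OP src2[1] OP ...).
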
 4979 // =======================Int Reduction==========================================
 4980 
 4981 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4982   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4983   match(Set dst (AddReductionVI src1 src2));
 4984   match(Set dst (MulReductionVI src1 src2));
 4985   match(Set dst (AndReductionV  src1 src2));
 4986   match(Set dst ( OrReductionV  src1 src2));
 4987   match(Set dst (XorReductionV  src1 src2));
 4988   match(Set dst (MinReductionV  src1 src2));
 4989   match(Set dst (MaxReductionV  src1 src2));
 4990   effect(TEMP vtmp1, TEMP vtmp2);
 4991   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4992   ins_encode %{
 4993     int opcode = this->ideal_Opcode();
 4994     int vlen = Matcher::vector_length(this, $src2);
 4995     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4996   %}
 4997   ins_pipe( pipe_slow );
 4998 %}
 4999 
 5000 // =======================Long Reduction==========================================
 5001 
 5002 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5003   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5004   match(Set dst (AddReductionVL src1 src2));
 5005   match(Set dst (MulReductionVL src1 src2));
 5006   match(Set dst (AndReductionV  src1 src2));
 5007   match(Set dst ( OrReductionV  src1 src2));
 5008   match(Set dst (XorReductionV  src1 src2));
 5009   match(Set dst (MinReductionV  src1 src2));
 5010   match(Set dst (MaxReductionV  src1 src2));
 5011   effect(TEMP vtmp1, TEMP vtmp2);
 5012   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5013   ins_encode %{
 5014     int opcode = this->ideal_Opcode();
 5015     int vlen = Matcher::vector_length(this, $src2);
 5016     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5017   %}
 5018   ins_pipe( pipe_slow );
 5019 %}
 5020 
 5021 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5022   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5023   match(Set dst (AddReductionVL src1 src2));
 5024   match(Set dst (MulReductionVL src1 src2));
 5025   match(Set dst (AndReductionV  src1 src2));
 5026   match(Set dst ( OrReductionV  src1 src2));
 5027   match(Set dst (XorReductionV  src1 src2));
 5028   match(Set dst (MinReductionV  src1 src2));
 5029   match(Set dst (MaxReductionV  src1 src2));
 5030   effect(TEMP vtmp1, TEMP vtmp2);
 5031   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5032   ins_encode %{
 5033     int opcode = this->ideal_Opcode();
 5034     int vlen = Matcher::vector_length(this, $src2);
 5035     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5036   %}
 5037   ins_pipe( pipe_slow );
 5038 %}
 5039 
 5040 // =======================Float Reduction==========================================
 5041 
 5042 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5043   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5044   match(Set dst (AddReductionVF dst src));
 5045   match(Set dst (MulReductionVF dst src));
 5046   effect(TEMP dst, TEMP vtmp);
 5047   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5048   ins_encode %{
 5049     int opcode = this->ideal_Opcode();
 5050     int vlen = Matcher::vector_length(this, $src);
 5051     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5052   %}
 5053   ins_pipe( pipe_slow );
 5054 %}
 5055 
 5056 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5057   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5058   match(Set dst (AddReductionVF dst src));
 5059   match(Set dst (MulReductionVF dst src));
 5060   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5061   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5062   ins_encode %{
 5063     int opcode = this->ideal_Opcode();
 5064     int vlen = Matcher::vector_length(this, $src);
 5065     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5066   %}
 5067   ins_pipe( pipe_slow );
 5068 %}
 5069 
 5070 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5071   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5072   match(Set dst (AddReductionVF dst src));
 5073   match(Set dst (MulReductionVF dst src));
 5074   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5075   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5076   ins_encode %{
 5077     int opcode = this->ideal_Opcode();
 5078     int vlen = Matcher::vector_length(this, $src);
 5079     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5080   %}
 5081   ins_pipe( pipe_slow );
 5082 %}
 5083 
 5084 
 5085 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5086   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5087   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5088   // src1 contains reduction identity (a scalar sketch of strict vs. relaxed ordering follows this rule)
 5089   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5090   match(Set dst (AddReductionVF src1 src2));
 5091   match(Set dst (MulReductionVF src1 src2));
 5092   effect(TEMP dst);
 5093   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5094   ins_encode %{
 5095     int opcode = this->ideal_Opcode();
 5096     int vlen = Matcher::vector_length(this, $src2);
 5097     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5098   %}
 5099   ins_pipe( pipe_slow );
 5100 %}
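
      // A scalar sketch (illustrative only, not generated code) of what requires_strict_order()
      // distinguishes for float add/mul reductions: the strictly ordered rules above must combine
      // lanes in order, while the Vector API rules in this group may reassociate them freely.
      //
      //   // strict order: ((((id + v[0]) + v[1]) + v[2]) + v[3])
      //   float reduce_strict(float id, const float v[4]) {
      //     float acc = id;
      //     for (int i = 0; i < 4; i++) acc += v[i];
      //     return acc;
      //   }
      //
      //   // relaxed order (one possible association): id + ((v[0] + v[1]) + (v[2] + v[3]))
      //   float reduce_relaxed(float id, const float v[4]) {
      //     return id + ((v[0] + v[1]) + (v[2] + v[3]));
      //   }
      //
      // Float addition is not associative, so the two can differ in the low-order bits, which is
      // why the strict and non-strict rule sets are kept separate.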
 5101 
 5102 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5103   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5104   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5105   // src1 contains reduction identity
 5106   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5107   match(Set dst (AddReductionVF src1 src2));
 5108   match(Set dst (MulReductionVF src1 src2));
 5109   effect(TEMP dst, TEMP vtmp);
 5110   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5111   ins_encode %{
 5112     int opcode = this->ideal_Opcode();
 5113     int vlen = Matcher::vector_length(this, $src2);
 5114     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5115   %}
 5116   ins_pipe( pipe_slow );
 5117 %}
 5118 
 5119 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5120   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5121   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5122   // src1 contains reduction identity
 5123   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5124   match(Set dst (AddReductionVF src1 src2));
 5125   match(Set dst (MulReductionVF src1 src2));
 5126   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5127   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5128   ins_encode %{
 5129     int opcode = this->ideal_Opcode();
 5130     int vlen = Matcher::vector_length(this, $src2);
 5131     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5132   %}
 5133   ins_pipe( pipe_slow );
 5134 %}
 5135 
 5136 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5137   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5138   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5139   // src1 contains reduction identity
 5140   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5141   match(Set dst (AddReductionVF src1 src2));
 5142   match(Set dst (MulReductionVF src1 src2));
 5143   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5144   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5145   ins_encode %{
 5146     int opcode = this->ideal_Opcode();
 5147     int vlen = Matcher::vector_length(this, $src2);
 5148     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5149   %}
 5150   ins_pipe( pipe_slow );
 5151 %}
 5152 
 5153 // =======================Double Reduction==========================================
 5154 
 5155 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5156   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5157   match(Set dst (AddReductionVD dst src));
 5158   match(Set dst (MulReductionVD dst src));
 5159   effect(TEMP dst, TEMP vtmp);
 5160   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5161   ins_encode %{
 5162     int opcode = this->ideal_Opcode();
 5163     int vlen = Matcher::vector_length(this, $src);
 5164     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5165   %}
 5166   ins_pipe( pipe_slow );
 5167 %}
 5168 
 5169 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5170   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5171   match(Set dst (AddReductionVD dst src));
 5172   match(Set dst (MulReductionVD dst src));
 5173   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5174   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5175   ins_encode %{
 5176     int opcode = this->ideal_Opcode();
 5177     int vlen = Matcher::vector_length(this, $src);
 5178     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5179   %}
 5180   ins_pipe( pipe_slow );
 5181 %}
 5182 
 5183 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5184   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5185   match(Set dst (AddReductionVD dst src));
 5186   match(Set dst (MulReductionVD dst src));
 5187   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5188   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5189   ins_encode %{
 5190     int opcode = this->ideal_Opcode();
 5191     int vlen = Matcher::vector_length(this, $src);
 5192     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5193   %}
 5194   ins_pipe( pipe_slow );
 5195 %}
 5196 
 5197 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5198   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5199   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5200   // src1 contains reduction identity
 5201   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5202   match(Set dst (AddReductionVD src1 src2));
 5203   match(Set dst (MulReductionVD src1 src2));
 5204   effect(TEMP dst);
 5205   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5206   ins_encode %{
 5207     int opcode = this->ideal_Opcode();
 5208     int vlen = Matcher::vector_length(this, $src2);
 5209     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5210   %}
 5211   ins_pipe( pipe_slow );
 5212 %}
 5213 
 5214 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5215   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5216   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5217   // src1 contains reduction identity
 5218   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5219   match(Set dst (AddReductionVD src1 src2));
 5220   match(Set dst (MulReductionVD src1 src2));
 5221   effect(TEMP dst, TEMP vtmp);
 5222   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5223   ins_encode %{
 5224     int opcode = this->ideal_Opcode();
 5225     int vlen = Matcher::vector_length(this, $src2);
 5226     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5227   %}
 5228   ins_pipe( pipe_slow );
 5229 %}
 5230 
 5231 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5232   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5233   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5234   // src1 contains reduction identity
 5235   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5236   match(Set dst (AddReductionVD src1 src2));
 5237   match(Set dst (MulReductionVD src1 src2));
 5238   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5239   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5240   ins_encode %{
 5241     int opcode = this->ideal_Opcode();
 5242     int vlen = Matcher::vector_length(this, $src2);
 5243     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5244   %}
 5245   ins_pipe( pipe_slow );
 5246 %}
 5247 
 5248 // =======================Byte Reduction==========================================
 5249 
 5250 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5251   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5252   match(Set dst (AddReductionVI src1 src2));
 5253   match(Set dst (AndReductionV  src1 src2));
 5254   match(Set dst ( OrReductionV  src1 src2));
 5255   match(Set dst (XorReductionV  src1 src2));
 5256   match(Set dst (MinReductionV  src1 src2));
 5257   match(Set dst (MaxReductionV  src1 src2));
 5258   effect(TEMP vtmp1, TEMP vtmp2);
 5259   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5260   ins_encode %{
 5261     int opcode = this->ideal_Opcode();
 5262     int vlen = Matcher::vector_length(this, $src2);
 5263     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5264   %}
 5265   ins_pipe( pipe_slow );
 5266 %}
 5267 
 5268 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5269   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5270   match(Set dst (AddReductionVI src1 src2));
 5271   match(Set dst (AndReductionV  src1 src2));
 5272   match(Set dst ( OrReductionV  src1 src2));
 5273   match(Set dst (XorReductionV  src1 src2));
 5274   match(Set dst (MinReductionV  src1 src2));
 5275   match(Set dst (MaxReductionV  src1 src2));
 5276   effect(TEMP vtmp1, TEMP vtmp2);
 5277   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5278   ins_encode %{
 5279     int opcode = this->ideal_Opcode();
 5280     int vlen = Matcher::vector_length(this, $src2);
 5281     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5282   %}
 5283   ins_pipe( pipe_slow );
 5284 %}
 5285 
 5286 // =======================Short Reduction==========================================
 5287 
 5288 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5289   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5290   match(Set dst (AddReductionVI src1 src2));
 5291   match(Set dst (MulReductionVI src1 src2));
 5292   match(Set dst (AndReductionV  src1 src2));
 5293   match(Set dst ( OrReductionV  src1 src2));
 5294   match(Set dst (XorReductionV  src1 src2));
 5295   match(Set dst (MinReductionV  src1 src2));
 5296   match(Set dst (MaxReductionV  src1 src2));
 5297   effect(TEMP vtmp1, TEMP vtmp2);
 5298   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5299   ins_encode %{
 5300     int opcode = this->ideal_Opcode();
 5301     int vlen = Matcher::vector_length(this, $src2);
 5302     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5303   %}
 5304   ins_pipe( pipe_slow );
 5305 %}
 5306 
 5307 // =======================Mul Reduction==========================================
 5308 
 5309 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5310   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5311             Matcher::vector_length(n->in(2)) <= 32); // src2
 5312   match(Set dst (MulReductionVI src1 src2));
 5313   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5314   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5315   ins_encode %{
 5316     int opcode = this->ideal_Opcode();
 5317     int vlen = Matcher::vector_length(this, $src2);
 5318     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5319   %}
 5320   ins_pipe( pipe_slow );
 5321 %}
 5322 
 5323 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5324   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5325             Matcher::vector_length(n->in(2)) == 64); // src2
 5326   match(Set dst (MulReductionVI src1 src2));
 5327   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5328   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5329   ins_encode %{
 5330     int opcode = this->ideal_Opcode();
 5331     int vlen = Matcher::vector_length(this, $src2);
 5332     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5333   %}
 5334   ins_pipe( pipe_slow );
 5335 %}
 5336 
 5337 //--------------------Min/Max Float Reduction --------------------
 5338 // Float Min/Max Reduction
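      // The min/max reduction rules below (float here, double further down) match both MinReductionV
      // and MaxReductionV. For the immF/immD src1 forms, the predicate requires src1 to be the
      // reduction identity: +Inf for min and -Inf for max. Since min(x, +Inf) == x and
      // max(x, -Inf) == x for every value x (including NaN under Java min/max semantics), the scalar
      // input needs no extra combining step and only the vector operand is passed to
      // reduceFloatMinMax/reduceDoubleMinMax.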
 5339 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5340                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5341   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5342             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5343              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5344             Matcher::vector_length(n->in(2)) == 2);
 5345   match(Set dst (MinReductionV src1 src2));
 5346   match(Set dst (MaxReductionV src1 src2));
 5347   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5348   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5349   ins_encode %{
 5350     assert(UseAVX > 0, "sanity");
 5351 
 5352     int opcode = this->ideal_Opcode();
 5353     int vlen = Matcher::vector_length(this, $src2);
 5354     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5355                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5356   %}
 5357   ins_pipe( pipe_slow );
 5358 %}
 5359 
 5360 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5361                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5362   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5363             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5364              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5365             Matcher::vector_length(n->in(2)) >= 4);
 5366   match(Set dst (MinReductionV src1 src2));
 5367   match(Set dst (MaxReductionV src1 src2));
 5368   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5369   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5370   ins_encode %{
 5371     assert(UseAVX > 0, "sanity");
 5372 
 5373     int opcode = this->ideal_Opcode();
 5374     int vlen = Matcher::vector_length(this, $src2);
 5375     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5376                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5377   %}
 5378   ins_pipe( pipe_slow );
 5379 %}
 5380 
 5381 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5382                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5383   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5384             Matcher::vector_length(n->in(2)) == 2);
 5385   match(Set dst (MinReductionV dst src));
 5386   match(Set dst (MaxReductionV dst src));
 5387   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5388   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5389   ins_encode %{
 5390     assert(UseAVX > 0, "sanity");
 5391 
 5392     int opcode = this->ideal_Opcode();
 5393     int vlen = Matcher::vector_length(this, $src);
 5394     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5395                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5396   %}
 5397   ins_pipe( pipe_slow );
 5398 %}
 5399 
 5400 
 5401 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5402                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5403   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5404             Matcher::vector_length(n->in(2)) >= 4);
 5405   match(Set dst (MinReductionV dst src));
 5406   match(Set dst (MaxReductionV dst src));
 5407   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5408   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5409   ins_encode %{
 5410     assert(UseAVX > 0, "sanity");
 5411 
 5412     int opcode = this->ideal_Opcode();
 5413     int vlen = Matcher::vector_length(this, $src);
 5414     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5415                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5416   %}
 5417   ins_pipe( pipe_slow );
 5418 %}
 5419 
 5420 
 5421 //--------------------Min/Max Double Reduction --------------------
 5422 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5423                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5424                             rFlagsReg cr) %{
 5425   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5426             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5427              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5428             Matcher::vector_length(n->in(2)) == 2);
 5429   match(Set dst (MinReductionV src1 src2));
 5430   match(Set dst (MaxReductionV src1 src2));
 5431   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5432   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5433   ins_encode %{
 5434     assert(UseAVX > 0, "sanity");
 5435 
 5436     int opcode = this->ideal_Opcode();
 5437     int vlen = Matcher::vector_length(this, $src2);
 5438     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5439                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5440   %}
 5441   ins_pipe( pipe_slow );
 5442 %}
 5443 
 5444 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5445                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5446                            rFlagsReg cr) %{
 5447   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5448             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5449              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5450             Matcher::vector_length(n->in(2)) >= 4);
 5451   match(Set dst (MinReductionV src1 src2));
 5452   match(Set dst (MaxReductionV src1 src2));
 5453   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5454   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5455   ins_encode %{
 5456     assert(UseAVX > 0, "sanity");
 5457 
 5458     int opcode = this->ideal_Opcode();
 5459     int vlen = Matcher::vector_length(this, $src2);
 5460     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5461                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5462   %}
 5463   ins_pipe( pipe_slow );
 5464 %}
 5465 
 5466 
 5467 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5468                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5469                                rFlagsReg cr) %{
 5470   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5471             Matcher::vector_length(n->in(2)) == 2);
 5472   match(Set dst (MinReductionV dst src));
 5473   match(Set dst (MaxReductionV dst src));
 5474   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5475   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5476   ins_encode %{
 5477     assert(UseAVX > 0, "sanity");
 5478 
 5479     int opcode = this->ideal_Opcode();
 5480     int vlen = Matcher::vector_length(this, $src);
 5481     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5482                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5483   %}
 5484   ins_pipe( pipe_slow );
 5485 %}
 5486 
 5487 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5488                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5489                               rFlagsReg cr) %{
 5490   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5491             Matcher::vector_length(n->in(2)) >= 4);
 5492   match(Set dst (MinReductionV dst src));
 5493   match(Set dst (MaxReductionV dst src));
 5494   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5495   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5496   ins_encode %{
 5497     assert(UseAVX > 0, "sanity");
 5498 
 5499     int opcode = this->ideal_Opcode();
 5500     int vlen = Matcher::vector_length(this, $src);
 5501     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5502                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5503   %}
 5504   ins_pipe( pipe_slow );
 5505 %}
 5506 
 5507 // ====================VECTOR ARITHMETIC=======================================
 5508 
 5509 // --------------------------------- ADD --------------------------------------
 5510 
 5511 // Bytes vector add
 5512 instruct vaddB(vec dst, vec src) %{
 5513   predicate(UseAVX == 0);
 5514   match(Set dst (AddVB dst src));
 5515   format %{ "paddb   $dst,$src\t! add packedB" %}
 5516   ins_encode %{
 5517     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5518   %}
 5519   ins_pipe( pipe_slow );
 5520 %}
 5521 
 5522 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5523   predicate(UseAVX > 0);
 5524   match(Set dst (AddVB src1 src2));
 5525   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5526   ins_encode %{
 5527     int vlen_enc = vector_length_encoding(this);
 5528     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5529   %}
 5530   ins_pipe( pipe_slow );
 5531 %}
 5532 
 5533 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5534   predicate((UseAVX > 0) &&
 5535             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5536   match(Set dst (AddVB src (LoadVector mem)));
 5537   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5538   ins_encode %{
 5539     int vlen_enc = vector_length_encoding(this);
 5540     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5541   %}
 5542   ins_pipe( pipe_slow );
 5543 %}
 5544 
 5545 // Shorts/Chars vector add
 5546 instruct vaddS(vec dst, vec src) %{
 5547   predicate(UseAVX == 0);
 5548   match(Set dst (AddVS dst src));
 5549   format %{ "paddw   $dst,$src\t! add packedS" %}
 5550   ins_encode %{
 5551     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5552   %}
 5553   ins_pipe( pipe_slow );
 5554 %}
 5555 
 5556 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5557   predicate(UseAVX > 0);
 5558   match(Set dst (AddVS src1 src2));
 5559   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5560   ins_encode %{
 5561     int vlen_enc = vector_length_encoding(this);
 5562     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5563   %}
 5564   ins_pipe( pipe_slow );
 5565 %}
 5566 
 5567 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5568   predicate((UseAVX > 0) &&
 5569             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5570   match(Set dst (AddVS src (LoadVector mem)));
 5571   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5572   ins_encode %{
 5573     int vlen_enc = vector_length_encoding(this);
 5574     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5575   %}
 5576   ins_pipe( pipe_slow );
 5577 %}
 5578 
 5579 // Integers vector add
 5580 instruct vaddI(vec dst, vec src) %{
 5581   predicate(UseAVX == 0);
 5582   match(Set dst (AddVI dst src));
 5583   format %{ "paddd   $dst,$src\t! add packedI" %}
 5584   ins_encode %{
 5585     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5586   %}
 5587   ins_pipe( pipe_slow );
 5588 %}
 5589 
 5590 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5591   predicate(UseAVX > 0);
 5592   match(Set dst (AddVI src1 src2));
 5593   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5594   ins_encode %{
 5595     int vlen_enc = vector_length_encoding(this);
 5596     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5597   %}
 5598   ins_pipe( pipe_slow );
 5599 %}
 5600 
 5601 
 5602 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5603   predicate((UseAVX > 0) &&
 5604             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5605   match(Set dst (AddVI src (LoadVector mem)));
 5606   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5607   ins_encode %{
 5608     int vlen_enc = vector_length_encoding(this);
 5609     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5610   %}
 5611   ins_pipe( pipe_slow );
 5612 %}
 5613 
 5614 // Longs vector add
 5615 instruct vaddL(vec dst, vec src) %{
 5616   predicate(UseAVX == 0);
 5617   match(Set dst (AddVL dst src));
 5618   format %{ "paddq   $dst,$src\t! add packedL" %}
 5619   ins_encode %{
 5620     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5621   %}
 5622   ins_pipe( pipe_slow );
 5623 %}
 5624 
 5625 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5626   predicate(UseAVX > 0);
 5627   match(Set dst (AddVL src1 src2));
 5628   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5629   ins_encode %{
 5630     int vlen_enc = vector_length_encoding(this);
 5631     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5632   %}
 5633   ins_pipe( pipe_slow );
 5634 %}
 5635 
 5636 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5637   predicate((UseAVX > 0) &&
 5638             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5639   match(Set dst (AddVL src (LoadVector mem)));
 5640   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5641   ins_encode %{
 5642     int vlen_enc = vector_length_encoding(this);
 5643     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5644   %}
 5645   ins_pipe( pipe_slow );
 5646 %}
 5647 
 5648 // Floats vector add
 5649 instruct vaddF(vec dst, vec src) %{
 5650   predicate(UseAVX == 0);
 5651   match(Set dst (AddVF dst src));
 5652   format %{ "addps   $dst,$src\t! add packedF" %}
 5653   ins_encode %{
 5654     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5655   %}
 5656   ins_pipe( pipe_slow );
 5657 %}
 5658 
 5659 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5660   predicate(UseAVX > 0);
 5661   match(Set dst (AddVF src1 src2));
 5662   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5663   ins_encode %{
 5664     int vlen_enc = vector_length_encoding(this);
 5665     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5666   %}
 5667   ins_pipe( pipe_slow );
 5668 %}
 5669 
 5670 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5671   predicate((UseAVX > 0) &&
 5672             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5673   match(Set dst (AddVF src (LoadVector mem)));
 5674   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5675   ins_encode %{
 5676     int vlen_enc = vector_length_encoding(this);
 5677     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5678   %}
 5679   ins_pipe( pipe_slow );
 5680 %}
 5681 
 5682 // Doubles vector add
 5683 instruct vaddD(vec dst, vec src) %{
 5684   predicate(UseAVX == 0);
 5685   match(Set dst (AddVD dst src));
 5686   format %{ "addpd   $dst,$src\t! add packedD" %}
 5687   ins_encode %{
 5688     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5689   %}
 5690   ins_pipe( pipe_slow );
 5691 %}
 5692 
 5693 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5694   predicate(UseAVX > 0);
 5695   match(Set dst (AddVD src1 src2));
 5696   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5697   ins_encode %{
 5698     int vlen_enc = vector_length_encoding(this);
 5699     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5700   %}
 5701   ins_pipe( pipe_slow );
 5702 %}
 5703 
 5704 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5705   predicate((UseAVX > 0) &&
 5706             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5707   match(Set dst (AddVD src (LoadVector mem)));
 5708   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5709   ins_encode %{
 5710     int vlen_enc = vector_length_encoding(this);
 5711     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5712   %}
 5713   ins_pipe( pipe_slow );
 5714 %}
 5715 
 5716 // --------------------------------- SUB --------------------------------------
 5717 
 5718 // Bytes vector sub
 5719 instruct vsubB(vec dst, vec src) %{
 5720   predicate(UseAVX == 0);
 5721   match(Set dst (SubVB dst src));
 5722   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5723   ins_encode %{
 5724     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5725   %}
 5726   ins_pipe( pipe_slow );
 5727 %}
 5728 
 5729 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5730   predicate(UseAVX > 0);
 5731   match(Set dst (SubVB src1 src2));
 5732   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5733   ins_encode %{
 5734     int vlen_enc = vector_length_encoding(this);
 5735     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5736   %}
 5737   ins_pipe( pipe_slow );
 5738 %}
 5739 
 5740 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5741   predicate((UseAVX > 0) &&
 5742             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5743   match(Set dst (SubVB src (LoadVector mem)));
 5744   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5745   ins_encode %{
 5746     int vlen_enc = vector_length_encoding(this);
 5747     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5748   %}
 5749   ins_pipe( pipe_slow );
 5750 %}
 5751 
 5752 // Shorts/Chars vector sub
 5753 instruct vsubS(vec dst, vec src) %{
 5754   predicate(UseAVX == 0);
 5755   match(Set dst (SubVS dst src));
 5756   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5757   ins_encode %{
 5758     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5759   %}
 5760   ins_pipe( pipe_slow );
 5761 %}
 5762 
 5763 
 5764 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5765   predicate(UseAVX > 0);
 5766   match(Set dst (SubVS src1 src2));
 5767   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5768   ins_encode %{
 5769     int vlen_enc = vector_length_encoding(this);
 5770     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5771   %}
 5772   ins_pipe( pipe_slow );
 5773 %}
 5774 
 5775 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5776   predicate((UseAVX > 0) &&
 5777             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5778   match(Set dst (SubVS src (LoadVector mem)));
 5779   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5780   ins_encode %{
 5781     int vlen_enc = vector_length_encoding(this);
 5782     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5783   %}
 5784   ins_pipe( pipe_slow );
 5785 %}
 5786 
 5787 // Integers vector sub
 5788 instruct vsubI(vec dst, vec src) %{
 5789   predicate(UseAVX == 0);
 5790   match(Set dst (SubVI dst src));
 5791   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5792   ins_encode %{
 5793     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5794   %}
 5795   ins_pipe( pipe_slow );
 5796 %}
 5797 
 5798 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5799   predicate(UseAVX > 0);
 5800   match(Set dst (SubVI src1 src2));
 5801   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5802   ins_encode %{
 5803     int vlen_enc = vector_length_encoding(this);
 5804     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5805   %}
 5806   ins_pipe( pipe_slow );
 5807 %}
 5808 
 5809 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5810   predicate((UseAVX > 0) &&
 5811             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5812   match(Set dst (SubVI src (LoadVector mem)));
 5813   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5814   ins_encode %{
 5815     int vlen_enc = vector_length_encoding(this);
 5816     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5817   %}
 5818   ins_pipe( pipe_slow );
 5819 %}
 5820 
 5821 // Longs vector sub
 5822 instruct vsubL(vec dst, vec src) %{
 5823   predicate(UseAVX == 0);
 5824   match(Set dst (SubVL dst src));
 5825   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5826   ins_encode %{
 5827     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5828   %}
 5829   ins_pipe( pipe_slow );
 5830 %}
 5831 
 5832 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5833   predicate(UseAVX > 0);
 5834   match(Set dst (SubVL src1 src2));
 5835   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5836   ins_encode %{
 5837     int vlen_enc = vector_length_encoding(this);
 5838     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5839   %}
 5840   ins_pipe( pipe_slow );
 5841 %}
 5842 
 5843 
 5844 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5845   predicate((UseAVX > 0) &&
 5846             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5847   match(Set dst (SubVL src (LoadVector mem)));
 5848   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5849   ins_encode %{
 5850     int vlen_enc = vector_length_encoding(this);
 5851     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5852   %}
 5853   ins_pipe( pipe_slow );
 5854 %}
 5855 
 5856 // Floats vector sub
 5857 instruct vsubF(vec dst, vec src) %{
 5858   predicate(UseAVX == 0);
 5859   match(Set dst (SubVF dst src));
 5860   format %{ "subps   $dst,$src\t! sub packedF" %}
 5861   ins_encode %{
 5862     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5863   %}
 5864   ins_pipe( pipe_slow );
 5865 %}
 5866 
 5867 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5868   predicate(UseAVX > 0);
 5869   match(Set dst (SubVF src1 src2));
 5870   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5871   ins_encode %{
 5872     int vlen_enc = vector_length_encoding(this);
 5873     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5874   %}
 5875   ins_pipe( pipe_slow );
 5876 %}
 5877 
 5878 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5879   predicate((UseAVX > 0) &&
 5880             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5881   match(Set dst (SubVF src (LoadVector mem)));
 5882   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5883   ins_encode %{
 5884     int vlen_enc = vector_length_encoding(this);
 5885     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5886   %}
 5887   ins_pipe( pipe_slow );
 5888 %}
 5889 
 5890 // Doubles vector sub
 5891 instruct vsubD(vec dst, vec src) %{
 5892   predicate(UseAVX == 0);
 5893   match(Set dst (SubVD dst src));
 5894   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5895   ins_encode %{
 5896     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5897   %}
 5898   ins_pipe( pipe_slow );
 5899 %}
 5900 
 5901 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5902   predicate(UseAVX > 0);
 5903   match(Set dst (SubVD src1 src2));
 5904   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5905   ins_encode %{
 5906     int vlen_enc = vector_length_encoding(this);
 5907     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5908   %}
 5909   ins_pipe( pipe_slow );
 5910 %}
 5911 
 5912 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5913   predicate((UseAVX > 0) &&
 5914             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5915   match(Set dst (SubVD src (LoadVector mem)));
 5916   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5917   ins_encode %{
 5918     int vlen_enc = vector_length_encoding(this);
 5919     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5920   %}
 5921   ins_pipe( pipe_slow );
 5922 %}
 5923 
 5924 // --------------------------------- MUL --------------------------------------
 5925 
 5926 // Byte vector mul
 5927 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5928   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5929   match(Set dst (MulVB src1 src2));
 5930   effect(TEMP dst, TEMP xtmp);
 5931   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5932   ins_encode %{
 5933     assert(UseSSE > 3, "required");
 5934     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5935     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5936     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5937     __ psllw($dst$$XMMRegister, 8);
 5938     __ psrlw($dst$$XMMRegister, 8);
 5939     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5940   %}
 5941   ins_pipe( pipe_slow );
 5942 %}
 5943 
 5944 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5945   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5946   match(Set dst (MulVB src1 src2));
 5947   effect(TEMP dst, TEMP xtmp);
 5948   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5949   ins_encode %{
 5950     assert(UseSSE > 3, "required");
 5951     // Odd-index elements
 5952     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5953     __ psrlw($dst$$XMMRegister, 8);
 5954     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5955     __ psrlw($xtmp$$XMMRegister, 8);
 5956     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5957     __ psllw($dst$$XMMRegister, 8);
 5958     // Even-index elements
 5959     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5960     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5961     __ psllw($xtmp$$XMMRegister, 8);
 5962     __ psrlw($xtmp$$XMMRegister, 8);
 5963     // Combine
 5964     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5965   %}
 5966   ins_pipe( pipe_slow );
 5967 %}
 5968 
 5969 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5970   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5971   match(Set dst (MulVB src1 src2));
 5972   effect(TEMP xtmp1, TEMP xtmp2);
 5973   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5974   ins_encode %{
 5975     int vlen_enc = vector_length_encoding(this);
 5976     // Odd-index elements
 5977     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5978     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5979     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5980     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5981     // Even-index elements
 5982     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5983     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5984     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5985     // Combine
 5986     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5987   %}
 5988   ins_pipe( pipe_slow );
 5989 %}
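
      // A scalar sketch (illustrative only) of the byte multiply implemented by the rules above.
      // x86 has no packed byte multiply, so each rule widens to 16-bit lanes with pmullw/vpmullw
      // and keeps only the low byte of each 16-bit product:
      //
      //   void mulB(const uint8_t* a, const uint8_t* b, uint8_t* r, int n) {
      //     for (int i = 0; i < n; i++) {
      //       r[i] = (uint8_t)((uint16_t)a[i] * (uint16_t)b[i]);  // low 8 bits of the product
      //     }
      //   }
      //
      // vmul8B widens every byte to a word at once (pmovsxbw) and packs the results back, while
      // the wider forms process odd- and even-indexed bytes separately and merge them with por/vpor.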
 5990 
 5991 // Shorts/Chars vector mul
 5992 instruct vmulS(vec dst, vec src) %{
 5993   predicate(UseAVX == 0);
 5994   match(Set dst (MulVS dst src));
 5995   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5996   ins_encode %{
 5997     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5998   %}
 5999   ins_pipe( pipe_slow );
 6000 %}
 6001 
 6002 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6003   predicate(UseAVX > 0);
 6004   match(Set dst (MulVS src1 src2));
 6005   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6006   ins_encode %{
 6007     int vlen_enc = vector_length_encoding(this);
 6008     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6009   %}
 6010   ins_pipe( pipe_slow );
 6011 %}
 6012 
 6013 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6014   predicate((UseAVX > 0) &&
 6015             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6016   match(Set dst (MulVS src (LoadVector mem)));
 6017   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6018   ins_encode %{
 6019     int vlen_enc = vector_length_encoding(this);
 6020     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6021   %}
 6022   ins_pipe( pipe_slow );
 6023 %}
 6024 
 6025 // Integers vector mul
 6026 instruct vmulI(vec dst, vec src) %{
 6027   predicate(UseAVX == 0);
 6028   match(Set dst (MulVI dst src));
 6029   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6030   ins_encode %{
 6031     assert(UseSSE > 3, "required");
 6032     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6033   %}
 6034   ins_pipe( pipe_slow );
 6035 %}
 6036 
 6037 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6038   predicate(UseAVX > 0);
 6039   match(Set dst (MulVI src1 src2));
 6040   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6041   ins_encode %{
 6042     int vlen_enc = vector_length_encoding(this);
 6043     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6044   %}
 6045   ins_pipe( pipe_slow );
 6046 %}
 6047 
 6048 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6049   predicate((UseAVX > 0) &&
 6050             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6051   match(Set dst (MulVI src (LoadVector mem)));
 6052   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6053   ins_encode %{
 6054     int vlen_enc = vector_length_encoding(this);
 6055     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6056   %}
 6057   ins_pipe( pipe_slow );
 6058 %}
 6059 
 6060 // Longs vector mul
 6061 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6062   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6063              VM_Version::supports_avx512dq()) ||
 6064             VM_Version::supports_avx512vldq());
 6065   match(Set dst (MulVL src1 src2));
 6066   ins_cost(500);
 6067   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6068   ins_encode %{
 6069     assert(UseAVX > 2, "required");
 6070     int vlen_enc = vector_length_encoding(this);
 6071     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6072   %}
 6073   ins_pipe( pipe_slow );
 6074 %}
 6075 
 6076 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6077   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6078              VM_Version::supports_avx512dq()) ||
 6079             (Matcher::vector_length_in_bytes(n) > 8 &&
 6080              VM_Version::supports_avx512vldq()));
 6081   match(Set dst (MulVL src (LoadVector mem)));
 6082   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6083   ins_cost(500);
 6084   ins_encode %{
 6085     assert(UseAVX > 2, "required");
 6086     int vlen_enc = vector_length_encoding(this);
 6087     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6088   %}
 6089   ins_pipe( pipe_slow );
 6090 %}
 6091 
 6092 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6093   predicate(UseAVX == 0);
 6094   match(Set dst (MulVL src1 src2));
 6095   ins_cost(500);
 6096   effect(TEMP dst, TEMP xtmp);
 6097   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6098   ins_encode %{
 6099     assert(VM_Version::supports_sse4_1(), "required");
 6100     // Get the lo-hi cross products; only their lower 32 bits are needed
 6101     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6102     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6103     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6104     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6105     __ psllq($dst$$XMMRegister, 32);
 6106     // Get the lo-lo products
 6107     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6108     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6109     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6110   %}
 6111   ins_pipe( pipe_slow );
 6112 %}
 6113 
 6114 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6115   predicate(UseAVX > 0 &&
 6116             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6117               !VM_Version::supports_avx512dq()) ||
 6118              (Matcher::vector_length_in_bytes(n) < 64 &&
 6119               !VM_Version::supports_avx512vldq())));
 6120   match(Set dst (MulVL src1 src2));
 6121   effect(TEMP xtmp1, TEMP xtmp2);
 6122   ins_cost(500);
 6123   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6124   ins_encode %{
 6125     int vlen_enc = vector_length_encoding(this);
 6126     // Get the lo-hi cross products; only their lower 32 bits are needed
 6127     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6128     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6129     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6130     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6131     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6132     // Get the lo-lo products
 6133     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6134     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6135   %}
 6136   ins_pipe( pipe_slow );
 6137 %}
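
      // A scalar sketch (illustrative only) of the 64x64-bit multiply that vmulL/vmulL_reg above
      // assemble from 32-bit pieces; per lane, the pshufd/pmulld/pmuludq sequence computes:
      //
      //   uint64_t mul64(uint64_t a, uint64_t b) {
      //     uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
      //     uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
      //     uint64_t cross = (a_lo * b_hi + a_hi * b_lo) << 32;  // the "lo-hi" cross products
      //     return a_lo * b_lo + cross;                          // plus the full "lo-lo" product
      //   }
      //
      // The a_hi * b_hi term is dropped since it only contributes at bit 64 and above. When both
      // operands are known to fit in 32 bits, the cross terms vanish and a single vpmuludq/vpmuldq
      // gives the full product; see vmuludq_reg and vmuldq_reg below.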
 6138 
 6139 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6140   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6141   match(Set dst (MulVL src1 src2));
 6142   ins_cost(100);
 6143   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6144   ins_encode %{
 6145     int vlen_enc = vector_length_encoding(this);
 6146     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6147   %}
 6148   ins_pipe( pipe_slow );
 6149 %}
 6150 
 6151 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6152   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6153   match(Set dst (MulVL src1 src2));
 6154   ins_cost(100);
 6155   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6156   ins_encode %{
 6157     int vlen_enc = vector_length_encoding(this);
 6158     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6159   %}
 6160   ins_pipe( pipe_slow );
 6161 %}
 6162 
 6163 // Floats vector mul
 6164 instruct vmulF(vec dst, vec src) %{
 6165   predicate(UseAVX == 0);
 6166   match(Set dst (MulVF dst src));
 6167   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6168   ins_encode %{
 6169     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6170   %}
 6171   ins_pipe( pipe_slow );
 6172 %}
 6173 
 6174 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6175   predicate(UseAVX > 0);
 6176   match(Set dst (MulVF src1 src2));
 6177   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6178   ins_encode %{
 6179     int vlen_enc = vector_length_encoding(this);
 6180     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6181   %}
 6182   ins_pipe( pipe_slow );
 6183 %}
 6184 
 6185 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6186   predicate((UseAVX > 0) &&
 6187             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6188   match(Set dst (MulVF src (LoadVector mem)));
 6189   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6190   ins_encode %{
 6191     int vlen_enc = vector_length_encoding(this);
 6192     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6193   %}
 6194   ins_pipe( pipe_slow );
 6195 %}
 6196 
 6197 // Doubles vector mul
 6198 instruct vmulD(vec dst, vec src) %{
 6199   predicate(UseAVX == 0);
 6200   match(Set dst (MulVD dst src));
 6201   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6202   ins_encode %{
 6203     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6204   %}
 6205   ins_pipe( pipe_slow );
 6206 %}
 6207 
 6208 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6209   predicate(UseAVX > 0);
 6210   match(Set dst (MulVD src1 src2));
 6211   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6212   ins_encode %{
 6213     int vlen_enc = vector_length_encoding(this);
 6214     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6215   %}
 6216   ins_pipe( pipe_slow );
 6217 %}
 6218 
 6219 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6220   predicate((UseAVX > 0) &&
 6221             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6222   match(Set dst (MulVD src (LoadVector mem)));
 6223   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6224   ins_encode %{
 6225     int vlen_enc = vector_length_encoding(this);
 6226     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6227   %}
 6228   ins_pipe( pipe_slow );
 6229 %}
 6230 
 6231 // --------------------------------- DIV --------------------------------------
 6232 
 6233 // Floats vector div
 6234 instruct vdivF(vec dst, vec src) %{
 6235   predicate(UseAVX == 0);
 6236   match(Set dst (DivVF dst src));
 6237   format %{ "divps   $dst,$src\t! div packedF" %}
 6238   ins_encode %{
 6239     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6240   %}
 6241   ins_pipe( pipe_slow );
 6242 %}
 6243 
 6244 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6245   predicate(UseAVX > 0);
 6246   match(Set dst (DivVF src1 src2));
 6247   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6248   ins_encode %{
 6249     int vlen_enc = vector_length_encoding(this);
 6250     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6251   %}
 6252   ins_pipe( pipe_slow );
 6253 %}
 6254 
 6255 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6256   predicate((UseAVX > 0) &&
 6257             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6258   match(Set dst (DivVF src (LoadVector mem)));
 6259   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6260   ins_encode %{
 6261     int vlen_enc = vector_length_encoding(this);
 6262     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6263   %}
 6264   ins_pipe( pipe_slow );
 6265 %}
 6266 
 6267 // Doubles vector div
 6268 instruct vdivD(vec dst, vec src) %{
 6269   predicate(UseAVX == 0);
 6270   match(Set dst (DivVD dst src));
 6271   format %{ "divpd   $dst,$src\t! div packedD" %}
 6272   ins_encode %{
 6273     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6274   %}
 6275   ins_pipe( pipe_slow );
 6276 %}
 6277 
 6278 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6279   predicate(UseAVX > 0);
 6280   match(Set dst (DivVD src1 src2));
 6281   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6282   ins_encode %{
 6283     int vlen_enc = vector_length_encoding(this);
 6284     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6285   %}
 6286   ins_pipe( pipe_slow );
 6287 %}
 6288 
 6289 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6290   predicate((UseAVX > 0) &&
 6291             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6292   match(Set dst (DivVD src (LoadVector mem)));
 6293   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6294   ins_encode %{
 6295     int vlen_enc = vector_length_encoding(this);
 6296     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6297   %}
 6298   ins_pipe( pipe_slow );
 6299 %}
 6300 
 6301 // ------------------------------ MinMax ---------------------------------------
 6302 
 6303 // Byte, Short, Int vector Min/Max
 6304 instruct minmax_reg_sse(vec dst, vec src) %{
 6305   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6306             UseAVX == 0);
 6307   match(Set dst (MinV dst src));
 6308   match(Set dst (MaxV dst src));
 6309   format %{ "vector_minmax  $dst,$src\t!  " %}
 6310   ins_encode %{
 6311     assert(UseSSE >= 4, "required");
 6312 
 6313     int opcode = this->ideal_Opcode();
 6314     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6315     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6316   %}
 6317   ins_pipe( pipe_slow );
 6318 %}
 6319 
 6320 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6321   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6322             UseAVX > 0);
 6323   match(Set dst (MinV src1 src2));
 6324   match(Set dst (MaxV src1 src2));
 6325   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6326   ins_encode %{
 6327     int opcode = this->ideal_Opcode();
 6328     int vlen_enc = vector_length_encoding(this);
 6329     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6330 
 6331     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6332   %}
 6333   ins_pipe( pipe_slow );
 6334 %}
 6335 
 6336 // Long vector Min/Max
 6337 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6338   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6339             UseAVX == 0);
 6340   match(Set dst (MinV dst src));
 6341   match(Set dst (MaxV src dst));
 6342   effect(TEMP dst, TEMP tmp);
  format %{ "vector_minmaxL  $dst,$src\t! using $tmp as TEMP" %}
 6344   ins_encode %{
 6345     assert(UseSSE >= 4, "required");
 6346 
 6347     int opcode = this->ideal_Opcode();
 6348     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6349     assert(elem_bt == T_LONG, "sanity");
 6350 
 6351     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6352   %}
 6353   ins_pipe( pipe_slow );
 6354 %}
 6355 
 6356 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6357   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6358             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6359   match(Set dst (MinV src1 src2));
 6360   match(Set dst (MaxV src1 src2));
 6361   effect(TEMP dst);
 6362   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6363   ins_encode %{
 6364     int vlen_enc = vector_length_encoding(this);
 6365     int opcode = this->ideal_Opcode();
 6366     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6367     assert(elem_bt == T_LONG, "sanity");
 6368 
 6369     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6370   %}
 6371   ins_pipe( pipe_slow );
 6372 %}
 6373 
 6374 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6375   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6376             Matcher::vector_element_basic_type(n) == T_LONG);
 6377   match(Set dst (MinV src1 src2));
 6378   match(Set dst (MaxV src1 src2));
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6380   ins_encode %{
 6381     assert(UseAVX > 2, "required");
 6382 
 6383     int vlen_enc = vector_length_encoding(this);
 6384     int opcode = this->ideal_Opcode();
 6385     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6386     assert(elem_bt == T_LONG, "sanity");
 6387 
 6388     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6389   %}
 6390   ins_pipe( pipe_slow );
 6391 %}
 6392 
 6393 // Float/Double vector Min/Max
 6394 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6395   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6396             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6397             UseAVX > 0);
 6398   match(Set dst (MinV a b));
 6399   match(Set dst (MaxV a b));
 6400   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6401   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6402   ins_encode %{
 6403     assert(UseAVX > 0, "required");
 6404 
 6405     int opcode = this->ideal_Opcode();
 6406     int vlen_enc = vector_length_encoding(this);
 6407     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6408 
 6409     __ vminmax_fp(opcode, elem_bt,
 6410                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6412   %}
 6413   ins_pipe( pipe_slow );
 6414 %}
 6415 
 6416 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6417   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6418             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6419   match(Set dst (MinV a b));
 6420   match(Set dst (MaxV a b));
 6421   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6422   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6423   ins_encode %{
 6424     assert(UseAVX > 2, "required");
 6425 
 6426     int opcode = this->ideal_Opcode();
 6427     int vlen_enc = vector_length_encoding(this);
 6428     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6429 
 6430     __ evminmax_fp(opcode, elem_bt,
 6431                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6433   %}
 6434   ins_pipe( pipe_slow );
 6435 %}
 6436 
 6437 // ------------------------------ Unsigned vector Min/Max ----------------------
 6438 
 6439 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6440   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6441   match(Set dst (UMinV a b));
 6442   match(Set dst (UMaxV a b));
 6443   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6444   ins_encode %{
 6445     int opcode = this->ideal_Opcode();
 6446     int vlen_enc = vector_length_encoding(this);
 6447     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6448     assert(is_integral_type(elem_bt), "");
 6449     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6450   %}
 6451   ins_pipe( pipe_slow );
 6452 %}
 6453 
 6454 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6455   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6456   match(Set dst (UMinV a (LoadVector b)));
 6457   match(Set dst (UMaxV a (LoadVector b)));
 6458   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6459   ins_encode %{
 6460     int opcode = this->ideal_Opcode();
 6461     int vlen_enc = vector_length_encoding(this);
 6462     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6463     assert(is_integral_type(elem_bt), "");
 6464     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6465   %}
 6466   ins_pipe( pipe_slow );
 6467 %}
 6468 
 6469 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6470   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6471   match(Set dst (UMinV a b));
 6472   match(Set dst (UMaxV a b));
 6473   effect(TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6475   ins_encode %{
 6476     int opcode = this->ideal_Opcode();
 6477     int vlen_enc = vector_length_encoding(this);
 6478     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6479   %}
 6480   ins_pipe( pipe_slow );
 6481 %}
 6482 
 6483 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6484   match(Set dst (UMinV (Binary dst src2) mask));
 6485   match(Set dst (UMaxV (Binary dst src2) mask));
 6486   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6487   ins_encode %{
 6488     int vlen_enc = vector_length_encoding(this);
 6489     BasicType bt = Matcher::vector_element_basic_type(this);
 6490     int opc = this->ideal_Opcode();
 6491     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6492                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6493   %}
 6494   ins_pipe( pipe_slow );
 6495 %}
 6496 
 6497 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6498   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6499   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6500   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6501   ins_encode %{
 6502     int vlen_enc = vector_length_encoding(this);
 6503     BasicType bt = Matcher::vector_element_basic_type(this);
 6504     int opc = this->ideal_Opcode();
 6505     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6506                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6507   %}
 6508   ins_pipe( pipe_slow );
 6509 %}
 6510 
 6511 // --------------------------------- Signum/CopySign ---------------------------
 6512 
 6513 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6514   match(Set dst (SignumF dst (Binary zero one)));
 6515   effect(KILL cr);
 6516   format %{ "signumF $dst, $dst" %}
 6517   ins_encode %{
 6518     int opcode = this->ideal_Opcode();
 6519     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6520   %}
 6521   ins_pipe( pipe_slow );
 6522 %}
 6523 
 6524 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6525   match(Set dst (SignumD dst (Binary zero one)));
 6526   effect(KILL cr);
 6527   format %{ "signumD $dst, $dst" %}
 6528   ins_encode %{
 6529     int opcode = this->ideal_Opcode();
 6530     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6531   %}
 6532   ins_pipe( pipe_slow );
 6533 %}
 6534 
 6535 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6536   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6537   match(Set dst (SignumVF src (Binary zero one)));
 6538   match(Set dst (SignumVD src (Binary zero one)));
 6539   effect(TEMP dst, TEMP xtmp1);
 6540   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6541   ins_encode %{
 6542     int opcode = this->ideal_Opcode();
 6543     int vec_enc = vector_length_encoding(this);
 6544     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6545                          $xtmp1$$XMMRegister, vec_enc);
 6546   %}
 6547   ins_pipe( pipe_slow );
 6548 %}
 6549 
 6550 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6551   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6552   match(Set dst (SignumVF src (Binary zero one)));
 6553   match(Set dst (SignumVD src (Binary zero one)));
 6554   effect(TEMP dst, TEMP ktmp1);
 6555   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6556   ins_encode %{
 6557     int opcode = this->ideal_Opcode();
 6558     int vec_enc = vector_length_encoding(this);
 6559     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6560                           $ktmp1$$KRegister, vec_enc);
 6561   %}
 6562   ins_pipe( pipe_slow );
 6563 %}
 6564 
 6565 // ---------------------------------------
// For copySign use 0xE4 as the immediate (bit-wise truth table) for vpternlog
 6567 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6568 // C (xmm2) is set to 0x7FFFFFFF
 6569 // Wherever xmm2 is 0, we want to pick from B (sign)
 6570 // Wherever xmm2 is 1, we want to pick from A (src)
 6571 //
 6572 // A B C Result
 6573 // 0 0 0 0
 6574 // 0 0 1 0
 6575 // 0 1 0 1
 6576 // 0 1 1 0
 6577 // 1 0 0 0
 6578 // 1 0 1 1
 6579 // 1 1 0 1
 6580 // 1 1 1 1
 6581 //
// Result going from high bit to low bit is 0b11100100 = 0xe4
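// (Each result bit of vpternlog is the imm8 bit at index (A<<2)|(B<<1)|C, so
// reading the Result column from index 7 down to index 0 yields that constant.)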
 6583 // ---------------------------------------
 6584 
 6585 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6586   match(Set dst (CopySignF dst src));
 6587   effect(TEMP tmp1, TEMP tmp2);
 6588   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6589   ins_encode %{
 6590     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6591     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6592     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6593   %}
 6594   ins_pipe( pipe_slow );
 6595 %}
 6596 
 6597 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6598   match(Set dst (CopySignD dst (Binary src zero)));
 6599   ins_cost(100);
 6600   effect(TEMP tmp1, TEMP tmp2);
 6601   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6602   ins_encode %{
 6603     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6604     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6605     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6606   %}
 6607   ins_pipe( pipe_slow );
 6608 %}
 6609 
 6610 //----------------------------- CompressBits/ExpandBits ------------------------
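// These map directly onto the BMI2 PEXT/PDEP instructions: PEXT gathers the
// src bits selected by set mask bits into the low-order bits of dst, while
// PDEP scatters the low-order src bits out to the mask's set bit positions.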
 6611 
 6612 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6613   predicate(n->bottom_type()->isa_int());
 6614   match(Set dst (CompressBits src mask));
 6615   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6616   ins_encode %{
 6617     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6618   %}
 6619   ins_pipe( pipe_slow );
 6620 %}
 6621 
 6622 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6623   predicate(n->bottom_type()->isa_int());
 6624   match(Set dst (ExpandBits src mask));
 6625   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6626   ins_encode %{
 6627     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6628   %}
 6629   ins_pipe( pipe_slow );
 6630 %}
 6631 
 6632 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6633   predicate(n->bottom_type()->isa_int());
 6634   match(Set dst (CompressBits src (LoadI mask)));
 6635   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6636   ins_encode %{
 6637     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6638   %}
 6639   ins_pipe( pipe_slow );
 6640 %}
 6641 
 6642 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6643   predicate(n->bottom_type()->isa_int());
 6644   match(Set dst (ExpandBits src (LoadI mask)));
 6645   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6646   ins_encode %{
 6647     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6648   %}
 6649   ins_pipe( pipe_slow );
 6650 %}
 6651 
 6652 // --------------------------------- Sqrt --------------------------------------
 6653 
 6654 instruct vsqrtF_reg(vec dst, vec src) %{
 6655   match(Set dst (SqrtVF src));
 6656   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6657   ins_encode %{
 6658     assert(UseAVX > 0, "required");
 6659     int vlen_enc = vector_length_encoding(this);
 6660     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6661   %}
 6662   ins_pipe( pipe_slow );
 6663 %}
 6664 
 6665 instruct vsqrtF_mem(vec dst, memory mem) %{
 6666   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6667   match(Set dst (SqrtVF (LoadVector mem)));
 6668   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6669   ins_encode %{
 6670     assert(UseAVX > 0, "required");
 6671     int vlen_enc = vector_length_encoding(this);
 6672     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6673   %}
 6674   ins_pipe( pipe_slow );
 6675 %}
 6676 
// Doubles vector sqrt
 6678 instruct vsqrtD_reg(vec dst, vec src) %{
 6679   match(Set dst (SqrtVD src));
 6680   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6681   ins_encode %{
 6682     assert(UseAVX > 0, "required");
 6683     int vlen_enc = vector_length_encoding(this);
 6684     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6685   %}
 6686   ins_pipe( pipe_slow );
 6687 %}
 6688 
 6689 instruct vsqrtD_mem(vec dst, memory mem) %{
 6690   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6691   match(Set dst (SqrtVD (LoadVector mem)));
 6692   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6693   ins_encode %{
 6694     assert(UseAVX > 0, "required");
 6695     int vlen_enc = vector_length_encoding(this);
 6696     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6697   %}
 6698   ins_pipe( pipe_slow );
 6699 %}
 6700 
 6701 // ------------------------------ Shift ---------------------------------------
 6702 
 6703 // Left and right shift count vectors are the same on x86
 6704 // (only lowest bits of xmm reg are used for count).
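// The shift-by-xmm instruction forms take the count from the low 64 bits of
// the count operand, so the value loaded by movdl serves both directions.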
 6705 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6706   match(Set dst (LShiftCntV cnt));
 6707   match(Set dst (RShiftCntV cnt));
 6708   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6709   ins_encode %{
 6710     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6711   %}
 6712   ins_pipe( pipe_slow );
 6713 %}
 6714 
 6715 // Byte vector shift
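// x86 has no byte-granularity vector shifts, so the rules below widen bytes
// to words, shift as words, mask the results back to 8 bits and re-pack them.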
 6716 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6717   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6718   match(Set dst ( LShiftVB src shift));
 6719   match(Set dst ( RShiftVB src shift));
 6720   match(Set dst (URShiftVB src shift));
 6721   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6722   format %{"vector_byte_shift $dst,$src,$shift" %}
 6723   ins_encode %{
 6724     assert(UseSSE > 3, "required");
 6725     int opcode = this->ideal_Opcode();
 6726     bool sign = (opcode != Op_URShiftVB);
 6727     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6728     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6729     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6730     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6731     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6732   %}
 6733   ins_pipe( pipe_slow );
 6734 %}
 6735 
 6736 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6737   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6738             UseAVX <= 1);
 6739   match(Set dst ( LShiftVB src shift));
 6740   match(Set dst ( RShiftVB src shift));
 6741   match(Set dst (URShiftVB src shift));
 6742   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6743   format %{"vector_byte_shift $dst,$src,$shift" %}
 6744   ins_encode %{
 6745     assert(UseSSE > 3, "required");
 6746     int opcode = this->ideal_Opcode();
 6747     bool sign = (opcode != Op_URShiftVB);
 6748     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6749     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6750     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6751     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6752     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6753     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6754     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6755     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6756     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6757   %}
 6758   ins_pipe( pipe_slow );
 6759 %}
 6760 
 6761 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6762   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6763             UseAVX > 1);
 6764   match(Set dst ( LShiftVB src shift));
 6765   match(Set dst ( RShiftVB src shift));
 6766   match(Set dst (URShiftVB src shift));
 6767   effect(TEMP dst, TEMP tmp);
 6768   format %{"vector_byte_shift $dst,$src,$shift" %}
 6769   ins_encode %{
 6770     int opcode = this->ideal_Opcode();
 6771     bool sign = (opcode != Op_URShiftVB);
 6772     int vlen_enc = Assembler::AVX_256bit;
 6773     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6774     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6775     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6776     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6777     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6778   %}
 6779   ins_pipe( pipe_slow );
 6780 %}
 6781 
 6782 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6783   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6784   match(Set dst ( LShiftVB src shift));
 6785   match(Set dst ( RShiftVB src shift));
 6786   match(Set dst (URShiftVB src shift));
 6787   effect(TEMP dst, TEMP tmp);
 6788   format %{"vector_byte_shift $dst,$src,$shift" %}
 6789   ins_encode %{
 6790     assert(UseAVX > 1, "required");
 6791     int opcode = this->ideal_Opcode();
 6792     bool sign = (opcode != Op_URShiftVB);
 6793     int vlen_enc = Assembler::AVX_256bit;
 6794     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6795     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6796     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6797     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6798     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6799     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6800     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6801     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6802     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6803   %}
 6804   ins_pipe( pipe_slow );
 6805 %}
 6806 
 6807 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6808   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6809   match(Set dst ( LShiftVB src shift));
 6810   match(Set dst  (RShiftVB src shift));
 6811   match(Set dst (URShiftVB src shift));
 6812   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6813   format %{"vector_byte_shift $dst,$src,$shift" %}
 6814   ins_encode %{
 6815     assert(UseAVX > 2, "required");
 6816     int opcode = this->ideal_Opcode();
 6817     bool sign = (opcode != Op_URShiftVB);
 6818     int vlen_enc = Assembler::AVX_512bit;
 6819     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6820     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6821     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6822     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6823     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6824     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6825     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6826     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6827     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6828     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6829     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6830     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6831   %}
 6832   ins_pipe( pipe_slow );
 6833 %}
 6834 
// A logical right shift of a short vector would produce an incorrect Java
// result for negative data, because Java code converts short values into int
// with sign extension before shifting. Char vectors are fine, since chars are
// unsigned values.
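// For example, for short s = -1 Java shifts the sign-extended int 0xFFFFFFFF,
// so (s >>> 2) is 0x3FFFFFFF (0xFFFF once truncated back to short), while a
// 16-bit lane shift would give 0x3FFF. For char c = '\uFFFF' the zero-extended
// int is 0x0000FFFF and both forms agree on 0x3FFF.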
 6839 // Shorts/Chars vector left shift
 6840 instruct vshiftS(vec dst, vec src, vec shift) %{
 6841   predicate(!n->as_ShiftV()->is_var_shift());
 6842   match(Set dst ( LShiftVS src shift));
 6843   match(Set dst ( RShiftVS src shift));
 6844   match(Set dst (URShiftVS src shift));
 6845   effect(TEMP dst, USE src, USE shift);
 6846   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6847   ins_encode %{
 6848     int opcode = this->ideal_Opcode();
 6849     if (UseAVX > 0) {
 6850       int vlen_enc = vector_length_encoding(this);
 6851       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6852     } else {
 6853       int vlen = Matcher::vector_length(this);
 6854       if (vlen == 2) {
 6855         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6856         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6857       } else if (vlen == 4) {
 6858         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6859         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6860       } else {
 6861         assert (vlen == 8, "sanity");
 6862         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6863         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6864       }
 6865     }
 6866   %}
 6867   ins_pipe( pipe_slow );
 6868 %}
 6869 
 6870 // Integers vector left shift
 6871 instruct vshiftI(vec dst, vec src, vec shift) %{
 6872   predicate(!n->as_ShiftV()->is_var_shift());
 6873   match(Set dst ( LShiftVI src shift));
 6874   match(Set dst ( RShiftVI src shift));
 6875   match(Set dst (URShiftVI src shift));
 6876   effect(TEMP dst, USE src, USE shift);
 6877   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6878   ins_encode %{
 6879     int opcode = this->ideal_Opcode();
 6880     if (UseAVX > 0) {
 6881       int vlen_enc = vector_length_encoding(this);
 6882       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6883     } else {
 6884       int vlen = Matcher::vector_length(this);
 6885       if (vlen == 2) {
 6886         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6887         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6888       } else {
 6889         assert(vlen == 4, "sanity");
 6890         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6891         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6892       }
 6893     }
 6894   %}
 6895   ins_pipe( pipe_slow );
 6896 %}
 6897 
 6898 // Integers vector left constant shift
 6899 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6900   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6901   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6902   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6903   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6904   ins_encode %{
 6905     int opcode = this->ideal_Opcode();
 6906     if (UseAVX > 0) {
 6907       int vector_len = vector_length_encoding(this);
 6908       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6909     } else {
 6910       int vlen = Matcher::vector_length(this);
 6911       if (vlen == 2) {
 6912         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6913         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6914       } else {
 6915         assert(vlen == 4, "sanity");
 6916         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6917         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6918       }
 6919     }
 6920   %}
 6921   ins_pipe( pipe_slow );
 6922 %}
 6923 
 6924 // Longs vector shift
 6925 instruct vshiftL(vec dst, vec src, vec shift) %{
 6926   predicate(!n->as_ShiftV()->is_var_shift());
 6927   match(Set dst ( LShiftVL src shift));
 6928   match(Set dst (URShiftVL src shift));
 6929   effect(TEMP dst, USE src, USE shift);
 6930   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6931   ins_encode %{
 6932     int opcode = this->ideal_Opcode();
 6933     if (UseAVX > 0) {
 6934       int vlen_enc = vector_length_encoding(this);
 6935       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6936     } else {
 6937       assert(Matcher::vector_length(this) == 2, "");
 6938       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6939       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6940     }
 6941   %}
 6942   ins_pipe( pipe_slow );
 6943 %}
 6944 
 6945 // Longs vector constant shift
 6946 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6947   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6948   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6949   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6950   ins_encode %{
 6951     int opcode = this->ideal_Opcode();
 6952     if (UseAVX > 0) {
 6953       int vector_len = vector_length_encoding(this);
 6954       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6955     } else {
 6956       assert(Matcher::vector_length(this) == 2, "");
 6957       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6958       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6959     }
 6960   %}
 6961   ins_pipe( pipe_slow );
 6962 %}
 6963 
 6964 // -------------------ArithmeticRightShift -----------------------------------
 6965 // Long vector arithmetic right shift
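// SSE2/AVX2 provide no 64-bit arithmetic right shift instruction, so the
// UseAVX <= 2 rule below emulates it via the sign-mask identity (a sketch of
// the trick):
//   x >> s == ((x >>> s) ^ m) - m,  where m = 0x8000000000000000 >>> s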
 6966 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6967   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6968   match(Set dst (RShiftVL src shift));
 6969   effect(TEMP dst, TEMP tmp);
 6970   format %{ "vshiftq $dst,$src,$shift" %}
 6971   ins_encode %{
 6972     uint vlen = Matcher::vector_length(this);
 6973     if (vlen == 2) {
 6974       assert(UseSSE >= 2, "required");
 6975       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6976       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6977       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6978       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6979       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6980       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6981     } else {
 6982       assert(vlen == 4, "sanity");
 6983       assert(UseAVX > 1, "required");
 6984       int vlen_enc = Assembler::AVX_256bit;
 6985       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6986       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6987       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6988       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6989       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6990     }
 6991   %}
 6992   ins_pipe( pipe_slow );
 6993 %}
 6994 
 6995 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6996   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6997   match(Set dst (RShiftVL src shift));
 6998   format %{ "vshiftq $dst,$src,$shift" %}
 6999   ins_encode %{
 7000     int vlen_enc = vector_length_encoding(this);
 7001     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7002   %}
 7003   ins_pipe( pipe_slow );
 7004 %}
 7005 
 7006 // ------------------- Variable Shift -----------------------------
 7007 // Byte variable shift
 7008 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7009   predicate(Matcher::vector_length(n) <= 8 &&
 7010             n->as_ShiftV()->is_var_shift() &&
 7011             !VM_Version::supports_avx512bw());
 7012   match(Set dst ( LShiftVB src shift));
 7013   match(Set dst ( RShiftVB src shift));
 7014   match(Set dst (URShiftVB src shift));
 7015   effect(TEMP dst, TEMP vtmp);
 7016   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7017   ins_encode %{
 7018     assert(UseAVX >= 2, "required");
 7019 
 7020     int opcode = this->ideal_Opcode();
 7021     int vlen_enc = Assembler::AVX_128bit;
 7022     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7023     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7024   %}
 7025   ins_pipe( pipe_slow );
 7026 %}
 7027 
 7028 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7029   predicate(Matcher::vector_length(n) == 16 &&
 7030             n->as_ShiftV()->is_var_shift() &&
 7031             !VM_Version::supports_avx512bw());
 7032   match(Set dst ( LShiftVB src shift));
 7033   match(Set dst ( RShiftVB src shift));
 7034   match(Set dst (URShiftVB src shift));
 7035   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7036   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7037   ins_encode %{
 7038     assert(UseAVX >= 2, "required");
 7039 
 7040     int opcode = this->ideal_Opcode();
 7041     int vlen_enc = Assembler::AVX_128bit;
 7042     // Shift lower half and get word result in dst
 7043     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7044 
 7045     // Shift upper half and get word result in vtmp1
 7046     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7047     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7048     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7049 
 7050     // Merge and down convert the two word results to byte in dst
 7051     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7052   %}
 7053   ins_pipe( pipe_slow );
 7054 %}
 7055 
 7056 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7057   predicate(Matcher::vector_length(n) == 32 &&
 7058             n->as_ShiftV()->is_var_shift() &&
 7059             !VM_Version::supports_avx512bw());
 7060   match(Set dst ( LShiftVB src shift));
 7061   match(Set dst ( RShiftVB src shift));
 7062   match(Set dst (URShiftVB src shift));
 7063   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7065   ins_encode %{
 7066     assert(UseAVX >= 2, "required");
 7067 
 7068     int opcode = this->ideal_Opcode();
 7069     int vlen_enc = Assembler::AVX_128bit;
 7070     // Process lower 128 bits and get result in dst
 7071     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7072     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7073     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7074     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7075     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7076 
 7077     // Process higher 128 bits and get result in vtmp3
 7078     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7079     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7080     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7081     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7082     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7083     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7084     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7085 
 7086     // Merge the two results in dst
 7087     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7088   %}
 7089   ins_pipe( pipe_slow );
 7090 %}
 7091 
 7092 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7093   predicate(Matcher::vector_length(n) <= 32 &&
 7094             n->as_ShiftV()->is_var_shift() &&
 7095             VM_Version::supports_avx512bw());
 7096   match(Set dst ( LShiftVB src shift));
 7097   match(Set dst ( RShiftVB src shift));
 7098   match(Set dst (URShiftVB src shift));
 7099   effect(TEMP dst, TEMP vtmp);
 7100   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7101   ins_encode %{
 7102     assert(UseAVX > 2, "required");
 7103 
 7104     int opcode = this->ideal_Opcode();
 7105     int vlen_enc = vector_length_encoding(this);
 7106     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7107   %}
 7108   ins_pipe( pipe_slow );
 7109 %}
 7110 
 7111 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7112   predicate(Matcher::vector_length(n) == 64 &&
 7113             n->as_ShiftV()->is_var_shift() &&
 7114             VM_Version::supports_avx512bw());
 7115   match(Set dst ( LShiftVB src shift));
 7116   match(Set dst ( RShiftVB src shift));
 7117   match(Set dst (URShiftVB src shift));
 7118   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7119   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7120   ins_encode %{
 7121     assert(UseAVX > 2, "required");
 7122 
 7123     int opcode = this->ideal_Opcode();
 7124     int vlen_enc = Assembler::AVX_256bit;
 7125     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7126     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7127     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7128     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7129     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7130   %}
 7131   ins_pipe( pipe_slow );
 7132 %}
 7133 
 7134 // Short variable shift
 7135 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7136   predicate(Matcher::vector_length(n) <= 8 &&
 7137             n->as_ShiftV()->is_var_shift() &&
 7138             !VM_Version::supports_avx512bw());
 7139   match(Set dst ( LShiftVS src shift));
 7140   match(Set dst ( RShiftVS src shift));
 7141   match(Set dst (URShiftVS src shift));
 7142   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7144   ins_encode %{
 7145     assert(UseAVX >= 2, "required");
 7146 
 7147     int opcode = this->ideal_Opcode();
 7148     bool sign = (opcode != Op_URShiftVS);
 7149     int vlen_enc = Assembler::AVX_256bit;
 7150     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7151     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7152     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7153     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7154     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7155     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7156   %}
 7157   ins_pipe( pipe_slow );
 7158 %}
 7159 
 7160 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7161   predicate(Matcher::vector_length(n) == 16 &&
 7162             n->as_ShiftV()->is_var_shift() &&
 7163             !VM_Version::supports_avx512bw());
 7164   match(Set dst ( LShiftVS src shift));
 7165   match(Set dst ( RShiftVS src shift));
 7166   match(Set dst (URShiftVS src shift));
 7167   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7169   ins_encode %{
 7170     assert(UseAVX >= 2, "required");
 7171 
 7172     int opcode = this->ideal_Opcode();
 7173     bool sign = (opcode != Op_URShiftVS);
 7174     int vlen_enc = Assembler::AVX_256bit;
 7175     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7176     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7177     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7178     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7179     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7180 
 7181     // Shift upper half, with result in dst using vtmp1 as TEMP
 7182     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7183     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7184     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7185     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7186     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7187     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7188 
 7189     // Merge lower and upper half result into dst
 7190     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7191     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7192   %}
 7193   ins_pipe( pipe_slow );
 7194 %}
 7195 
 7196 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7197   predicate(n->as_ShiftV()->is_var_shift() &&
 7198             VM_Version::supports_avx512bw());
 7199   match(Set dst ( LShiftVS src shift));
 7200   match(Set dst ( RShiftVS src shift));
 7201   match(Set dst (URShiftVS src shift));
 7202   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7203   ins_encode %{
 7204     assert(UseAVX > 2, "required");
 7205 
 7206     int opcode = this->ideal_Opcode();
 7207     int vlen_enc = vector_length_encoding(this);
 7208     if (!VM_Version::supports_avx512vl()) {
 7209       vlen_enc = Assembler::AVX_512bit;
 7210     }
 7211     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7212   %}
 7213   ins_pipe( pipe_slow );
 7214 %}
 7215 
// Integer variable shift
 7217 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7218   predicate(n->as_ShiftV()->is_var_shift());
 7219   match(Set dst ( LShiftVI src shift));
 7220   match(Set dst ( RShiftVI src shift));
 7221   match(Set dst (URShiftVI src shift));
 7222   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7223   ins_encode %{
 7224     assert(UseAVX >= 2, "required");
 7225 
 7226     int opcode = this->ideal_Opcode();
 7227     int vlen_enc = vector_length_encoding(this);
 7228     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7229   %}
 7230   ins_pipe( pipe_slow );
 7231 %}
 7232 
// Long variable shift
 7234 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7235   predicate(n->as_ShiftV()->is_var_shift());
 7236   match(Set dst ( LShiftVL src shift));
 7237   match(Set dst (URShiftVL src shift));
 7238   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7239   ins_encode %{
 7240     assert(UseAVX >= 2, "required");
 7241 
 7242     int opcode = this->ideal_Opcode();
 7243     int vlen_enc = vector_length_encoding(this);
 7244     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7245   %}
 7246   ins_pipe( pipe_slow );
 7247 %}
 7248 
// Long variable arithmetic right shift
 7250 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7251   predicate(Matcher::vector_length(n) <= 4 &&
 7252             n->as_ShiftV()->is_var_shift() &&
 7253             UseAVX == 2);
 7254   match(Set dst (RShiftVL src shift));
 7255   effect(TEMP dst, TEMP vtmp);
 7256   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7257   ins_encode %{
 7258     int opcode = this->ideal_Opcode();
 7259     int vlen_enc = vector_length_encoding(this);
 7260     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7261                  $vtmp$$XMMRegister);
 7262   %}
 7263   ins_pipe( pipe_slow );
 7264 %}
 7265 
 7266 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7267   predicate(n->as_ShiftV()->is_var_shift() &&
 7268             UseAVX > 2);
 7269   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7271   ins_encode %{
 7272     int opcode = this->ideal_Opcode();
 7273     int vlen_enc = vector_length_encoding(this);
 7274     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7275   %}
 7276   ins_pipe( pipe_slow );
 7277 %}
 7278 
 7279 // --------------------------------- AND --------------------------------------
 7280 
 7281 instruct vand(vec dst, vec src) %{
 7282   predicate(UseAVX == 0);
 7283   match(Set dst (AndV dst src));
 7284   format %{ "pand    $dst,$src\t! and vectors" %}
 7285   ins_encode %{
 7286     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7287   %}
 7288   ins_pipe( pipe_slow );
 7289 %}
 7290 
 7291 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7292   predicate(UseAVX > 0);
 7293   match(Set dst (AndV src1 src2));
 7294   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7295   ins_encode %{
 7296     int vlen_enc = vector_length_encoding(this);
 7297     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7298   %}
 7299   ins_pipe( pipe_slow );
 7300 %}
 7301 
 7302 instruct vand_mem(vec dst, vec src, memory mem) %{
 7303   predicate((UseAVX > 0) &&
 7304             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7305   match(Set dst (AndV src (LoadVector mem)));
 7306   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7307   ins_encode %{
 7308     int vlen_enc = vector_length_encoding(this);
 7309     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7310   %}
 7311   ins_pipe( pipe_slow );
 7312 %}
 7313 
 7314 // --------------------------------- OR ---------------------------------------
 7315 
 7316 instruct vor(vec dst, vec src) %{
 7317   predicate(UseAVX == 0);
 7318   match(Set dst (OrV dst src));
 7319   format %{ "por     $dst,$src\t! or vectors" %}
 7320   ins_encode %{
 7321     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7322   %}
 7323   ins_pipe( pipe_slow );
 7324 %}
 7325 
 7326 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7327   predicate(UseAVX > 0);
 7328   match(Set dst (OrV src1 src2));
 7329   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7330   ins_encode %{
 7331     int vlen_enc = vector_length_encoding(this);
 7332     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7333   %}
 7334   ins_pipe( pipe_slow );
 7335 %}
 7336 
 7337 instruct vor_mem(vec dst, vec src, memory mem) %{
 7338   predicate((UseAVX > 0) &&
 7339             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7340   match(Set dst (OrV src (LoadVector mem)));
 7341   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7342   ins_encode %{
 7343     int vlen_enc = vector_length_encoding(this);
 7344     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7345   %}
 7346   ins_pipe( pipe_slow );
 7347 %}
 7348 
 7349 // --------------------------------- XOR --------------------------------------
 7350 
 7351 instruct vxor(vec dst, vec src) %{
 7352   predicate(UseAVX == 0);
 7353   match(Set dst (XorV dst src));
 7354   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7355   ins_encode %{
 7356     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7357   %}
 7358   ins_pipe( pipe_slow );
 7359 %}
 7360 
 7361 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7362   predicate(UseAVX > 0);
 7363   match(Set dst (XorV src1 src2));
 7364   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7365   ins_encode %{
 7366     int vlen_enc = vector_length_encoding(this);
 7367     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7368   %}
 7369   ins_pipe( pipe_slow );
 7370 %}
 7371 
 7372 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7373   predicate((UseAVX > 0) &&
 7374             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7375   match(Set dst (XorV src (LoadVector mem)));
 7376   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7377   ins_encode %{
 7378     int vlen_enc = vector_length_encoding(this);
 7379     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7380   %}
 7381   ins_pipe( pipe_slow );
 7382 %}
 7383 
 7384 // --------------------------------- VectorCast --------------------------------------
 7385 
 7386 instruct vcastBtoX(vec dst, vec src) %{
 7387   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7388   match(Set dst (VectorCastB2X src));
 7389   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7390   ins_encode %{
 7391     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7392     int vlen_enc = vector_length_encoding(this);
 7393     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7394   %}
 7395   ins_pipe( pipe_slow );
 7396 %}
 7397 
 7398 instruct vcastBtoD(legVec dst, legVec src) %{
 7399   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7400   match(Set dst (VectorCastB2X src));
 7401   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7402   ins_encode %{
 7403     int vlen_enc = vector_length_encoding(this);
 7404     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7405   %}
 7406   ins_pipe( pipe_slow );
 7407 %}
 7408 
 7409 instruct castStoX(vec dst, vec src) %{
 7410   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7411             Matcher::vector_length(n->in(1)) <= 8 && // src
 7412             Matcher::vector_element_basic_type(n) == T_BYTE);
 7413   match(Set dst (VectorCastS2X src));
 7414   format %{ "vector_cast_s2x $dst,$src" %}
 7415   ins_encode %{
 7416     assert(UseAVX > 0, "required");
 7417 
 7418     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7419     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7420   %}
 7421   ins_pipe( pipe_slow );
 7422 %}
 7423 
 7424 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7425   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7426             Matcher::vector_length(n->in(1)) == 16 && // src
 7427             Matcher::vector_element_basic_type(n) == T_BYTE);
 7428   effect(TEMP dst, TEMP vtmp);
 7429   match(Set dst (VectorCastS2X src));
 7430   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7431   ins_encode %{
 7432     assert(UseAVX > 0, "required");
 7433 
 7434     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7435     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7436     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7437     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7438   %}
 7439   ins_pipe( pipe_slow );
 7440 %}
 7441 
 7442 instruct vcastStoX_evex(vec dst, vec src) %{
 7443   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7444             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7445   match(Set dst (VectorCastS2X src));
 7446   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7447   ins_encode %{
 7448     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7449     int src_vlen_enc = vector_length_encoding(this, $src);
 7450     int vlen_enc = vector_length_encoding(this);
 7451     switch (to_elem_bt) {
 7452       case T_BYTE:
 7453         if (!VM_Version::supports_avx512vl()) {
 7454           vlen_enc = Assembler::AVX_512bit;
 7455         }
 7456         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7457         break;
 7458       case T_INT:
 7459         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7460         break;
 7461       case T_FLOAT:
 7462         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7463         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7464         break;
 7465       case T_LONG:
 7466         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7467         break;
 7468       case T_DOUBLE: {
 7469         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7470         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7471         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7472         break;
 7473       }
 7474       default:
 7475         ShouldNotReachHere();
 7476     }
 7477   %}
 7478   ins_pipe( pipe_slow );
 7479 %}
 7480 
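// Int -> byte/short narrowing without AVX512: mask each int element down to the target
// width, then pack dwords to words (and words to bytes when targeting T_BYTE); 256-bit
// sources first fold the upper 128-bit lane into the lower one.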
 7481 instruct castItoX(vec dst, vec src) %{
 7482   predicate(UseAVX <= 2 &&
 7483             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7484             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7485   match(Set dst (VectorCastI2X src));
 7486   format %{ "vector_cast_i2x $dst,$src" %}
 7487   ins_encode %{
 7488     assert(UseAVX > 0, "required");
 7489 
 7490     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7491     int vlen_enc = vector_length_encoding(this, $src);
 7492 
 7493     if (to_elem_bt == T_BYTE) {
 7494       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7495       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7496       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7497     } else {
 7498       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7499       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7500       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7501     }
 7502   %}
 7503   ins_pipe( pipe_slow );
 7504 %}
 7505 
 7506 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7507   predicate(UseAVX <= 2 &&
 7508             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7509             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7510   match(Set dst (VectorCastI2X src));
 7511   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7512   effect(TEMP dst, TEMP vtmp);
 7513   ins_encode %{
 7514     assert(UseAVX > 0, "required");
 7515 
 7516     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7517     int vlen_enc = vector_length_encoding(this, $src);
 7518 
 7519     if (to_elem_bt == T_BYTE) {
 7520       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7521       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7522       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7523       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7524     } else {
 7525       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7526       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7527       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7528       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7529     }
 7530   %}
 7531   ins_pipe( pipe_slow );
 7532 %}
 7533 
 7534 instruct vcastItoX_evex(vec dst, vec src) %{
 7535   predicate(UseAVX > 2 ||
 7536             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7537   match(Set dst (VectorCastI2X src));
 7538   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7539   ins_encode %{
 7540     assert(UseAVX > 0, "required");
 7541 
 7542     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7543     int src_vlen_enc = vector_length_encoding(this, $src);
 7544     int dst_vlen_enc = vector_length_encoding(this);
 7545     switch (dst_elem_bt) {
 7546       case T_BYTE:
 7547         if (!VM_Version::supports_avx512vl()) {
 7548           src_vlen_enc = Assembler::AVX_512bit;
 7549         }
 7550         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7551         break;
 7552       case T_SHORT:
 7553         if (!VM_Version::supports_avx512vl()) {
 7554           src_vlen_enc = Assembler::AVX_512bit;
 7555         }
 7556         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7557         break;
 7558       case T_FLOAT:
 7559         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7560         break;
 7561       case T_LONG:
 7562         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7563         break;
 7564       case T_DOUBLE:
 7565         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7566         break;
 7567       default:
 7568         ShouldNotReachHere();
 7569     }
 7570   %}
 7571   ins_pipe( pipe_slow );
 7572 %}
 7573 
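// Long -> byte/short narrowing on AVX/AVX2: gather the low dword of each qword
// (vpshufd, or vpermilps + vpermpd for 256-bit sources), mask to the target width,
// then pack down to shorts and, for T_BYTE, once more down to bytes.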
 7574 instruct vcastLtoBS(vec dst, vec src) %{
 7575   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7576             UseAVX <= 2);
 7577   match(Set dst (VectorCastL2X src));
 7578   format %{ "vector_cast_l2x  $dst,$src" %}
 7579   ins_encode %{
 7580     assert(UseAVX > 0, "required");
 7581 
 7582     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7583     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7584     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7585                                                       : ExternalAddress(vector_int_to_short_mask());
 7586     if (vlen <= 16) {
 7587       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7588       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7589       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7590     } else {
 7591       assert(vlen <= 32, "required");
 7592       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7593       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7594       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7595       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7596     }
 7597     if (to_elem_bt == T_BYTE) {
 7598       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7599     }
 7600   %}
 7601   ins_pipe( pipe_slow );
 7602 %}
 7603 
 7604 instruct vcastLtoX_evex(vec dst, vec src) %{
 7605   predicate(UseAVX > 2 ||
 7606             (Matcher::vector_element_basic_type(n) == T_INT ||
 7607              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7608              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7609   match(Set dst (VectorCastL2X src));
 7610   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7611   ins_encode %{
 7612     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7613     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7614     int vlen_enc = vector_length_encoding(this, $src);
 7615     switch (to_elem_bt) {
 7616       case T_BYTE:
 7617         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7618           vlen_enc = Assembler::AVX_512bit;
 7619         }
 7620         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7621         break;
 7622       case T_SHORT:
 7623         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7624           vlen_enc = Assembler::AVX_512bit;
 7625         }
 7626         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7627         break;
 7628       case T_INT:
 7629         if (vlen == 8) {
 7630           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7631             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7632           }
 7633         } else if (vlen == 16) {
 7634           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7635         } else if (vlen == 32) {
 7636           if (UseAVX > 2) {
 7637             if (!VM_Version::supports_avx512vl()) {
 7638               vlen_enc = Assembler::AVX_512bit;
 7639             }
 7640             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7641           } else {
 7642             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7643             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7644           }
 7645         } else { // vlen == 64
 7646           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7647         }
 7648         break;
 7649       case T_FLOAT:
 7650         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7651         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7652         break;
 7653       case T_DOUBLE:
 7654         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7655         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7656         break;
 7657 
 7658       default: assert(false, "%s", type2name(to_elem_bt));
 7659     }
 7660   %}
 7661   ins_pipe( pipe_slow );
 7662 %}
 7663 
 7664 instruct vcastFtoD_reg(vec dst, vec src) %{
 7665   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7666   match(Set dst (VectorCastF2X src));
 7667   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7668   ins_encode %{
 7669     int vlen_enc = vector_length_encoding(this);
 7670     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7671   %}
 7672   ins_pipe( pipe_slow );
 7673 %}
 7674 
 7675 
 7676 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7677   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7678             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7679   match(Set dst (VectorCastF2X src));
 7680   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7681   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7682   ins_encode %{
 7683     int vlen_enc = vector_length_encoding(this, $src);
 7684     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the explicit scratch register that used to be needed to load
    // addresses wider than 32 bits for register-indirect addressing, since stub constants
    // live in the code cache and ReservedCodeCacheSize is currently capped at 2G. Targets
    // are free to raise that limit, but a code cache larger than 2G is unrealistic in
    // practice, and with the cap in place we save a temporary register allocation, which
    // in the limiting case can prevent spilling in blocks with high register pressure.
 7692     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7693                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7694                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7695   %}
 7696   ins_pipe( pipe_slow );
 7697 %}
 7698 
 7699 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7700   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7701             is_integral_type(Matcher::vector_element_basic_type(n)));
 7702   match(Set dst (VectorCastF2X src));
 7703   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7704   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7705   ins_encode %{
 7706     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7707     if (to_elem_bt == T_LONG) {
 7708       int vlen_enc = vector_length_encoding(this);
 7709       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7710                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7711                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7712     } else {
 7713       int vlen_enc = vector_length_encoding(this, $src);
 7714       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7715                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7716                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7717     }
 7718   %}
 7719   ins_pipe( pipe_slow );
 7720 %}
 7721 
 7722 instruct vcastDtoF_reg(vec dst, vec src) %{
 7723   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7724   match(Set dst (VectorCastD2X src));
 7725   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7726   ins_encode %{
 7727     int vlen_enc = vector_length_encoding(this, $src);
 7728     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7729   %}
 7730   ins_pipe( pipe_slow );
 7731 %}
 7732 
 7733 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7734   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7735             is_integral_type(Matcher::vector_element_basic_type(n)));
 7736   match(Set dst (VectorCastD2X src));
 7737   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7738   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7739   ins_encode %{
 7740     int vlen_enc = vector_length_encoding(this, $src);
 7741     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7742     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7743                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7744                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7745   %}
 7746   ins_pipe( pipe_slow );
 7747 %}
 7748 
 7749 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7750   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7751             is_integral_type(Matcher::vector_element_basic_type(n)));
 7752   match(Set dst (VectorCastD2X src));
 7753   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7754   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7755   ins_encode %{
 7756     int vlen_enc = vector_length_encoding(this, $src);
 7757     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7758     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7759                               ExternalAddress(vector_float_signflip());
 7760     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7761                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7762   %}
 7763   ins_pipe( pipe_slow );
 7764 %}
 7765 
 7766 instruct vucast(vec dst, vec src) %{
 7767   match(Set dst (VectorUCastB2X src));
 7768   match(Set dst (VectorUCastS2X src));
 7769   match(Set dst (VectorUCastI2X src));
 7770   format %{ "vector_ucast $dst,$src\t!" %}
 7771   ins_encode %{
 7772     assert(UseAVX > 0, "required");
 7773 
 7774     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7775     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7776     int vlen_enc = vector_length_encoding(this);
 7777     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7778   %}
 7779   ins_pipe( pipe_slow );
 7780 %}
 7781 
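// Vector rounding (RoundVF/RoundVD): the round helpers receive a custom MXCSR value that
// selects round-toward-negative-infinity with all exceptions masked; with
// EnableX86ECoreOpts the sticky exception-flag bits are pre-set as well.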
 7782 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7783   predicate(!VM_Version::supports_avx512vl() &&
 7784             Matcher::vector_length_in_bytes(n) < 64 &&
 7785             Matcher::vector_element_basic_type(n) == T_INT);
 7786   match(Set dst (RoundVF src));
 7787   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7788   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7789   ins_encode %{
 7790     int vlen_enc = vector_length_encoding(this);
 7791     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7792     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7793                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7794                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7795   %}
 7796   ins_pipe( pipe_slow );
 7797 %}
 7798 
 7799 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7800   predicate((VM_Version::supports_avx512vl() ||
 7801              Matcher::vector_length_in_bytes(n) == 64) &&
 7802              Matcher::vector_element_basic_type(n) == T_INT);
 7803   match(Set dst (RoundVF src));
 7804   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7805   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7806   ins_encode %{
 7807     int vlen_enc = vector_length_encoding(this);
 7808     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7809     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7810                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7811                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7812   %}
 7813   ins_pipe( pipe_slow );
 7814 %}
 7815 
 7816 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7817   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7818   match(Set dst (RoundVD src));
 7819   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7820   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7821   ins_encode %{
 7822     int vlen_enc = vector_length_encoding(this);
 7823     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7824     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7825                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7826                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7827   %}
 7828   ins_pipe( pipe_slow );
 7829 %}
 7830 
 7831 // --------------------------------- VectorMaskCmp --------------------------------------
 7832 
 7833 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7834   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7835             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7836             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7837             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7838   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7839   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7840   ins_encode %{
 7841     int vlen_enc = vector_length_encoding(this, $src1);
 7842     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7843     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7844       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7845     } else {
 7846       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7847     }
 7848   %}
 7849   ins_pipe( pipe_slow );
 7850 %}
 7851 
 7852 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7853   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7854             n->bottom_type()->isa_vectmask() == nullptr &&
 7855             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7856   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7857   effect(TEMP ktmp);
 7858   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7859   ins_encode %{
 7860     int vlen_enc = Assembler::AVX_512bit;
 7861     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7862     KRegister mask = k0; // The comparison itself is not being masked.
 7863     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7864       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7865       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7866     } else {
 7867       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7868       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7869     }
 7870   %}
 7871   ins_pipe( pipe_slow );
 7872 %}
 7873 
 7874 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7875   predicate(n->bottom_type()->isa_vectmask() &&
 7876             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7877   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7878   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7879   ins_encode %{
 7880     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7881     int vlen_enc = vector_length_encoding(this, $src1);
 7882     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7883     KRegister mask = k0; // The comparison itself is not being masked.
 7884     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7885       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7886     } else {
 7887       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7888     }
 7889   %}
 7890   ins_pipe( pipe_slow );
 7891 %}
 7892 
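// Signed integer compares: eq/lt/gt map directly onto a single packed compare
// (vcmp_direct), while ne/le/ge are handled by computing the complementary predicate
// and negating the result (vcmp_negate), which needs an extra temporary vector.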
 7893 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7894   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7895             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7896             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7897             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7898             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7899             (n->in(2)->get_int() == BoolTest::eq ||
 7900              n->in(2)->get_int() == BoolTest::lt ||
 7901              n->in(2)->get_int() == BoolTest::gt)); // cond
 7902   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7903   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7904   ins_encode %{
 7905     int vlen_enc = vector_length_encoding(this, $src1);
 7906     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7907     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7908     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7909   %}
 7910   ins_pipe( pipe_slow );
 7911 %}
 7912 
 7913 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7914   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7915             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7916             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7917             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7918             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7919             (n->in(2)->get_int() == BoolTest::ne ||
 7920              n->in(2)->get_int() == BoolTest::le ||
 7921              n->in(2)->get_int() == BoolTest::ge)); // cond
 7922   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7923   effect(TEMP dst, TEMP xtmp);
 7924   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7925   ins_encode %{
 7926     int vlen_enc = vector_length_encoding(this, $src1);
 7927     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7928     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7929     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7930   %}
 7931   ins_pipe( pipe_slow );
 7932 %}
 7933 
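// SSE/AVX2 have no unsigned integer compares, so both operands are biased by xor-ing
// in a broadcast constant with the sign bit of each element set; a signed compare of
// the biased values then yields the unsigned comparison result.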
 7934 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7935   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7936             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7937             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7938             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7939             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7940   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7941   effect(TEMP dst, TEMP xtmp);
 7942   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7943   ins_encode %{
 7944     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7945     int vlen_enc = vector_length_encoding(this, $src1);
 7946     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7947     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7948 
 7949     if (vlen_enc == Assembler::AVX_128bit) {
 7950       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7951     } else {
 7952       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7953     }
 7954     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7955     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7956     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7957   %}
 7958   ins_pipe( pipe_slow );
 7959 %}
 7960 
 7961 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7962   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7963              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7964              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7965   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7966   effect(TEMP ktmp);
 7967   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7968   ins_encode %{
 7969     assert(UseAVX > 2, "required");
 7970 
 7971     int vlen_enc = vector_length_encoding(this, $src1);
 7972     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7973     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7974     KRegister mask = k0; // The comparison itself is not being masked.
 7975     bool merge = false;
 7976     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7977 
 7978     switch (src1_elem_bt) {
 7979       case T_INT: {
 7980         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7981         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7982         break;
 7983       }
 7984       case T_LONG: {
 7985         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7986         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7987         break;
 7988       }
 7989       default: assert(false, "%s", type2name(src1_elem_bt));
 7990     }
 7991   %}
 7992   ins_pipe( pipe_slow );
 7993 %}
 7994 
 7995 
 7996 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7997   predicate(n->bottom_type()->isa_vectmask() &&
 7998             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7999   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8001   ins_encode %{
 8002     assert(UseAVX > 2, "required");
 8003     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8004 
 8005     int vlen_enc = vector_length_encoding(this, $src1);
 8006     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8007     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8008     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8009 
    // Compare according to src1's element type; passing k0 means the comparison itself is not masked.
 8011     switch (src1_elem_bt) {
 8012       case T_BYTE: {
 8013         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8014         break;
 8015       }
 8016       case T_SHORT: {
 8017         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8018         break;
 8019       }
 8020       case T_INT: {
 8021         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8022         break;
 8023       }
 8024       case T_LONG: {
 8025         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8026         break;
 8027       }
 8028       default: assert(false, "%s", type2name(src1_elem_bt));
 8029     }
 8030   %}
 8031   ins_pipe( pipe_slow );
 8032 %}
 8033 
 8034 // Extract
 8035 
 8036 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8037   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8038   match(Set dst (ExtractI src idx));
 8039   match(Set dst (ExtractS src idx));
 8040   match(Set dst (ExtractB src idx));
 8041   format %{ "extractI $dst,$src,$idx\t!" %}
 8042   ins_encode %{
 8043     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8044 
 8045     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8046     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8047   %}
 8048   ins_pipe( pipe_slow );
 8049 %}
 8050 
 8051 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8052   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8053             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8054   match(Set dst (ExtractI src idx));
 8055   match(Set dst (ExtractS src idx));
 8056   match(Set dst (ExtractB src idx));
 8057   effect(TEMP vtmp);
 8058   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8059   ins_encode %{
 8060     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8061 
 8062     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8063     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8064     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8065   %}
 8066   ins_pipe( pipe_slow );
 8067 %}
 8068 
 8069 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8070   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8071   match(Set dst (ExtractL src idx));
 8072   format %{ "extractL $dst,$src,$idx\t!" %}
 8073   ins_encode %{
 8074     assert(UseSSE >= 4, "required");
 8075     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8076 
 8077     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8078   %}
 8079   ins_pipe( pipe_slow );
 8080 %}
 8081 
 8082 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8083   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8084             Matcher::vector_length(n->in(1)) == 8);  // src
 8085   match(Set dst (ExtractL src idx));
 8086   effect(TEMP vtmp);
 8087   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8088   ins_encode %{
 8089     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8090 
 8091     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8092     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8093   %}
 8094   ins_pipe( pipe_slow );
 8095 %}
 8096 
 8097 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8098   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8099   match(Set dst (ExtractF src idx));
 8100   effect(TEMP dst, TEMP vtmp);
 8101   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8102   ins_encode %{
 8103     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8104 
 8105     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8106   %}
 8107   ins_pipe( pipe_slow );
 8108 %}
 8109 
 8110 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8111   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8112             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8113   match(Set dst (ExtractF src idx));
 8114   effect(TEMP vtmp);
 8115   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8116   ins_encode %{
 8117     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8118 
 8119     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8120     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8121   %}
 8122   ins_pipe( pipe_slow );
 8123 %}
 8124 
 8125 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8126   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8127   match(Set dst (ExtractD src idx));
 8128   format %{ "extractD $dst,$src,$idx\t!" %}
 8129   ins_encode %{
 8130     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8131 
 8132     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8133   %}
 8134   ins_pipe( pipe_slow );
 8135 %}
 8136 
 8137 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8138   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8139             Matcher::vector_length(n->in(1)) == 8);  // src
 8140   match(Set dst (ExtractD src idx));
 8141   effect(TEMP vtmp);
 8142   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8143   ins_encode %{
 8144     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8145 
 8146     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8147     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8148   %}
 8149   ins_pipe( pipe_slow );
 8150 %}
 8151 
 8152 // --------------------------------- Vector Blend --------------------------------------
 8153 
 8154 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8155   predicate(UseAVX == 0);
 8156   match(Set dst (VectorBlend (Binary dst src) mask));
 8157   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8158   effect(TEMP tmp);
 8159   ins_encode %{
 8160     assert(UseSSE >= 4, "required");
 8161 
 8162     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8163       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8164     }
 8165     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8166   %}
 8167   ins_pipe( pipe_slow );
 8168 %}
 8169 
 8170 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8171   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8172             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8173             Matcher::vector_length_in_bytes(n) <= 32 &&
 8174             is_integral_type(Matcher::vector_element_basic_type(n)));
 8175   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8176   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8177   ins_encode %{
 8178     int vlen_enc = vector_length_encoding(this);
 8179     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8180   %}
 8181   ins_pipe( pipe_slow );
 8182 %}
 8183 
 8184 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8185   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8186             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8187             Matcher::vector_length_in_bytes(n) <= 32 &&
 8188             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8189   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8190   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8191   ins_encode %{
 8192     int vlen_enc = vector_length_encoding(this);
 8193     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8194   %}
 8195   ins_pipe( pipe_slow );
 8196 %}
 8197 
 8198 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8199   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8200             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8201             Matcher::vector_length_in_bytes(n) <= 32);
 8202   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8203   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8204   effect(TEMP vtmp, TEMP dst);
 8205   ins_encode %{
 8206     int vlen_enc = vector_length_encoding(this);
 8207     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8208     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8209     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8210   %}
 8211   ins_pipe( pipe_slow );
 8212 %}
 8213 
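// 512-bit blend without an opmask input: compare the vector mask against all-ones to
// materialize a k register, then use it for a merging blend that selects between
// src1 and src2.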
 8214 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8215   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8216             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8217   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8219   effect(TEMP ktmp);
 8220   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8223     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8224     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8225   %}
 8226   ins_pipe( pipe_slow );
 8227 %}
 8228 
 8229 
 8230 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8231   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8232             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8233              VM_Version::supports_avx512bw()));
 8234   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8236   ins_encode %{
 8237     int vlen_enc = vector_length_encoding(this);
 8238     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8239     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8240   %}
 8241   ins_pipe( pipe_slow );
 8242 %}
 8243 
 8244 // --------------------------------- ABS --------------------------------------
 8245 // a = |a|
 8246 instruct vabsB_reg(vec dst, vec src) %{
 8247   match(Set dst (AbsVB  src));
 8248   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8249   ins_encode %{
 8250     uint vlen = Matcher::vector_length(this);
 8251     if (vlen <= 16) {
 8252       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8253     } else {
 8254       int vlen_enc = vector_length_encoding(this);
 8255       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8256     }
 8257   %}
 8258   ins_pipe( pipe_slow );
 8259 %}
 8260 
 8261 instruct vabsS_reg(vec dst, vec src) %{
 8262   match(Set dst (AbsVS  src));
 8263   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8264   ins_encode %{
 8265     uint vlen = Matcher::vector_length(this);
 8266     if (vlen <= 8) {
 8267       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8268     } else {
 8269       int vlen_enc = vector_length_encoding(this);
 8270       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8271     }
 8272   %}
 8273   ins_pipe( pipe_slow );
 8274 %}
 8275 
 8276 instruct vabsI_reg(vec dst, vec src) %{
 8277   match(Set dst (AbsVI  src));
 8278   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8279   ins_encode %{
 8280     uint vlen = Matcher::vector_length(this);
 8281     if (vlen <= 4) {
 8282       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8283     } else {
 8284       int vlen_enc = vector_length_encoding(this);
 8285       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8286     }
 8287   %}
 8288   ins_pipe( pipe_slow );
 8289 %}
 8290 
 8291 instruct vabsL_reg(vec dst, vec src) %{
 8292   match(Set dst (AbsVL  src));
 8293   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8294   ins_encode %{
 8295     assert(UseAVX > 2, "required");
 8296     int vlen_enc = vector_length_encoding(this);
 8297     if (!VM_Version::supports_avx512vl()) {
 8298       vlen_enc = Assembler::AVX_512bit;
 8299     }
 8300     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8301   %}
 8302   ins_pipe( pipe_slow );
 8303 %}
 8304 
 8305 // --------------------------------- ABSNEG --------------------------------------
 8306 
 8307 instruct vabsnegF(vec dst, vec src) %{
 8308   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8309   match(Set dst (AbsVF src));
 8310   match(Set dst (NegVF src));
 8311   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8312   ins_cost(150);
 8313   ins_encode %{
 8314     int opcode = this->ideal_Opcode();
 8315     int vlen = Matcher::vector_length(this);
 8316     if (vlen == 2) {
 8317       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8318     } else {
 8319       assert(vlen == 8 || vlen == 16, "required");
 8320       int vlen_enc = vector_length_encoding(this);
 8321       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8322     }
 8323   %}
 8324   ins_pipe( pipe_slow );
 8325 %}
 8326 
 8327 instruct vabsneg4F(vec dst) %{
 8328   predicate(Matcher::vector_length(n) == 4);
 8329   match(Set dst (AbsVF dst));
 8330   match(Set dst (NegVF dst));
 8331   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8332   ins_cost(150);
 8333   ins_encode %{
 8334     int opcode = this->ideal_Opcode();
 8335     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8336   %}
 8337   ins_pipe( pipe_slow );
 8338 %}
 8339 
 8340 instruct vabsnegD(vec dst, vec src) %{
 8341   match(Set dst (AbsVD  src));
 8342   match(Set dst (NegVD  src));
 8343   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8344   ins_encode %{
 8345     int opcode = this->ideal_Opcode();
 8346     uint vlen = Matcher::vector_length(this);
 8347     if (vlen == 2) {
 8348       assert(UseSSE >= 2, "required");
 8349       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8350     } else {
 8351       int vlen_enc = vector_length_encoding(this);
 8352       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8353     }
 8354   %}
 8355   ins_pipe( pipe_slow );
 8356 %}
 8357 
 8358 //------------------------------------- VectorTest --------------------------------------------
 8359 
 8360 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8361   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8362   match(Set cr (VectorTest src1 src2));
 8363   effect(TEMP vtmp);
 8364   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8365   ins_encode %{
 8366     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8367     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8368     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8369   %}
 8370   ins_pipe( pipe_slow );
 8371 %}
 8372 
 8373 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8374   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8375   match(Set cr (VectorTest src1 src2));
 8376   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8377   ins_encode %{
 8378     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8379     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8380     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8381   %}
 8382   ins_pipe( pipe_slow );
 8383 %}
 8384 
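// Mask tests for masks with fewer than 8 lanes (or exactly 8 without AVX512DQ): move the
// k register to a GPR and clear the unused high bits; "all true" then compares against
// the all-ones pattern, while "any true" relies on the flags set by the AND.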
 8385 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8386   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8387              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8388             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8389   match(Set cr (VectorTest src1 src2));
 8390   effect(TEMP tmp);
 8391   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8392   ins_encode %{
 8393     uint masklen = Matcher::vector_length(this, $src1);
 8394     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8395     __ andl($tmp$$Register, (1 << masklen) - 1);
 8396     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8397   %}
 8398   ins_pipe( pipe_slow );
 8399 %}
 8400 
 8401 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8402   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8403              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8404             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8405   match(Set cr (VectorTest src1 src2));
 8406   effect(TEMP tmp);
 8407   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8408   ins_encode %{
 8409     uint masklen = Matcher::vector_length(this, $src1);
 8410     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8411     __ andl($tmp$$Register, (1 << masklen) - 1);
 8412   %}
 8413   ins_pipe( pipe_slow );
 8414 %}
 8415 
 8416 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8417   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8418             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8419   match(Set cr (VectorTest src1 src2));
 8420   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8421   ins_encode %{
 8422     uint masklen = Matcher::vector_length(this, $src1);
 8423     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8424   %}
 8425   ins_pipe( pipe_slow );
 8426 %}
 8427 
 8428 //------------------------------------- LoadMask --------------------------------------------
 8429 
 8430 instruct loadMask(legVec dst, legVec src) %{
 8431   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8432   match(Set dst (VectorLoadMask src));
 8433   effect(TEMP dst);
 8434   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8435   ins_encode %{
 8436     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8437     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8438     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8439   %}
 8440   ins_pipe( pipe_slow );
 8441 %}
 8442 
 8443 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8444   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8445   match(Set dst (VectorLoadMask src));
 8446   effect(TEMP xtmp);
 8447   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8448   ins_encode %{
 8449     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8450                         true, Assembler::AVX_512bit);
 8451   %}
 8452   ins_pipe( pipe_slow );
 8453 %}
 8454 
 8455 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8456   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8457   match(Set dst (VectorLoadMask src));
 8458   effect(TEMP xtmp);
 8459   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8460   ins_encode %{
 8461     int vlen_enc = vector_length_encoding(in(1));
 8462     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8463                         false, vlen_enc);
 8464   %}
 8465   ins_pipe( pipe_slow );
 8466 %}
 8467 
 8468 //------------------------------------- StoreMask --------------------------------------------
 8469 
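// A vector mask holds -1 (all bits set) in true lanes; VectorStoreMask narrows each lane
// to a single byte and converts -1 to 1 (byte abs), matching the Java boolean layout.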
 8470 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8471   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8472   match(Set dst (VectorStoreMask src size));
 8473   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8474   ins_encode %{
 8475     int vlen = Matcher::vector_length(this);
 8476     if (vlen <= 16 && UseAVX <= 2) {
 8477       assert(UseSSE >= 3, "required");
 8478       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8479     } else {
 8480       assert(UseAVX > 0, "required");
 8481       int src_vlen_enc = vector_length_encoding(this, $src);
 8482       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8483     }
 8484   %}
 8485   ins_pipe( pipe_slow );
 8486 %}
 8487 
 8488 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8489   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8490   match(Set dst (VectorStoreMask src size));
 8491   effect(TEMP_DEF dst, TEMP xtmp);
 8492   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8493   ins_encode %{
 8494     int vlen_enc = Assembler::AVX_128bit;
 8495     int vlen = Matcher::vector_length(this);
 8496     if (vlen <= 8) {
 8497       assert(UseSSE >= 3, "required");
 8498       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8499       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8500       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8501     } else {
 8502       assert(UseAVX > 0, "required");
 8503       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8504       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8505       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8506     }
 8507   %}
 8508   ins_pipe( pipe_slow );
 8509 %}
 8510 
 8511 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8512   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8513   match(Set dst (VectorStoreMask src size));
 8514   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8515   effect(TEMP_DEF dst, TEMP xtmp);
 8516   ins_encode %{
 8517     int vlen_enc = Assembler::AVX_128bit;
 8518     int vlen = Matcher::vector_length(this);
 8519     if (vlen <= 4) {
 8520       assert(UseSSE >= 3, "required");
 8521       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8522       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8523       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8524       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8525     } else {
 8526       assert(UseAVX > 0, "required");
 8527       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8528       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8529       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8530       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8531       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8532     }
 8533   %}
 8534   ins_pipe( pipe_slow );
 8535 %}
 8536 
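// Long (8-byte element) masks without AVX512: collect the low dword of each qword,
// pack down to bytes, and normalize the all-ones lanes to 1; the 4-element AVX variant
// first folds the upper 128-bit lane into the lower one before packing.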
 8537 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8538   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8539   match(Set dst (VectorStoreMask src size));
 8540   effect(TEMP_DEF dst, TEMP xtmp);
 8541   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8542   ins_encode %{
 8543     assert(UseSSE >= 3, "required");
 8544     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8545     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8546     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8547     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8548     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8549   %}
 8550   ins_pipe( pipe_slow );
 8551 %}
 8552 
 8553 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8554   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8555   match(Set dst (VectorStoreMask src size));
 8556   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8557   effect(TEMP_DEF dst, TEMP vtmp);
 8558   ins_encode %{
 8559     int vlen_enc = Assembler::AVX_128bit;
 8560     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8561     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8562     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8563     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8564     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8565     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8566     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8567   %}
 8568   ins_pipe( pipe_slow );
 8569 %}
 8570 
 8571 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8572   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8573   match(Set dst (VectorStoreMask src size));
 8574   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8575   ins_encode %{
 8576     int src_vlen_enc = vector_length_encoding(this, $src);
 8577     int dst_vlen_enc = vector_length_encoding(this);
 8578     if (!VM_Version::supports_avx512vl()) {
 8579       src_vlen_enc = Assembler::AVX_512bit;
 8580     }
 8581     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8582     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8583   %}
 8584   ins_pipe( pipe_slow );
 8585 %}
 8586 
 8587 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8588   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8589   match(Set dst (VectorStoreMask src size));
 8590   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8591   ins_encode %{
 8592     int src_vlen_enc = vector_length_encoding(this, $src);
 8593     int dst_vlen_enc = vector_length_encoding(this);
 8594     if (!VM_Version::supports_avx512vl()) {
 8595       src_vlen_enc = Assembler::AVX_512bit;
 8596     }
 8597     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8598     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8599   %}
 8600   ins_pipe( pipe_slow );
 8601 %}
 8602 
 8603 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8604   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8605   match(Set dst (VectorStoreMask mask size));
 8606   effect(TEMP_DEF dst);
 8607   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8608   ins_encode %{
 8609     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8610     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8611                  false, Assembler::AVX_512bit, noreg);
 8612     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8613   %}
 8614   ins_pipe( pipe_slow );
 8615 %}
 8616 
 8617 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8618   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8619   match(Set dst (VectorStoreMask mask size));
 8620   effect(TEMP_DEF dst);
 8621   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8622   ins_encode %{
 8623     int dst_vlen_enc = vector_length_encoding(this);
 8624     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8625     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8626   %}
 8627   ins_pipe( pipe_slow );
 8628 %}
 8629 
 8630 instruct vmaskcast_evex(kReg dst) %{
 8631   match(Set dst (VectorMaskCast dst));
 8632   ins_cost(0);
 8633   format %{ "vector_mask_cast $dst" %}
 8634   ins_encode %{
 8635     // empty
 8636   %}
 8637   ins_pipe(empty);
 8638 %}
 8639 
 8640 instruct vmaskcast(vec dst) %{
 8641   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8642   match(Set dst (VectorMaskCast dst));
 8643   ins_cost(0);
 8644   format %{ "vector_mask_cast $dst" %}
 8645   ins_encode %{
 8646     // empty
 8647   %}
 8648   ins_pipe(empty);
 8649 %}
 8650 
 8651 instruct vmaskcast_avx(vec dst, vec src) %{
 8652   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8653   match(Set dst (VectorMaskCast src));
 8654   format %{ "vector_mask_cast $dst, $src" %}
 8655   ins_encode %{
 8656     int vlen = Matcher::vector_length(this);
 8657     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8658     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8659     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8660   %}
 8661   ins_pipe(pipe_slow);
 8662 %}
 8663 
 8664 //-------------------------------- Load Iota Indices ----------------------------------
 8665 
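      // VectorLoadConst with a zero immediate materializes the ascending index sequence
      // {0, 1, ..., vlen-1} from constant memory; this is the identity permutation used by
      // the shuffle and index-populating rules below.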
 8666 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8667   match(Set dst (VectorLoadConst src));
 8668   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8669   ins_encode %{
 8670      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8671      BasicType bt = Matcher::vector_element_basic_type(this);
 8672      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8673   %}
 8674   ins_pipe( pipe_slow );
 8675 %}
 8676 
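      // PopulateIndex yields {start, start+1, start+2, ...}: the scalar start value is
      // broadcast into a vector and the iota sequence is added to it.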
 8677 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8678   match(Set dst (PopulateIndex src1 src2));
 8679   effect(TEMP dst, TEMP vtmp);
 8680   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8681   ins_encode %{
 8682      assert($src2$$constant == 1, "required");
 8683      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8684      int vlen_enc = vector_length_encoding(this);
 8685      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8686      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8687      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8688      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8689   %}
 8690   ins_pipe( pipe_slow );
 8691 %}
 8692 
 8693 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8694   match(Set dst (PopulateIndex src1 src2));
 8695   effect(TEMP dst, TEMP vtmp);
 8696   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8697   ins_encode %{
 8698      assert($src2$$constant == 1, "required");
 8699      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8700      int vlen_enc = vector_length_encoding(this);
 8701      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8702      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8703      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8704      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8705   %}
 8706   ins_pipe( pipe_slow );
 8707 %}
 8708 
 8709 //-------------------------------- Rearrange ----------------------------------
 8710 
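      // VectorRearrange permutes the elements of $src according to the indices held in
      // $shuffle.  Small byte vectors fit in a single 128-bit register and can use pshufb
      // directly; wider vectors need extra work because (v)pshufb only shuffles within
      // each 128-bit lane.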
 8711 // LoadShuffle/Rearrange for Byte
 8712 instruct rearrangeB(vec dst, vec shuffle) %{
 8713   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8714             Matcher::vector_length(n) < 32);
 8715   match(Set dst (VectorRearrange dst shuffle));
 8716   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8717   ins_encode %{
 8718     assert(UseSSE >= 4, "required");
 8719     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8720   %}
 8721   ins_pipe( pipe_slow );
 8722 %}
 8723 
 8724 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8725   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8726             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8727   match(Set dst (VectorRearrange src shuffle));
 8728   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8729   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8730   ins_encode %{
 8731     assert(UseAVX >= 2, "required");
 8732     // Swap src into vtmp1
 8733     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8734     // Shuffle swapped src to get entries from other 128 bit lane
 8735     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8736     // Shuffle original src to get entries from self 128 bit lane
 8737     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8738     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8739     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8740     // Perform the blend
 8741     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8742   %}
 8743   ins_pipe( pipe_slow );
 8744 %}
 8745 
 8746 
 8747 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8748   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8749             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8750   match(Set dst (VectorRearrange src shuffle));
 8751   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8752   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8753   ins_encode %{
 8754     int vlen_enc = vector_length_encoding(this);
 8755     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8756                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8757                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8758   %}
 8759   ins_pipe( pipe_slow );
 8760 %}
 8761 
 8762 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8763   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8764             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8765   match(Set dst (VectorRearrange src shuffle));
 8766   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8767   ins_encode %{
 8768     int vlen_enc = vector_length_encoding(this);
 8769     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8770   %}
 8771   ins_pipe( pipe_slow );
 8772 %}
 8773 
 8774 // LoadShuffle/Rearrange for Short
 8775 
 8776 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8777   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8778             !VM_Version::supports_avx512bw());
 8779   match(Set dst (VectorLoadShuffle src));
 8780   effect(TEMP dst, TEMP vtmp);
 8781   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8782   ins_encode %{
 8783     // Create a byte shuffle mask from the short shuffle mask;
 8784     // only a byte shuffle instruction (pshufb) is available on these platforms.
 8785     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8786     if (UseAVX == 0) {
 8787       assert(vlen_in_bytes <= 16, "required");
 8788       // Multiply each shuffle by two to get byte index
 8789       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8790       __ psllw($vtmp$$XMMRegister, 1);
 8791 
 8792       // Duplicate to create 2 copies of byte index
 8793       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8794       __ psllw($dst$$XMMRegister, 8);
 8795       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8796 
 8797       // Add one to get alternate byte index
 8798       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8799       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8800     } else {
 8801       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8802       int vlen_enc = vector_length_encoding(this);
 8803       // Multiply each shuffle by two to get byte index
 8804       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8805 
 8806       // Duplicate to create 2 copies of byte index
 8807       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8808       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8809 
 8810       // Add one to get alternate byte index
 8811       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8812     }
 8813   %}
 8814   ins_pipe( pipe_slow );
 8815 %}
 8816 
 8817 instruct rearrangeS(vec dst, vec shuffle) %{
 8818   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8819             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8820   match(Set dst (VectorRearrange dst shuffle));
 8821   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8822   ins_encode %{
 8823     assert(UseSSE >= 4, "required");
 8824     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8825   %}
 8826   ins_pipe( pipe_slow );
 8827 %}
 8828 
 8829 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8830   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8831             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8832   match(Set dst (VectorRearrange src shuffle));
 8833   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8834   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8835   ins_encode %{
 8836     assert(UseAVX >= 2, "required");
 8837     // Swap src into vtmp1
 8838     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8839     // Shuffle swapped src to get entries from other 128 bit lane
 8840     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8841     // Shuffle original src to get entries from self 128 bit lane
 8842     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8843     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8844     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8845     // Perform the blend
 8846     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8847   %}
 8848   ins_pipe( pipe_slow );
 8849 %}
 8850 
 8851 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8852   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8853             VM_Version::supports_avx512bw());
 8854   match(Set dst (VectorRearrange src shuffle));
 8855   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8856   ins_encode %{
 8857     int vlen_enc = vector_length_encoding(this);
 8858     if (!VM_Version::supports_avx512vl()) {
 8859       vlen_enc = Assembler::AVX_512bit;
 8860     }
 8861     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8862   %}
 8863   ins_pipe( pipe_slow );
 8864 %}
 8865 
 8866 // LoadShuffle/Rearrange for Integer and Float
 8867 
 8868 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8869   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8870             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8871   match(Set dst (VectorLoadShuffle src));
 8872   effect(TEMP dst, TEMP vtmp);
 8873   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8874   ins_encode %{
 8875     assert(UseSSE >= 4, "required");
 8876 
 8877     // Create a byte shuffle mask from the int shuffle mask;
 8878     // only a byte shuffle instruction (pshufb) is available on these platforms.
 8879 
 8880     // Duplicate and multiply each shuffle by 4
 8881     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8882     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8883     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8884     __ psllw($vtmp$$XMMRegister, 2);
 8885 
 8886     // Duplicate again to create 4 copies of byte index
 8887     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8888     __ psllw($dst$$XMMRegister, 8);
 8889     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8890 
 8891     // Add 3,2,1,0 to get alternate byte index
 8892     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8893     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8894   %}
 8895   ins_pipe( pipe_slow );
 8896 %}
 8897 
 8898 instruct rearrangeI(vec dst, vec shuffle) %{
 8899   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8900             UseAVX == 0);
 8901   match(Set dst (VectorRearrange dst shuffle));
 8902   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8903   ins_encode %{
 8904     assert(UseSSE >= 4, "required");
 8905     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8906   %}
 8907   ins_pipe( pipe_slow );
 8908 %}
 8909 
 8910 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8911   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8912             UseAVX > 0);
 8913   match(Set dst (VectorRearrange src shuffle));
 8914   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8915   ins_encode %{
 8916     int vlen_enc = vector_length_encoding(this);
 8917     BasicType bt = Matcher::vector_element_basic_type(this);
 8918     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8919   %}
 8920   ins_pipe( pipe_slow );
 8921 %}
 8922 
 8923 // LoadShuffle/Rearrange for Long and Double
 8924 
 8925 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8926   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8927             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8928   match(Set dst (VectorLoadShuffle src));
 8929   effect(TEMP dst, TEMP vtmp);
 8930   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8931   ins_encode %{
 8932     assert(UseAVX >= 2, "required");
 8933 
 8934     int vlen_enc = vector_length_encoding(this);
 8935     // Create a double word shuffle mask from the long shuffle mask;
 8936     // only a double word shuffle instruction (vpermd) is available on these platforms.
 8937 
 8938     // Multiply each shuffle by two to get double word index
 8939     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8940 
 8941     // Duplicate each double word shuffle
 8942     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8943     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8944 
 8945     // Add one to get alternate double word index
 8946     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8947   %}
 8948   ins_pipe( pipe_slow );
 8949 %}
 8950 
 8951 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8952   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8953             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8954   match(Set dst (VectorRearrange src shuffle));
 8955   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8956   ins_encode %{
 8957     assert(UseAVX >= 2, "required");
 8958 
 8959     int vlen_enc = vector_length_encoding(this);
 8960     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8961   %}
 8962   ins_pipe( pipe_slow );
 8963 %}
 8964 
 8965 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8966   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8967             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8968   match(Set dst (VectorRearrange src shuffle));
 8969   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8970   ins_encode %{
 8971     assert(UseAVX > 2, "required");
 8972 
 8973     int vlen_enc = vector_length_encoding(this);
 8974     if (vlen_enc == Assembler::AVX_128bit) {
 8975       vlen_enc = Assembler::AVX_256bit;
 8976     }
 8977     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8978   %}
 8979   ins_pipe( pipe_slow );
 8980 %}
 8981 
 8982 // --------------------------------- FMA --------------------------------------
 8983 // a * b + c
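      // Note: the FMA rules write the result back into the addend register ($c); they rely
      // on UseFMA being enabled (asserted in the encodings below).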
 8984 
 8985 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8986   match(Set c (FmaVF  c (Binary a b)));
 8987   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8988   ins_cost(150);
 8989   ins_encode %{
 8990     assert(UseFMA, "not enabled");
 8991     int vlen_enc = vector_length_encoding(this);
 8992     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8993   %}
 8994   ins_pipe( pipe_slow );
 8995 %}
 8996 
 8997 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8998   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8999   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9000   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9001   ins_cost(150);
 9002   ins_encode %{
 9003     assert(UseFMA, "not enabled");
 9004     int vlen_enc = vector_length_encoding(this);
 9005     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9006   %}
 9007   ins_pipe( pipe_slow );
 9008 %}
 9009 
 9010 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9011   match(Set c (FmaVD  c (Binary a b)));
 9012   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9013   ins_cost(150);
 9014   ins_encode %{
 9015     assert(UseFMA, "not enabled");
 9016     int vlen_enc = vector_length_encoding(this);
 9017     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9018   %}
 9019   ins_pipe( pipe_slow );
 9020 %}
 9021 
 9022 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9023   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9024   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9025   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9026   ins_cost(150);
 9027   ins_encode %{
 9028     assert(UseFMA, "not enabled");
 9029     int vlen_enc = vector_length_encoding(this);
 9030     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9031   %}
 9032   ins_pipe( pipe_slow );
 9033 %}
 9034 
 9035 // --------------------------------- Vector Multiply Add --------------------------------------
 9036 
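      // MulAddVS2VI maps to (v)pmaddwd: multiply adjacent pairs of signed 16-bit elements
      // and add each pair into a signed 32-bit result.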
 9037 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9038   predicate(UseAVX == 0);
 9039   match(Set dst (MulAddVS2VI dst src1));
 9040   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9041   ins_encode %{
 9042     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9043   %}
 9044   ins_pipe( pipe_slow );
 9045 %}
 9046 
 9047 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9048   predicate(UseAVX > 0);
 9049   match(Set dst (MulAddVS2VI src1 src2));
 9050   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9051   ins_encode %{
 9052     int vlen_enc = vector_length_encoding(this);
 9053     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9054   %}
 9055   ins_pipe( pipe_slow );
 9056 %}
 9057 
 9058 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9059 
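      // With AVX512-VNNI, vpdpwssd fuses the multiply-add and the subsequent vector add
      // into a single instruction that accumulates into $dst; the low ins_cost makes the
      // matcher prefer this rule over the two-instruction sequence.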
 9060 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9061   predicate(VM_Version::supports_avx512_vnni());
 9062   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9063   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9064   ins_encode %{
 9065     assert(UseAVX > 2, "required");
 9066     int vlen_enc = vector_length_encoding(this);
 9067     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9068   %}
 9069   ins_pipe( pipe_slow );
 9070   ins_cost(10);
 9071 %}
 9072 
 9073 // --------------------------------- PopCount --------------------------------------
 9074 
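      // The EVEX rules use the vector popcount extensions selected by
      // is_vector_popcount_predicate (AVX512-VPOPCNTDQ / AVX512-BITALG); the AVX fallback
      // below emulates the count in the macro-assembler, typically via a pshufb-based
      // nibble lookup.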
 9075 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9076   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9077   match(Set dst (PopCountVI src));
 9078   match(Set dst (PopCountVL src));
 9079   format %{ "vector_popcount_integral $dst, $src" %}
 9080   ins_encode %{
 9081     int opcode = this->ideal_Opcode();
 9082     int vlen_enc = vector_length_encoding(this, $src);
 9083     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9084     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9085   %}
 9086   ins_pipe( pipe_slow );
 9087 %}
 9088 
 9089 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9090   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9091   match(Set dst (PopCountVI src mask));
 9092   match(Set dst (PopCountVL src mask));
 9093   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9094   ins_encode %{
 9095     int vlen_enc = vector_length_encoding(this, $src);
 9096     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9097     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9098     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9099   %}
 9100   ins_pipe( pipe_slow );
 9101 %}
 9102 
 9103 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9104   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9105   match(Set dst (PopCountVI src));
 9106   match(Set dst (PopCountVL src));
 9107   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9108   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9109   ins_encode %{
 9110     int opcode = this->ideal_Opcode();
 9111     int vlen_enc = vector_length_encoding(this, $src);
 9112     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9113     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9114                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9115   %}
 9116   ins_pipe( pipe_slow );
 9117 %}
 9118 
 9119 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9120 
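      // x86 has no vector tzcnt instruction, so the macro-assembler helpers compute the
      // count indirectly (in essence from the leading-zero count of the isolated lowest
      // set bit, x & -x).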
 9121 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9122   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9123                                               Matcher::vector_length_in_bytes(n->in(1))));
 9124   match(Set dst (CountTrailingZerosV src));
 9125   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9126   ins_cost(400);
 9127   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9128   ins_encode %{
 9129     int vlen_enc = vector_length_encoding(this, $src);
 9130     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9131     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9132                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9133   %}
 9134   ins_pipe( pipe_slow );
 9135 %}
 9136 
 9137 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9138   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9139             VM_Version::supports_avx512cd() &&
 9140             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9141   match(Set dst (CountTrailingZerosV src));
 9142   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9143   ins_cost(400);
 9144   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9145   ins_encode %{
 9146     int vlen_enc = vector_length_encoding(this, $src);
 9147     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9148     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9149                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9150   %}
 9151   ins_pipe( pipe_slow );
 9152 %}
 9153 
 9154 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9155   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9156   match(Set dst (CountTrailingZerosV src));
 9157   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9158   ins_cost(400);
 9159   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9160   ins_encode %{
 9161     int vlen_enc = vector_length_encoding(this, $src);
 9162     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9163     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9164                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9165                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9166   %}
 9167   ins_pipe( pipe_slow );
 9168 %}
 9169 
 9170 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9171   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9172   match(Set dst (CountTrailingZerosV src));
 9173   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9174   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9175   ins_encode %{
 9176     int vlen_enc = vector_length_encoding(this, $src);
 9177     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9178     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9179                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9180   %}
 9181   ins_pipe( pipe_slow );
 9182 %}
 9183 
 9184 
 9185 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9186 
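      // vpternlogd evaluates an arbitrary three-input boolean function bitwise; the 8-bit
      // immediate $func is the truth table that selects which function is applied.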
 9187 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9188   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9189   effect(TEMP dst);
 9190   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9191   ins_encode %{
 9192     int vector_len = vector_length_encoding(this);
 9193     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9194   %}
 9195   ins_pipe( pipe_slow );
 9196 %}
 9197 
 9198 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9199   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9200   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9201   effect(TEMP dst);
 9202   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9203   ins_encode %{
 9204     int vector_len = vector_length_encoding(this);
 9205     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9206   %}
 9207   ins_pipe( pipe_slow );
 9208 %}
 9209 
 9210 // --------------------------------- Rotation Operations ----------------------------------
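      // Rotates use the AVX-512 rotate instructions (vprold/vprolq and friends) when they
      // are available; otherwise the macro-assembler helpers compose the rotate from a
      // pair of shifts and an OR.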
 9211 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9212   match(Set dst (RotateLeftV src shift));
 9213   match(Set dst (RotateRightV src shift));
 9214   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9215   ins_encode %{
 9216     int opcode      = this->ideal_Opcode();
 9217     int vector_len  = vector_length_encoding(this);
 9218     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9219     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9220   %}
 9221   ins_pipe( pipe_slow );
 9222 %}
 9223 
 9224 instruct vprotate_var(vec dst, vec src, vec shift) %{
 9225   match(Set dst (RotateLeftV src shift));
 9226   match(Set dst (RotateRightV src shift));
 9227   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9228   ins_encode %{
 9229     int opcode      = this->ideal_Opcode();
 9230     int vector_len  = vector_length_encoding(this);
 9231     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9232     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9233   %}
 9234   ins_pipe( pipe_slow );
 9235 %}
 9236 
 9237 // ---------------------------------- Masked Operations ------------------------------------
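      // Masked loads and stores come in two flavours: with plain AVX the mask lives in a
      // vector register and vmaskmov-style instructions are used, while with AVX-512 the
      // mask is a k register and the access uses EVEX masking.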
 9238 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9239   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9240   match(Set dst (LoadVectorMasked mem mask));
 9241   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9242   ins_encode %{
 9243     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9244     int vlen_enc = vector_length_encoding(this);
 9245     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9246   %}
 9247   ins_pipe( pipe_slow );
 9248 %}
 9249 
 9250 
 9251 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9252   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9253   match(Set dst (LoadVectorMasked mem mask));
 9254   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9255   ins_encode %{
 9256     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9257     int vector_len = vector_length_encoding(this);
 9258     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9259   %}
 9260   ins_pipe( pipe_slow );
 9261 %}
 9262 
 9263 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9264   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9265   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9266   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9267   ins_encode %{
 9268     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9269     int vlen_enc = vector_length_encoding(src_node);
 9270     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9271     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9272   %}
 9273   ins_pipe( pipe_slow );
 9274 %}
 9275 
 9276 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9277   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9278   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9279   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9280   ins_encode %{
 9281     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9282     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9283     int vlen_enc = vector_length_encoding(src_node);
 9284     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9285   %}
 9286   ins_pipe( pipe_slow );
 9287 %}
 9288 
 9289 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9290   match(Set addr (VerifyVectorAlignment addr mask));
 9291   effect(KILL cr);
 9292   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9293   ins_encode %{
 9294     Label Lskip;
 9295     // Check that the masked bits of addr are zero, i.e. the address is suitably aligned.
 9296     __ testq($addr$$Register, $mask$$constant);
 9297     __ jccb(Assembler::equal, Lskip);
 9298     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9299     __ bind(Lskip);
 9300   %}
 9301   ins_pipe(pipe_slow);
 9302 %}
 9303 
 9304 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9305   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9306   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9307   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9308   ins_encode %{
 9309     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9310     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9311 
 9312     Label DONE;
 9313     int vlen_enc = vector_length_encoding(this, $src1);
 9314     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9315 
 9316     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9317     __ mov64($dst$$Register, -1L);
 9318     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9319     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9320     __ jccb(Assembler::carrySet, DONE);
 9321     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9322     __ notq($dst$$Register);
 9323     __ tzcntq($dst$$Register, $dst$$Register);
 9324     __ bind(DONE);
 9325   %}
 9326   ins_pipe( pipe_slow );
 9327 %}
 9328 
 9329 
 9330 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9331   match(Set dst (VectorMaskGen len));
 9332   effect(TEMP temp, KILL cr);
 9333   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9334   ins_encode %{
 9335     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9336   %}
 9337   ins_pipe( pipe_slow );
 9338 %}
 9339 
 9340 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9341   match(Set dst (VectorMaskGen len));
 9342   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9343   effect(TEMP temp);
 9344   ins_encode %{
 9345     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9346     __ kmovql($dst$$KRegister, $temp$$Register);
 9347   %}
 9348   ins_pipe( pipe_slow );
 9349 %}
 9350 
 9351 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9352   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9353   match(Set dst (VectorMaskToLong mask));
 9354   effect(TEMP dst, KILL cr);
 9355   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9356   ins_encode %{
 9357     int opcode = this->ideal_Opcode();
 9358     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9359     int mask_len = Matcher::vector_length(this, $mask);
 9360     int mask_size = mask_len * type2aelembytes(mbt);
 9361     int vlen_enc = vector_length_encoding(this, $mask);
 9362     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9363                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9364   %}
 9365   ins_pipe( pipe_slow );
 9366 %}
 9367 
 9368 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9369   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9370   match(Set dst (VectorMaskToLong mask));
 9371   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9372   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9373   ins_encode %{
 9374     int opcode = this->ideal_Opcode();
 9375     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9376     int mask_len = Matcher::vector_length(this, $mask);
 9377     int vlen_enc = vector_length_encoding(this, $mask);
 9378     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9379                              $dst$$Register, mask_len, mbt, vlen_enc);
 9380   %}
 9381   ins_pipe( pipe_slow );
 9382 %}
 9383 
 9384 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9385   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9386   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9387   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9388   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9389   ins_encode %{
 9390     int opcode = this->ideal_Opcode();
 9391     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9392     int mask_len = Matcher::vector_length(this, $mask);
 9393     int vlen_enc = vector_length_encoding(this, $mask);
 9394     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9395                              $dst$$Register, mask_len, mbt, vlen_enc);
 9396   %}
 9397   ins_pipe( pipe_slow );
 9398 %}
 9399 
 9400 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9401   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9402   match(Set dst (VectorMaskTrueCount mask));
 9403   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9404   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9405   ins_encode %{
 9406     int opcode = this->ideal_Opcode();
 9407     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9408     int mask_len = Matcher::vector_length(this, $mask);
 9409     int mask_size = mask_len * type2aelembytes(mbt);
 9410     int vlen_enc = vector_length_encoding(this, $mask);
 9411     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9412                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9413   %}
 9414   ins_pipe( pipe_slow );
 9415 %}
 9416 
 9417 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9418   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9419   match(Set dst (VectorMaskTrueCount mask));
 9420   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9421   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9422   ins_encode %{
 9423     int opcode = this->ideal_Opcode();
 9424     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9425     int mask_len = Matcher::vector_length(this, $mask);
 9426     int vlen_enc = vector_length_encoding(this, $mask);
 9427     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9428                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9429   %}
 9430   ins_pipe( pipe_slow );
 9431 %}
 9432 
 9433 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9434   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9435   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9436   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9437   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9438   ins_encode %{
 9439     int opcode = this->ideal_Opcode();
 9440     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9441     int mask_len = Matcher::vector_length(this, $mask);
 9442     int vlen_enc = vector_length_encoding(this, $mask);
 9443     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9444                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9445   %}
 9446   ins_pipe( pipe_slow );
 9447 %}
 9448 
 9449 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9450   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9451   match(Set dst (VectorMaskFirstTrue mask));
 9452   match(Set dst (VectorMaskLastTrue mask));
 9453   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9454   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9455   ins_encode %{
 9456     int opcode = this->ideal_Opcode();
 9457     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9458     int mask_len = Matcher::vector_length(this, $mask);
 9459     int mask_size = mask_len * type2aelembytes(mbt);
 9460     int vlen_enc = vector_length_encoding(this, $mask);
 9461     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9462                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9463   %}
 9464   ins_pipe( pipe_slow );
 9465 %}
 9466 
 9467 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9468   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9469   match(Set dst (VectorMaskFirstTrue mask));
 9470   match(Set dst (VectorMaskLastTrue mask));
 9471   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9472   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9473   ins_encode %{
 9474     int opcode = this->ideal_Opcode();
 9475     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9476     int mask_len = Matcher::vector_length(this, $mask);
 9477     int vlen_enc = vector_length_encoding(this, $mask);
 9478     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9479                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9480   %}
 9481   ins_pipe( pipe_slow );
 9482 %}
 9483 
 9484 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9485   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9486   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9487   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9488   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9489   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9490   ins_encode %{
 9491     int opcode = this->ideal_Opcode();
 9492     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9493     int mask_len = Matcher::vector_length(this, $mask);
 9494     int vlen_enc = vector_length_encoding(this, $mask);
 9495     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9496                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9497   %}
 9498   ins_pipe( pipe_slow );
 9499 %}
 9500 
 9501 // --------------------------------- Compress/Expand Operations ---------------------------
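      // CompressV packs the elements selected by the mask into the low-order lanes of $dst
      // (vpcompress*); ExpandV performs the inverse scatter (vpexpand*).  The AVX2 fallback
      // emulates this with a permutation computed into the $perm temporary.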
 9502 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9503   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9504   match(Set dst (CompressV src mask));
 9505   match(Set dst (ExpandV src mask));
 9506   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9507   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9508   ins_encode %{
 9509     int opcode = this->ideal_Opcode();
 9510     int vlen_enc = vector_length_encoding(this);
 9511     BasicType bt  = Matcher::vector_element_basic_type(this);
 9512     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9513                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9514   %}
 9515   ins_pipe( pipe_slow );
 9516 %}
 9517 
 9518 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9519   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9520   match(Set dst (CompressV src mask));
 9521   match(Set dst (ExpandV src mask));
 9522   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9523   ins_encode %{
 9524     int opcode = this->ideal_Opcode();
 9525     int vector_len = vector_length_encoding(this);
 9526     BasicType bt  = Matcher::vector_element_basic_type(this);
 9527     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9528   %}
 9529   ins_pipe( pipe_slow );
 9530 %}
 9531 
 9532 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9533   match(Set dst (CompressM mask));
 9534   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9535   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9536   ins_encode %{
 9537     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9538     int mask_len = Matcher::vector_length(this);
 9539     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9540   %}
 9541   ins_pipe( pipe_slow );
 9542 %}
 9543 
 9544 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9545 
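      // With GFNI, reversing the bits within each byte is a single gf2p8affineqb against
      // the bit-reversal matrix 0x8040201008040201 (loaded below); without GFNI the
      // reversal is built from a longer shift-and-mask sequence in the macro-assembler.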
 9546 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9547   predicate(!VM_Version::supports_gfni());
 9548   match(Set dst (ReverseV src));
 9549   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9550   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9551   ins_encode %{
 9552     int vec_enc = vector_length_encoding(this);
 9553     BasicType bt = Matcher::vector_element_basic_type(this);
 9554     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9555                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9556   %}
 9557   ins_pipe( pipe_slow );
 9558 %}
 9559 
 9560 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9561   predicate(VM_Version::supports_gfni());
 9562   match(Set dst (ReverseV src));
 9563   effect(TEMP dst, TEMP xtmp);
 9564   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9565   ins_encode %{
 9566     int vec_enc = vector_length_encoding(this);
 9567     BasicType bt  = Matcher::vector_element_basic_type(this);
 9568     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9569     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9570                                $xtmp$$XMMRegister);
 9571   %}
 9572   ins_pipe( pipe_slow );
 9573 %}
 9574 
 9575 instruct vreverse_byte_reg(vec dst, vec src) %{
 9576   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9577   match(Set dst (ReverseBytesV src));
 9578   effect(TEMP dst);
 9579   format %{ "vector_reverse_byte $dst, $src" %}
 9580   ins_encode %{
 9581     int vec_enc = vector_length_encoding(this);
 9582     BasicType bt = Matcher::vector_element_basic_type(this);
 9583     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9584   %}
 9585   ins_pipe( pipe_slow );
 9586 %}
 9587 
 9588 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9589   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9590   match(Set dst (ReverseBytesV src));
 9591   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9592   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9593   ins_encode %{
 9594     int vec_enc = vector_length_encoding(this);
 9595     BasicType bt = Matcher::vector_element_basic_type(this);
 9596     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9597                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9598   %}
 9599   ins_pipe( pipe_slow );
 9600 %}
 9601 
 9602 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9603 
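      // AVX512CD provides vplzcntd/vplzcntq for int and long elements directly; other
      // element sizes and pre-AVX-512 targets go through the emulation helpers below.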
 9604 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9605   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9606                                               Matcher::vector_length_in_bytes(n->in(1))));
 9607   match(Set dst (CountLeadingZerosV src));
 9608   format %{ "vector_count_leading_zeros $dst, $src" %}
 9609   ins_encode %{
 9610      int vlen_enc = vector_length_encoding(this, $src);
 9611      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9612      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9613                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9614   %}
 9615   ins_pipe( pipe_slow );
 9616 %}
 9617 
 9618 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9619   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9620                                               Matcher::vector_length_in_bytes(n->in(1))));
 9621   match(Set dst (CountLeadingZerosV src mask));
 9622   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9623   ins_encode %{
 9624     int vlen_enc = vector_length_encoding(this, $src);
 9625     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9626     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9627     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9628                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9629   %}
 9630   ins_pipe( pipe_slow );
 9631 %}
 9632 
 9633 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9634   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9635             VM_Version::supports_avx512cd() &&
 9636             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9637   match(Set dst (CountLeadingZerosV src));
 9638   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9639   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9640   ins_encode %{
 9641     int vlen_enc = vector_length_encoding(this, $src);
 9642     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9643     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9644                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9650   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9651   match(Set dst (CountLeadingZerosV src));
 9652   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9653   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9654   ins_encode %{
 9655     int vlen_enc = vector_length_encoding(this, $src);
 9656     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9657     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9658                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9659                                        $rtmp$$Register, true, vlen_enc);
 9660   %}
 9661   ins_pipe( pipe_slow );
 9662 %}
 9663 
 9664 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9665   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9666             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9667   match(Set dst (CountLeadingZerosV src));
 9668   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9669   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9670   ins_encode %{
 9671     int vlen_enc = vector_length_encoding(this, $src);
 9672     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9673     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9674                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9675   %}
 9676   ins_pipe( pipe_slow );
 9677 %}
 9678 
 9679 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9680   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9681             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9682   match(Set dst (CountLeadingZerosV src));
 9683   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9684   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9685   ins_encode %{
 9686     int vlen_enc = vector_length_encoding(this, $src);
 9687     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9688     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9689                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9690   %}
 9691   ins_pipe( pipe_slow );
 9692 %}
 9693 
 9694 // ---------------------------------- Vector Masked Operations ------------------------------------
 9695 
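      // In the masked instructions below the destination vector also serves as the first source,
      // and the kReg operand supplies the per-lane predicate. evmasked_op lowers each ideal opcode
      // to the corresponding AVX-512 instruction; the boolean argument requests merge-masking, so
      // lanes whose mask bit is clear keep their previous destination value.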
 9696 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9697   match(Set dst (AddVB (Binary dst src2) mask));
 9698   match(Set dst (AddVS (Binary dst src2) mask));
 9699   match(Set dst (AddVI (Binary dst src2) mask));
 9700   match(Set dst (AddVL (Binary dst src2) mask));
 9701   match(Set dst (AddVF (Binary dst src2) mask));
 9702   match(Set dst (AddVD (Binary dst src2) mask));
 9703   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9704   ins_encode %{
 9705     int vlen_enc = vector_length_encoding(this);
 9706     BasicType bt = Matcher::vector_element_basic_type(this);
 9707     int opc = this->ideal_Opcode();
 9708     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9709                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9710   %}
 9711   ins_pipe( pipe_slow );
 9712 %}
 9713 
 9714 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9715   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9716   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9717   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9718   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9719   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9720   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9721   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9722   ins_encode %{
 9723     int vlen_enc = vector_length_encoding(this);
 9724     BasicType bt = Matcher::vector_element_basic_type(this);
 9725     int opc = this->ideal_Opcode();
 9726     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9727                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9728   %}
 9729   ins_pipe( pipe_slow );
 9730 %}
 9731 
 9732 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9733   match(Set dst (XorV (Binary dst src2) mask));
 9734   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9735   ins_encode %{
 9736     int vlen_enc = vector_length_encoding(this);
 9737     BasicType bt = Matcher::vector_element_basic_type(this);
 9738     int opc = this->ideal_Opcode();
 9739     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9740                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9741   %}
 9742   ins_pipe( pipe_slow );
 9743 %}
 9744 
 9745 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9746   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9747   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9748   ins_encode %{
 9749     int vlen_enc = vector_length_encoding(this);
 9750     BasicType bt = Matcher::vector_element_basic_type(this);
 9751     int opc = this->ideal_Opcode();
 9752     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9753                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9754   %}
 9755   ins_pipe( pipe_slow );
 9756 %}
 9757 
 9758 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9759   match(Set dst (OrV (Binary dst src2) mask));
 9760   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9761   ins_encode %{
 9762     int vlen_enc = vector_length_encoding(this);
 9763     BasicType bt = Matcher::vector_element_basic_type(this);
 9764     int opc = this->ideal_Opcode();
 9765     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9766                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9767   %}
 9768   ins_pipe( pipe_slow );
 9769 %}
 9770 
 9771 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9772   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9773   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9774   ins_encode %{
 9775     int vlen_enc = vector_length_encoding(this);
 9776     BasicType bt = Matcher::vector_element_basic_type(this);
 9777     int opc = this->ideal_Opcode();
 9778     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9779                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9780   %}
 9781   ins_pipe( pipe_slow );
 9782 %}
 9783 
 9784 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9785   match(Set dst (AndV (Binary dst src2) mask));
 9786   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9787   ins_encode %{
 9788     int vlen_enc = vector_length_encoding(this);
 9789     BasicType bt = Matcher::vector_element_basic_type(this);
 9790     int opc = this->ideal_Opcode();
 9791     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9792                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9793   %}
 9794   ins_pipe( pipe_slow );
 9795 %}
 9796 
 9797 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9798   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9799   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9800   ins_encode %{
 9801     int vlen_enc = vector_length_encoding(this);
 9802     BasicType bt = Matcher::vector_element_basic_type(this);
 9803     int opc = this->ideal_Opcode();
 9804     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9805                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9806   %}
 9807   ins_pipe( pipe_slow );
 9808 %}
 9809 
 9810 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9811   match(Set dst (SubVB (Binary dst src2) mask));
 9812   match(Set dst (SubVS (Binary dst src2) mask));
 9813   match(Set dst (SubVI (Binary dst src2) mask));
 9814   match(Set dst (SubVL (Binary dst src2) mask));
 9815   match(Set dst (SubVF (Binary dst src2) mask));
 9816   match(Set dst (SubVD (Binary dst src2) mask));
 9817   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9818   ins_encode %{
 9819     int vlen_enc = vector_length_encoding(this);
 9820     BasicType bt = Matcher::vector_element_basic_type(this);
 9821     int opc = this->ideal_Opcode();
 9822     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9823                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9824   %}
 9825   ins_pipe( pipe_slow );
 9826 %}
 9827 
 9828 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9829   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9830   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9831   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9832   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9833   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9834   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9835   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9836   ins_encode %{
 9837     int vlen_enc = vector_length_encoding(this);
 9838     BasicType bt = Matcher::vector_element_basic_type(this);
 9839     int opc = this->ideal_Opcode();
 9840     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9841                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9842   %}
 9843   ins_pipe( pipe_slow );
 9844 %}
 9845 
 9846 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9847   match(Set dst (MulVS (Binary dst src2) mask));
 9848   match(Set dst (MulVI (Binary dst src2) mask));
 9849   match(Set dst (MulVL (Binary dst src2) mask));
 9850   match(Set dst (MulVF (Binary dst src2) mask));
 9851   match(Set dst (MulVD (Binary dst src2) mask));
 9852   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9853   ins_encode %{
 9854     int vlen_enc = vector_length_encoding(this);
 9855     BasicType bt = Matcher::vector_element_basic_type(this);
 9856     int opc = this->ideal_Opcode();
 9857     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9858                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9859   %}
 9860   ins_pipe( pipe_slow );
 9861 %}
 9862 
 9863 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9864   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9865   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9866   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9867   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9868   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9869   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9870   ins_encode %{
 9871     int vlen_enc = vector_length_encoding(this);
 9872     BasicType bt = Matcher::vector_element_basic_type(this);
 9873     int opc = this->ideal_Opcode();
 9874     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9875                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9876   %}
 9877   ins_pipe( pipe_slow );
 9878 %}
 9879 
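      // Unary masked operations (e.g. sqrt, abs) pass $dst as both source arguments to evmasked_op.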
 9880 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9881   match(Set dst (SqrtVF dst mask));
 9882   match(Set dst (SqrtVD dst mask));
 9883   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9884   ins_encode %{
 9885     int vlen_enc = vector_length_encoding(this);
 9886     BasicType bt = Matcher::vector_element_basic_type(this);
 9887     int opc = this->ideal_Opcode();
 9888     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9889                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9890   %}
 9891   ins_pipe( pipe_slow );
 9892 %}
 9893 
 9894 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9895   match(Set dst (DivVF (Binary dst src2) mask));
 9896   match(Set dst (DivVD (Binary dst src2) mask));
 9897   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9898   ins_encode %{
 9899     int vlen_enc = vector_length_encoding(this);
 9900     BasicType bt = Matcher::vector_element_basic_type(this);
 9901     int opc = this->ideal_Opcode();
 9902     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9903                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9904   %}
 9905   ins_pipe( pipe_slow );
 9906 %}
 9907 
 9908 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9909   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9910   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9911   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9912   ins_encode %{
 9913     int vlen_enc = vector_length_encoding(this);
 9914     BasicType bt = Matcher::vector_element_basic_type(this);
 9915     int opc = this->ideal_Opcode();
 9916     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9917                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9918   %}
 9919   ins_pipe( pipe_slow );
 9920 %}
 9921 
 9923 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9924   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9925   match(Set dst (RotateRightV (Binary dst shift) mask));
 9926   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9927   ins_encode %{
 9928     int vlen_enc = vector_length_encoding(this);
 9929     BasicType bt = Matcher::vector_element_basic_type(this);
 9930     int opc = this->ideal_Opcode();
 9931     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9932                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9933   %}
 9934   ins_pipe( pipe_slow );
 9935 %}
 9936 
 9937 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9938   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9939   match(Set dst (RotateRightV (Binary dst src2) mask));
 9940   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9941   ins_encode %{
 9942     int vlen_enc = vector_length_encoding(this);
 9943     BasicType bt = Matcher::vector_element_basic_type(this);
 9944     int opc = this->ideal_Opcode();
 9945     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9946                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9947   %}
 9948   ins_pipe( pipe_slow );
 9949 %}
 9950 
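      // Masked shifts come in three flavors: an immediate count, a broadcast count held in a vector
      // register, and a per-lane variable count. The predicates split on is_var_shift(), and the
      // trailing boolean passed to evmasked_op selects the variable-shift encoding.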
 9951 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9952   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9953   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9954   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9955   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9956   ins_encode %{
 9957     int vlen_enc = vector_length_encoding(this);
 9958     BasicType bt = Matcher::vector_element_basic_type(this);
 9959     int opc = this->ideal_Opcode();
 9960     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9961                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9962   %}
 9963   ins_pipe( pipe_slow );
 9964 %}
 9965 
 9966 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9967   predicate(!n->as_ShiftV()->is_var_shift());
 9968   match(Set dst (LShiftVS (Binary dst src2) mask));
 9969   match(Set dst (LShiftVI (Binary dst src2) mask));
 9970   match(Set dst (LShiftVL (Binary dst src2) mask));
 9971   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9972   ins_encode %{
 9973     int vlen_enc = vector_length_encoding(this);
 9974     BasicType bt = Matcher::vector_element_basic_type(this);
 9975     int opc = this->ideal_Opcode();
 9976     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9977                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9978   %}
 9979   ins_pipe( pipe_slow );
 9980 %}
 9981 
 9982 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9983   predicate(n->as_ShiftV()->is_var_shift());
 9984   match(Set dst (LShiftVS (Binary dst src2) mask));
 9985   match(Set dst (LShiftVI (Binary dst src2) mask));
 9986   match(Set dst (LShiftVL (Binary dst src2) mask));
 9987   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9988   ins_encode %{
 9989     int vlen_enc = vector_length_encoding(this);
 9990     BasicType bt = Matcher::vector_element_basic_type(this);
 9991     int opc = this->ideal_Opcode();
 9992     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9993                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9994   %}
 9995   ins_pipe( pipe_slow );
 9996 %}
 9997 
 9998 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9999   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10000   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10001   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10002   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10003   ins_encode %{
10004     int vlen_enc = vector_length_encoding(this);
10005     BasicType bt = Matcher::vector_element_basic_type(this);
10006     int opc = this->ideal_Opcode();
10007     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10008                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10009   %}
10010   ins_pipe( pipe_slow );
10011 %}
10012 
10013 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10014   predicate(!n->as_ShiftV()->is_var_shift());
10015   match(Set dst (RShiftVS (Binary dst src2) mask));
10016   match(Set dst (RShiftVI (Binary dst src2) mask));
10017   match(Set dst (RShiftVL (Binary dst src2) mask));
10018   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10019   ins_encode %{
10020     int vlen_enc = vector_length_encoding(this);
10021     BasicType bt = Matcher::vector_element_basic_type(this);
10022     int opc = this->ideal_Opcode();
10023     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10024                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10025   %}
10026   ins_pipe( pipe_slow );
10027 %}
10028 
10029 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10030   predicate(n->as_ShiftV()->is_var_shift());
10031   match(Set dst (RShiftVS (Binary dst src2) mask));
10032   match(Set dst (RShiftVI (Binary dst src2) mask));
10033   match(Set dst (RShiftVL (Binary dst src2) mask));
10034   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10035   ins_encode %{
10036     int vlen_enc = vector_length_encoding(this);
10037     BasicType bt = Matcher::vector_element_basic_type(this);
10038     int opc = this->ideal_Opcode();
10039     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10040                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10041   %}
10042   ins_pipe( pipe_slow );
10043 %}
10044 
10045 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10046   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10047   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10048   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10049   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10050   ins_encode %{
10051     int vlen_enc = vector_length_encoding(this);
10052     BasicType bt = Matcher::vector_element_basic_type(this);
10053     int opc = this->ideal_Opcode();
10054     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10055                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10056   %}
10057   ins_pipe( pipe_slow );
10058 %}
10059 
10060 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10061   predicate(!n->as_ShiftV()->is_var_shift());
10062   match(Set dst (URShiftVS (Binary dst src2) mask));
10063   match(Set dst (URShiftVI (Binary dst src2) mask));
10064   match(Set dst (URShiftVL (Binary dst src2) mask));
10065   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10066   ins_encode %{
10067     int vlen_enc = vector_length_encoding(this);
10068     BasicType bt = Matcher::vector_element_basic_type(this);
10069     int opc = this->ideal_Opcode();
10070     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10071                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10072   %}
10073   ins_pipe( pipe_slow );
10074 %}
10075 
10076 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10077   predicate(n->as_ShiftV()->is_var_shift());
10078   match(Set dst (URShiftVS (Binary dst src2) mask));
10079   match(Set dst (URShiftVI (Binary dst src2) mask));
10080   match(Set dst (URShiftVL (Binary dst src2) mask));
10081   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10082   ins_encode %{
10083     int vlen_enc = vector_length_encoding(this);
10084     BasicType bt = Matcher::vector_element_basic_type(this);
10085     int opc = this->ideal_Opcode();
10086     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10087                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10088   %}
10089   ins_pipe( pipe_slow );
10090 %}
10091 
10092 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10093   match(Set dst (MaxV (Binary dst src2) mask));
10094   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10095   ins_encode %{
10096     int vlen_enc = vector_length_encoding(this);
10097     BasicType bt = Matcher::vector_element_basic_type(this);
10098     int opc = this->ideal_Opcode();
10099     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10100                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10101   %}
10102   ins_pipe( pipe_slow );
10103 %}
10104 
10105 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10106   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10107   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10108   ins_encode %{
10109     int vlen_enc = vector_length_encoding(this);
10110     BasicType bt = Matcher::vector_element_basic_type(this);
10111     int opc = this->ideal_Opcode();
10112     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10113                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10114   %}
10115   ins_pipe( pipe_slow );
10116 %}
10117 
10118 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10119   match(Set dst (MinV (Binary dst src2) mask));
10120   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10121   ins_encode %{
10122     int vlen_enc = vector_length_encoding(this);
10123     BasicType bt = Matcher::vector_element_basic_type(this);
10124     int opc = this->ideal_Opcode();
10125     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10126                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10127   %}
10128   ins_pipe( pipe_slow );
10129 %}
10130 
10131 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10132   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10133   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10134   ins_encode %{
10135     int vlen_enc = vector_length_encoding(this);
10136     BasicType bt = Matcher::vector_element_basic_type(this);
10137     int opc = this->ideal_Opcode();
10138     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10139                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10140   %}
10141   ins_pipe( pipe_slow );
10142 %}
10143 
10144 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10145   match(Set dst (VectorRearrange (Binary dst src2) mask));
10146   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10147   ins_encode %{
10148     int vlen_enc = vector_length_encoding(this);
10149     BasicType bt = Matcher::vector_element_basic_type(this);
10150     int opc = this->ideal_Opcode();
10151     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10152                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10153   %}
10154   ins_pipe( pipe_slow );
10155 %}
10156 
10157 instruct vabs_masked(vec dst, kReg mask) %{
10158   match(Set dst (AbsVB dst mask));
10159   match(Set dst (AbsVS dst mask));
10160   match(Set dst (AbsVI dst mask));
10161   match(Set dst (AbsVL dst mask));
10162   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10163   ins_encode %{
10164     int vlen_enc = vector_length_encoding(this);
10165     BasicType bt = Matcher::vector_element_basic_type(this);
10166     int opc = this->ideal_Opcode();
10167     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10168                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10169   %}
10170   ins_pipe( pipe_slow );
10171 %}
10172 
10173 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10174   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10175   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10176   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10177   ins_encode %{
10178     assert(UseFMA, "Needs FMA instruction support.");
10179     int vlen_enc = vector_length_encoding(this);
10180     BasicType bt = Matcher::vector_element_basic_type(this);
10181     int opc = this->ideal_Opcode();
10182     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10183                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10184   %}
10185   ins_pipe( pipe_slow );
10186 %}
10187 
10188 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10189   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10190   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10191   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10192   ins_encode %{
10193     assert(UseFMA, "Needs FMA instruction support.");
10194     int vlen_enc = vector_length_encoding(this);
10195     BasicType bt = Matcher::vector_element_basic_type(this);
10196     int opc = this->ideal_Opcode();
10197     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10198                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10199   %}
10200   ins_pipe( pipe_slow );
10201 %}
10202 
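      // Masked vector compare into an opmask register: only lanes enabled in $mask are compared;
      // disabled lanes yield a zero bit in $dst.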
10203 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10204   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10205   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10206   ins_encode %{
10207     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10208     int vlen_enc = vector_length_encoding(this, $src1);
10209     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10210 
10211     // Dispatch the comparison on the element type of the first source vector.
10212     switch (src1_elem_bt) {
10213       case T_BYTE: {
10214         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10215         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10216         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10217         break;
10218       }
10219       case T_SHORT: {
10220         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10221         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10222         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10223         break;
10224       }
10225       case T_INT: {
10226         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10227         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10228         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10229         break;
10230       }
10231       case T_LONG: {
10232         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10233         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10234         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10235         break;
10236       }
10237       case T_FLOAT: {
10238         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10239         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10240         break;
10241       }
10242       case T_DOUBLE: {
10243         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10244         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10245         break;
10246       }
10247       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10248     }
10249   %}
10250   ins_pipe( pipe_slow );
10251 %}
10252 
10253 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10254   predicate(Matcher::vector_length(n) <= 32);
10255   match(Set dst (MaskAll src));
10256   format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
10257   ins_encode %{
10258     int mask_len = Matcher::vector_length(this);
10259     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10260   %}
10261   ins_pipe( pipe_slow );
10262 %}
10263 
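      // Mask negation appears in the ideal graph as XorVMask with an all-ones MaskAll(-1) operand
      // and is lowered to knot.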
10264 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10265   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10266   match(Set dst (XorVMask src (MaskAll cnt)));
10267   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10268   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10269   ins_encode %{
10270     uint masklen = Matcher::vector_length(this);
10271     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10272   %}
10273   ins_pipe( pipe_slow );
10274 %}
10275 
10276 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10277   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10278             (Matcher::vector_length(n) == 16) ||
10279             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10280   match(Set dst (XorVMask src (MaskAll cnt)));
10281   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10282   ins_encode %{
10283     uint masklen = Matcher::vector_length(this);
10284     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10285   %}
10286   ins_pipe( pipe_slow );
10287 %}
10288 
10289 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10290   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10291   match(Set dst (VectorLongToMask src));
10292   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10293   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10294   ins_encode %{
10295     int mask_len = Matcher::vector_length(this);
10296     int vec_enc  = vector_length_encoding(mask_len);
10297     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10298                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10299   %}
10300   ins_pipe( pipe_slow );
10301 %}
10302 
10304 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10305   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10306   match(Set dst (VectorLongToMask src));
10307   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10308   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10309   ins_encode %{
10310     int mask_len = Matcher::vector_length(this);
10311     assert(mask_len <= 32, "invalid mask length");
10312     int vec_enc  = vector_length_encoding(mask_len);
10313     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10314                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10315   %}
10316   ins_pipe( pipe_slow );
10317 %}
10318 
10319 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10320   predicate(n->bottom_type()->isa_vectmask());
10321   match(Set dst (VectorLongToMask src));
10322   format %{ "long_to_mask_evex $dst, $src\t!" %}
10323   ins_encode %{
10324     __ kmov($dst$$KRegister, $src$$Register);
10325   %}
10326   ins_pipe( pipe_slow );
10327 %}
10328 
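      // Logical operations on opmask registers. Mask lengths below 16 are widened to 16 when
      // AVX512DQ is unavailable, because the byte-granular k-register instructions require it.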
10329 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10330   match(Set dst (AndVMask src1 src2));
10331   match(Set dst (OrVMask src1 src2));
10332   match(Set dst (XorVMask src1 src2));
10333   effect(TEMP kscratch);
10334   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10335   ins_encode %{
10336     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10337     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10338     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10339     uint masklen = Matcher::vector_length(this);
10340     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10341     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10342   %}
10343   ins_pipe( pipe_slow );
10344 %}
10345 
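      // Masked ternary logic: $func is the 8-bit truth-table immediate passed to vpternlog.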
10346 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10347   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10348   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10349   ins_encode %{
10350     int vlen_enc = vector_length_encoding(this);
10351     BasicType bt = Matcher::vector_element_basic_type(this);
10352     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10353                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10354   %}
10355   ins_pipe( pipe_slow );
10356 %}
10357 
10358 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10359   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10360   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10361   ins_encode %{
10362     int vlen_enc = vector_length_encoding(this);
10363     BasicType bt = Matcher::vector_element_basic_type(this);
10364     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10365                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10366   %}
10367   ins_pipe( pipe_slow );
10368 %}
10369 
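      // Vector and mask casts emit no code; they exist only to carry type information.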
10370 instruct castMM(kReg dst)
10371 %{
10372   match(Set dst (CastVV dst));
10373 
10374   size(0);
10375   format %{ "# castVV of $dst" %}
10376   ins_encode(/* empty encoding */);
10377   ins_cost(0);
10378   ins_pipe(empty);
10379 %}
10380 
10381 instruct castVV(vec dst)
10382 %{
10383   match(Set dst (CastVV dst));
10384 
10385   size(0);
10386   format %{ "# castVV of $dst" %}
10387   ins_encode(/* empty encoding */);
10388   ins_cost(0);
10389   ins_pipe(empty);
10390 %}
10391 
10392 instruct castVVLeg(legVec dst)
10393 %{
10394   match(Set dst (CastVV dst));
10395 
10396   size(0);
10397   format %{ "# castVV of $dst" %}
10398   ins_encode(/* empty encoding */);
10399   ins_cost(0);
10400   ins_pipe(empty);
10401 %}
10402 
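      // IsInfinite via vfpclass: immediate 0x18 selects the +Infinity (0x08) and -Infinity (0x10)
      // classes; the resulting mask bit is copied into a GPR.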
10403 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10404 %{
10405   match(Set dst (IsInfiniteF src));
10406   effect(TEMP ktmp, KILL cr);
10407   format %{ "float_class_check $dst, $src" %}
10408   ins_encode %{
10409     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10410     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10411   %}
10412   ins_pipe(pipe_slow);
10413 %}
10414 
10415 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10416 %{
10417   match(Set dst (IsInfiniteD src));
10418   effect(TEMP ktmp, KILL cr);
10419   format %{ "double_class_check $dst, $src" %}
10420   ins_encode %{
10421     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10422     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10423   %}
10424   ins_pipe(pipe_slow);
10425 %}
10426 
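      // Saturating vector add/sub. Byte and short elements map directly onto the packed saturating
      // instructions; int and long elements have no hardware equivalent and are emulated, with
      // separate EVEX and AVX flavors selected by the predicates below.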
10427 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10428 %{
10429   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10430             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10431   match(Set dst (SaturatingAddV src1 src2));
10432   match(Set dst (SaturatingSubV src1 src2));
10433   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10434   ins_encode %{
10435     int vlen_enc = vector_length_encoding(this);
10436     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10437     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10438                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10439   %}
10440   ins_pipe(pipe_slow);
10441 %}
10442 
10443 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10444 %{
10445   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10446             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10447   match(Set dst (SaturatingAddV src1 src2));
10448   match(Set dst (SaturatingSubV src1 src2));
10449   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10450   ins_encode %{
10451     int vlen_enc = vector_length_encoding(this);
10452     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10453     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10454                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10455   %}
10456   ins_pipe(pipe_slow);
10457 %}
10458 
10459 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10460 %{
10461   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10462             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10463             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10464   match(Set dst (SaturatingAddV src1 src2));
10465   match(Set dst (SaturatingSubV src1 src2));
10466   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10467   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10468   ins_encode %{
10469     int vlen_enc = vector_length_encoding(this);
10470     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10471     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10472                                         $src1$$XMMRegister, $src2$$XMMRegister,
10473                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10474                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10475   %}
10476   ins_pipe(pipe_slow);
10477 %}
10478 
10479 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10480 %{
10481   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10482             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10483             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10484   match(Set dst (SaturatingAddV src1 src2));
10485   match(Set dst (SaturatingSubV src1 src2));
10486   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10487   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10488   ins_encode %{
10489     int vlen_enc = vector_length_encoding(this);
10490     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10491     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10492                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10493                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10494   %}
10495   ins_pipe(pipe_slow);
10496 %}
10497 
10498 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10499 %{
10500   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10501             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10502             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10503   match(Set dst (SaturatingAddV src1 src2));
10504   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10505   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10506   ins_encode %{
10507     int vlen_enc = vector_length_encoding(this);
10508     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10509     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10510                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10511   %}
10512   ins_pipe(pipe_slow);
10513 %}
10514 
10515 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10516 %{
10517   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10518             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10519             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10520   match(Set dst (SaturatingAddV src1 src2));
10521   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10522   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10523   ins_encode %{
10524     int vlen_enc = vector_length_encoding(this);
10525     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10526     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10527                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10528   %}
10529   ins_pipe(pipe_slow);
10530 %}
10531 
10532 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10533 %{
10534   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10535             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10536             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10537   match(Set dst (SaturatingSubV src1 src2));
10538   effect(TEMP ktmp);
10539   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10540   ins_encode %{
10541     int vlen_enc = vector_length_encoding(this);
10542     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10543     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10544                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10545   %}
10546   ins_pipe(pipe_slow);
10547 %}
10548 
10549 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10550 %{
10551   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10552             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10553             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10554   match(Set dst (SaturatingSubV src1 src2));
10555   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10556   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10557   ins_encode %{
10558     int vlen_enc = vector_length_encoding(this);
10559     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10560     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10561                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10562   %}
10563   ins_pipe(pipe_slow);
10564 %}
10565 
10566 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10567 %{
10568   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10569             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10570   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10571   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10572   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10573   ins_encode %{
10574     int vlen_enc = vector_length_encoding(this);
10575     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10576     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10577                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10578   %}
10579   ins_pipe(pipe_slow);
10580 %}
10581 
10582 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10583 %{
10584   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10585             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10586   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10587   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10588   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10589   ins_encode %{
10590     int vlen_enc = vector_length_encoding(this);
10591     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10592     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10593                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10594   %}
10595   ins_pipe(pipe_slow);
10596 %}
10597 
10598 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10599   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10600             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10601   match(Set dst (SaturatingAddV (Binary dst src) mask));
10602   match(Set dst (SaturatingSubV (Binary dst src) mask));
10603   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10604   ins_encode %{
10605     int vlen_enc = vector_length_encoding(this);
10606     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10607     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10608                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10609   %}
10610   ins_pipe( pipe_slow );
10611 %}
10612 
10613 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10614   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10615             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10616   match(Set dst (SaturatingAddV (Binary dst src) mask));
10617   match(Set dst (SaturatingSubV (Binary dst src) mask));
10618   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10619   ins_encode %{
10620     int vlen_enc = vector_length_encoding(this);
10621     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10622     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10623                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10624   %}
10625   ins_pipe( pipe_slow );
10626 %}
10627 
10628 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10629   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10630             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10631   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10632   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10633   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10634   ins_encode %{
10635     int vlen_enc = vector_length_encoding(this);
10636     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10637     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10638                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10639   %}
10640   ins_pipe( pipe_slow );
10641 %}
10642 
10643 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10644   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10645             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10646   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10647   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10648   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10649   ins_encode %{
10650     int vlen_enc = vector_length_encoding(this);
10651     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10652     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10653                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10654   %}
10655   ins_pipe( pipe_slow );
10656 %}
10657 
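      // Two-vector table lookup: elements are selected from $src1/$src2 according to $index and the
      // result overwrites the index register.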
10658 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10659 %{
10660   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10661   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10662   ins_encode %{
10663     int vlen_enc = vector_length_encoding(this);
10664     BasicType bt = Matcher::vector_element_basic_type(this);
10665     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10666   %}
10667   ins_pipe(pipe_slow);
10668 %}
10669 
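      // Scalar Float16 operations. The half-precision value lives in the low 16 bits of an XMM
      // register; vmovw transfers it between a GPR and an XMM register for the reinterpret casts.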
10670 instruct reinterpretS2HF(regF dst, rRegI src)
10671 %{
10672   match(Set dst (ReinterpretS2HF src));
10673   format %{ "vmovw $dst, $src" %}
10674   ins_encode %{
10675     __ vmovw($dst$$XMMRegister, $src$$Register);
10676   %}
10677   ins_pipe(pipe_slow);
10678 %}
10679 
10680 instruct convF2HFAndS2HF(regF dst, regF src)
10681 %{
10682   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10683   format %{ "convF2HFAndS2HF $dst, $src" %}
10684   ins_encode %{
10685     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10686   %}
10687   ins_pipe(pipe_slow);
10688 %}
10689 
10690 instruct convHF2SAndHF2F(regF dst, regF src)
10691 %{
10692   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10693   format %{ "convHF2SAndHF2F $dst, $src" %}
10694   ins_encode %{
10695     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10696   %}
10697   ins_pipe(pipe_slow);
10698 %}
10699 
10700 instruct reinterpretHF2S(rRegI dst, regF src)
10701 %{
10702   match(Set dst (ReinterpretHF2S src));
10703   format %{ "vmovw $dst, $src" %}
10704   ins_encode %{
10705     __ vmovw($dst$$Register, $src$$XMMRegister);
10706   %}
10707   ins_pipe(pipe_slow);
10708 %}
10709 
10710 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10711 %{
10712   match(Set dst (SqrtHF src));
10713   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10714   ins_encode %{
10715     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10716   %}
10717   ins_pipe(pipe_slow);
10718 %}
10719 
10720 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10721 %{
10722   match(Set dst (AddHF src1 src2));
10723   match(Set dst (DivHF src1 src2));
10724   match(Set dst (MulHF src1 src2));
10725   match(Set dst (SubHF src1 src2));
10726   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10727   ins_encode %{
10728     int opcode = this->ideal_Opcode();
10729     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10730   %}
10731   ins_pipe(pipe_slow);
10732 %}
10733 
10734 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10735 %{
10736   match(Set dst (MaxHF src1 src2));
10737   match(Set dst (MinHF src1 src2));
10738   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10739   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t! using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10740   ins_encode %{
10741     int opcode = this->ideal_Opcode();
10742     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10743                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, Assembler::AVX_128bit);
10744   %}
10745   ins_pipe( pipe_slow );
10746 %}
10747 
10748 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10749 %{
10750   match(Set dst (FmaHF src2 (Binary dst src1)));
10751   effect(DEF dst);
10752   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 scalar fma HF" %}
10753   ins_encode %{
10754     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10755   %}
10756   ins_pipe( pipe_slow );
10757 %}