//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
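//
// As an illustrative reading of that format (this simply restates the first
// definition below rather than adding a new one):
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// gives word (a) of xmm0 the save type SOC for both the register and the
// C-convention fields, ideal type Op_RegF (spilled as a float), hardware
// encoding 0, and ties it to the VM's xmm0 register via the last argument.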

// XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used by SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters
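// For example, a Java float allocated to xmm0 occupies only XMM0 (word a)
// below, while a double occupies the adjacent pair XMM0, XMM0b (words ab).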

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
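// Note that k0 is not defined here: in EVEX encodings the k0 selector means
// "no masking", so it cannot be handed out by the register allocator.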
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
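// As the predicates suggest, these dynamic classes resolve to the EVEX
// variant (XMM0-XMM31) when the given condition holds at VM startup and to
// the legacy set (XMM0-XMM15) otherwise; the same pattern repeats for the
// wider vector classes below.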

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,
                      XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d);

// Class for evex 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,
                      XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d);

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for all 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
 1142     // The exception handler starts out as a jump and can be patched to
 1143     // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // Three 5-byte instructions plus one move for an unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
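      // For example, a 32-byte (256-bit) vector operand maps to Assembler::AVX_256bit,
      // while 4-, 8- and 16-byte operands are all encoded with the 128-bit vector length.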
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
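      // These platform-dependent flags are queried on MachNodes via flags(), e.g.
      // (flags() & Node::PD::Flag_intel_jcc_erratum) in MachNode::compute_padding() below.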
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // Push "the_pc" on the stack without destroying any registers,
 1276   // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
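      // For example, widthForType(T_INT) selects Assembler::D (doubleword) and
      // widthForType(T_LONG) selects Assembler::Q (quadword).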
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
 1586 // Identify extra cases in which we might want to provide match rules for vector nodes
 1587 // and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
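        // For example, a vector of 8 ints has size_in_bits = 8 * 4 * 8 = 256.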
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
 1613       } // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if (bt == T_FLOAT || bt == T_DOUBLE) {
 1678         // Float/Double intrinsics are enabled for AVX family currently.
 1679         if (UseAVX == 0) {
 1680           return false;
 1681         }
 1682         if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683           return false;
 1684         }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
 1728       if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
 1765         // As per JLS section 5.1.3, narrowing conversions to sub-word types
 1766         // happen after an intermediate conversion to integer, and the special
 1767         // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1835          return false;
 1836        }
 1837        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1838          return false;
 1839        }
 1840        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1841          return false;
 1842        }
 1843        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1844          return false;
 1845        }
 1846        break;
 1847     case Op_MaskAll:
 1848       if (!VM_Version::supports_evex()) {
 1849         return false;
 1850       }
 1851       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1852         return false;
 1853       }
 1854       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1855         return false;
 1856       }
 1857       break;
 1858     case Op_VectorMaskCmp:
 1859       if (vlen < 2 || size_in_bits < 32) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_CompressM:
 1864       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_CompressV:
 1869     case Op_ExpandV:
 1870       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1871         return false;
 1872       }
 1873       if (size_in_bits < 128) {
 1874         return false;
 1875       }
 1876     case Op_VectorLongToMask:
 1877       if (UseAVX < 1) {
 1878         return false;
 1879       }
 1880       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1881         return false;
 1882       }
 1883       break;
 1884     case Op_SignumVD:
 1885     case Op_SignumVF:
 1886       if (UseAVX < 1) {
 1887         return false;
 1888       }
 1889       break;
 1890     case Op_PopCountVI:
 1891     case Op_PopCountVL: {
 1892         if (!is_pop_count_instr_target(bt) &&
 1893             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1894           return false;
 1895         }
 1896       }
 1897       break;
 1898     case Op_ReverseV:
 1899     case Op_ReverseBytesV:
 1900       if (UseAVX < 2) {
 1901         return false;
 1902       }
 1903       break;
 1904     case Op_CountTrailingZerosV:
 1905     case Op_CountLeadingZerosV:
 1906       if (UseAVX < 2) {
 1907         return false;
 1908       }
 1909       break;
 1910   }
 1911   return true;  // Per default match rules are supported.
 1912 }
 1913 
 1914 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 1915   // The ADLC based match_rule_supported routine checks for the existence of a pattern
 1916   // based on the IR opcode. Most of the unary/binary/ternary masked operations share the
 1917   // IR nodes of their non-masked counterparts, with the mask edge being the differentiator.
 1918   // This routine does a strict check on the existence of masked operation patterns
 1919   // by returning false for all opcodes other than the ones whose masked instruction
 1920   // patterns are defined in this file.
 1921   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1922     return false;
 1923   }
 1924 
 1925   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1926   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1927     return false;
 1928   }
 1929   switch(opcode) {
 1930     // Unary masked operations
 1931     case Op_AbsVB:
 1932     case Op_AbsVS:
 1933       if (!VM_Version::supports_avx512bw()) {
 1934         return false;  // Implementation limitation
 1935       } // fallthrough
 1936     case Op_AbsVI:
 1937     case Op_AbsVL:
 1938       return true;
 1939 
 1940     // Ternary masked operations
 1941     case Op_FmaVF:
 1942     case Op_FmaVD:
 1943       return true;
 1944 
 1945     case Op_MacroLogicV:
 1946       if (bt != T_INT && bt != T_LONG) {
 1947         return false;
 1948       }
 1949       return true;
 1950 
 1951     // Binary masked operations
 1952     case Op_AddVB:
 1953     case Op_AddVS:
 1954     case Op_SubVB:
 1955     case Op_SubVS:
 1956     case Op_MulVS:
 1957     case Op_LShiftVS:
 1958     case Op_RShiftVS:
 1959     case Op_URShiftVS:
 1960       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1961       if (!VM_Version::supports_avx512bw()) {
 1962         return false;  // Implementation limitation
 1963       }
 1964       return true;
 1965 
 1966     case Op_MulVL:
 1967       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1968       if (!VM_Version::supports_avx512dq()) {
 1969         return false;  // Implementation limitation
 1970       }
 1971       return true;
 1972 
 1973     case Op_AndV:
 1974     case Op_OrV:
 1975     case Op_XorV:
 1976     case Op_RotateRightV:
 1977     case Op_RotateLeftV:
 1978       if (bt != T_INT && bt != T_LONG) {
 1979         return false; // Implementation limitation
 1980       }
 1981       return true;
 1982 
 1983     case Op_VectorLoadMask:
 1984       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1985       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1986         return false;
 1987       }
 1988       return true;
 1989 
 1990     case Op_AddVI:
 1991     case Op_AddVL:
 1992     case Op_AddVF:
 1993     case Op_AddVD:
 1994     case Op_SubVI:
 1995     case Op_SubVL:
 1996     case Op_SubVF:
 1997     case Op_SubVD:
 1998     case Op_MulVI:
 1999     case Op_MulVF:
 2000     case Op_MulVD:
 2001     case Op_DivVF:
 2002     case Op_DivVD:
 2003     case Op_SqrtVF:
 2004     case Op_SqrtVD:
 2005     case Op_LShiftVI:
 2006     case Op_LShiftVL:
 2007     case Op_RShiftVI:
 2008     case Op_RShiftVL:
 2009     case Op_URShiftVI:
 2010     case Op_URShiftVL:
 2011     case Op_LoadVectorMasked:
 2012     case Op_StoreVectorMasked:
 2013     case Op_LoadVectorGatherMasked:
 2014     case Op_StoreVectorScatterMasked:
 2015       return true;
 2016 
 2017     case Op_UMinV:
 2018     case Op_UMaxV:
 2019       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2020         return false;
 2021       } // fallthrough
 2022     case Op_MaxV:
 2023     case Op_MinV:
 2024       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2025         return false; // Implementation limitation
 2026       }
 2027       if (is_floating_point_type(bt)) {
 2028         return false; // Implementation limitation
 2029       }
 2030       return true;
 2031     case Op_SaturatingAddV:
 2032     case Op_SaturatingSubV:
 2033       if (!is_subword_type(bt)) {
 2034         return false;
 2035       }
 2036       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2037         return false; // Implementation limitation
 2038       }
 2039       return true;
 2040 
 2041     case Op_VectorMaskCmp:
 2042       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2043         return false; // Implementation limitation
 2044       }
 2045       return true;
 2046 
 2047     case Op_VectorRearrange:
 2048       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2049         return false; // Implementation limitation
 2050       }
 2051       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2052         return false; // Implementation limitation
 2053       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2054         return false; // Implementation limitation
 2055       }
 2056       return true;
 2057 
 2058     // Binary Logical operations
 2059     case Op_AndVMask:
 2060     case Op_OrVMask:
 2061     case Op_XorVMask:
 2062       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2063         return false; // Implementation limitation
 2064       }
 2065       return true;
 2066 
 2067     case Op_PopCountVI:
 2068     case Op_PopCountVL:
 2069       if (!is_pop_count_instr_target(bt)) {
 2070         return false;
 2071       }
 2072       return true;
 2073 
 2074     case Op_MaskAll:
 2075       return true;
 2076 
 2077     case Op_CountLeadingZerosV:
 2078       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2079         return true;
 2080       } // fallthrough
 2081     default:
 2082       return false;
 2083   }
 2084 }
 2085 
 2086 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2087   return false;
 2088 }
 2089 
 2090 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2091 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2092   switch (elem_bt) {
 2093     case T_BYTE:  return false;
 2094     case T_SHORT: return !VM_Version::supports_avx512bw();
 2095     case T_INT:   return !VM_Version::supports_avx();
 2096     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2097     default:
 2098       ShouldNotReachHere();
 2099       return false;
 2100   }
 2101 }
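      // For example, byte shuffles never need a separate shuffle-preparation step,
      // while short shuffles need one unless AVX512BW is available.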
 2102 
 2103 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2104   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2105   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2106   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2107       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2108     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2109     return new legVecZOper();
 2110   }
 2111   if (legacy) {
 2112     switch (ideal_reg) {
 2113       case Op_VecS: return new legVecSOper();
 2114       case Op_VecD: return new legVecDOper();
 2115       case Op_VecX: return new legVecXOper();
 2116       case Op_VecY: return new legVecYOper();
 2117       case Op_VecZ: return new legVecZOper();
 2118     }
 2119   } else {
 2120     switch (ideal_reg) {
 2121       case Op_VecS: return new vecSOper();
 2122       case Op_VecD: return new vecDOper();
 2123       case Op_VecX: return new vecXOper();
 2124       case Op_VecY: return new vecYOper();
 2125       case Op_VecZ: return new vecZOper();
 2126     }
 2127   }
 2128   ShouldNotReachHere();
 2129   return nullptr;
 2130 }
 2131 
 2132 bool Matcher::is_reg2reg_move(MachNode* m) {
 2133   switch (m->rule()) {
 2134     case MoveVec2Leg_rule:
 2135     case MoveLeg2Vec_rule:
 2136     case MoveF2VL_rule:
 2137     case MoveF2LEG_rule:
 2138     case MoveVL2F_rule:
 2139     case MoveLEG2F_rule:
 2140     case MoveD2VL_rule:
 2141     case MoveD2LEG_rule:
 2142     case MoveVL2D_rule:
 2143     case MoveLEG2D_rule:
 2144       return true;
 2145     default:
 2146       return false;
 2147   }
 2148 }
 2149 
 2150 bool Matcher::is_generic_vector(MachOper* opnd) {
 2151   switch (opnd->opcode()) {
 2152     case VEC:
 2153     case LEGVEC:
 2154       return true;
 2155     default:
 2156       return false;
 2157   }
 2158 }
 2159 
 2160 //------------------------------------------------------------------------
 2161 
 2162 const RegMask* Matcher::predicate_reg_mask(void) {
 2163   return &_VECTMASK_REG_mask;
 2164 }
 2165 
 2166 // Max vector size in bytes. 0 if not supported.
 2167 int Matcher::vector_width_in_bytes(BasicType bt) {
 2168   assert(is_java_primitive(bt), "only primitive type vectors");
 2169   // SSE2 supports 128bit vectors for all types.
 2170   // AVX2 supports 256bit vectors for all types.
 2171   // EVEX (AVX-512) supports 512bit vectors for all types (subword types additionally require AVX512BW, see below).
 2172   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2173   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2174   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2175     size = (UseAVX > 2) ? 64 : 32;
 2176   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2177     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2178   // Use flag to limit vector size.
 2179   size = MIN2(size,(int)MaxVectorSize);
 2180   // Minimum 2 values in vector (or 4 for bytes).
 2181   switch (bt) {
 2182   case T_DOUBLE:
 2183   case T_LONG:
 2184     if (size < 16) return 0;
 2185     break;
 2186   case T_FLOAT:
 2187   case T_INT:
 2188     if (size < 8) return 0;
 2189     break;
 2190   case T_BOOLEAN:
 2191     if (size < 4) return 0;
 2192     break;
 2193   case T_CHAR:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_BYTE:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_SHORT:
 2200     if (size < 4) return 0;
 2201     break;
 2202   default:
 2203     ShouldNotReachHere();
 2204   }
 2205   return size;
 2206 }
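      // For example, with UseAVX == 2 the base size is (1 << 2) * 8 = 32 bytes; with
      // UseAVX == 3 it is 64 bytes, but subword types keep 64 bytes only when AVX512BW
      // is available. The result is always capped by MaxVectorSize.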
 2207 
 2208 // Limits on vector size (number of elements) loaded into vector.
 2209 int Matcher::max_vector_size(const BasicType bt) {
 2210   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2211 }
 2212 int Matcher::min_vector_size(const BasicType bt) {
 2213   int max_size = max_vector_size(bt);
 2214   // Min size which can be loaded into vector is 4 bytes.
 2215   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2216   // Support for calling svml double64 vectors
 2217   if (bt == T_DOUBLE) {
 2218     size = 1;
 2219   }
 2220   return MIN2(size,max_size);
 2221 }
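      // For example, min_vector_size(T_BYTE) is 4 elements, while min_vector_size(T_DOUBLE)
      // is 1 element so that SVML calls on Double64 vectors can be matched.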
 2222 
 2223 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2224   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2225   // by default on Cascade Lake
 2226   if (VM_Version::is_default_intel_cascade_lake()) {
 2227     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2228   }
 2229   return Matcher::max_vector_size(bt);
 2230 }
 2231 
 2232 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2233   return -1;
 2234 }
 2235 
 2236 // Vector ideal reg corresponding to specified size in bytes
 2237 uint Matcher::vector_ideal_reg(int size) {
 2238   assert(MaxVectorSize >= size, "");
 2239   switch(size) {
 2240     case  4: return Op_VecS;
 2241     case  8: return Op_VecD;
 2242     case 16: return Op_VecX;
 2243     case 32: return Op_VecY;
 2244     case 64: return Op_VecZ;
 2245   }
 2246   ShouldNotReachHere();
 2247   return 0;
 2248 }
 2249 
 2250 // Check for shift by small constant as well
 2251 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2252   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2253       shift->in(2)->get_int() <= 3 &&
 2254       // Are there other uses besides address expressions?
 2255       !matcher->is_visited(shift)) {
 2256     address_visited.set(shift->_idx); // Flag as address_visited
 2257     mstack.push(shift->in(2), Matcher::Visit);
 2258     Node *conv = shift->in(1);
 2259     // Allow the Matcher to match the rule which bypasses the
 2260     // ConvI2L operation for an array index on LP64
 2261     // if the index value is positive.
 2262     if (conv->Opcode() == Op_ConvI2L &&
 2263         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2264         // Are there other uses besides address expressions?
 2265         !matcher->is_visited(conv)) {
 2266       address_visited.set(conv->_idx); // Flag as address_visited
 2267       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2268     } else {
 2269       mstack.push(conv, Matcher::Pre_Visit);
 2270     }
 2271     return true;
 2272   }
 2273   return false;
 2274 }
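      // Illustrative example (a sketch): clone_shift() above flags a small constant
      // shift (scale 1/2/4/8) used in an address so it can be folded into the
      // addressing mode, e.g. (LShiftL (ConvI2L i) 3) with a non-negative i becomes
      // the i*8 scaled-index part of [base + i*8 + disp] instead of being computed
      // into a register first.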
 2275 
 2276 // This function identifies sub-graphs in which a 'load' node is
 2277 // input to two different nodes, such that the sub-graph can be matched
 2278 // with BMI instructions like blsi, blsr, etc.
 2279 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2280 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2281 // refers to the same node.
 2282 //
 2283 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2284 // This is a temporary solution until we make DAGs expressible in ADL.
 2285 template<typename ConType>
 2286 class FusedPatternMatcher {
 2287   Node* _op1_node;
 2288   Node* _mop_node;
 2289   int _con_op;
 2290 
 2291   static int match_next(Node* n, int next_op, int next_op_idx) {
 2292     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2293       return -1;
 2294     }
 2295 
 2296     if (next_op_idx == -1) { // n is commutative, try rotations
 2297       if (n->in(1)->Opcode() == next_op) {
 2298         return 1;
 2299       } else if (n->in(2)->Opcode() == next_op) {
 2300         return 2;
 2301       }
 2302     } else {
 2303       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2304       if (n->in(next_op_idx)->Opcode() == next_op) {
 2305         return next_op_idx;
 2306       }
 2307     }
 2308     return -1;
 2309   }
 2310 
 2311  public:
 2312   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2313     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2314 
 2315   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2316              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2317              typename ConType::NativeType con_value) {
 2318     if (_op1_node->Opcode() != op1) {
 2319       return false;
 2320     }
 2321     if (_mop_node->outcnt() > 2) {
 2322       return false;
 2323     }
 2324     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2325     if (op1_op2_idx == -1) {
 2326       return false;
 2327     }
 2328     // Memory operation must be the other edge
 2329     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2330 
 2331     // Check that the mop node is really what we want
 2332     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2333       Node* op2_node = _op1_node->in(op1_op2_idx);
 2334       if (op2_node->outcnt() > 1) {
 2335         return false;
 2336       }
 2337       assert(op2_node->Opcode() == op2, "Should be");
 2338       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2339       if (op2_con_idx == -1) {
 2340         return false;
 2341       }
 2342       // Memory operation must be the other edge
 2343       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2344       // Check that the memory operation is the same node
 2345       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2346         // Now check the constant
 2347         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2348         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2349           return true;
 2350         }
 2351       }
 2352     }
 2353     return false;
 2354   }
 2355 };
 2356 
 2357 static bool is_bmi_pattern(Node* n, Node* m) {
 2358   assert(UseBMI1Instructions, "sanity");
 2359   if (n != nullptr && m != nullptr) {
 2360     if (m->Opcode() == Op_LoadI) {
 2361       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2362       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2363              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2364              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2365     } else if (m->Opcode() == Op_LoadL) {
 2366       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2367       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2368              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2369              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2370     }
 2371   }
 2372   return false;
 2373 }
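      // The accepted patterns above correspond to the classic BMI1 idioms (shown for
      // the int case; the long case is analogous):
      //   blsi:   x & -x       == (AndI (SubI 0 load) load)    extract lowest set bit
      //   blsr:   x & (x - 1)  == (AndI (AddI load -1) load)   clear lowest set bit
      //   blsmsk: x ^ (x - 1)  == (XorI (AddI load -1) load)   mask up to lowest set bit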
 2374 
 2375 // Should the matcher clone input 'm' of node 'n'?
 2376 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2377   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2378   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2379     mstack.push(m, Visit);
 2380     return true;
 2381   }
 2382   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2383     mstack.push(m, Visit);           // m = ShiftCntV
 2384     return true;
 2385   }
 2386   if (is_encode_and_store_pattern(n, m)) {
 2387     mstack.push(m, Visit);
 2388     return true;
 2389   }
 2390   return false;
 2391 }
 2392 
 2393 // Should the Matcher clone shifts on addressing modes, expecting them
 2394 // to be subsumed into complex addressing expressions, or compute them
 2395 // into registers?
 2396 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2397   Node *off = m->in(AddPNode::Offset);
 2398   if (off->is_Con()) {
 2399     address_visited.test_set(m->_idx); // Flag as address_visited
 2400     Node *adr = m->in(AddPNode::Address);
 2401 
 2402     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2403     // AtomicAdd is not an addressing expression.
 2404     // Cheap to find it by looking for screwy base.
 2405     if (adr->is_AddP() &&
 2406         !adr->in(AddPNode::Base)->is_top() &&
 2407         !adr->in(AddPNode::Offset)->is_Con() &&
 2408         off->get_long() == (int) (off->get_long()) && // immL32
 2409         // Are there other uses besides address expressions?
 2410         !is_visited(adr)) {
 2411       address_visited.set(adr->_idx); // Flag as address_visited
 2412       Node *shift = adr->in(AddPNode::Offset);
 2413       if (!clone_shift(shift, this, mstack, address_visited)) {
 2414         mstack.push(shift, Pre_Visit);
 2415       }
 2416       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2417       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2418     } else {
 2419       mstack.push(adr, Pre_Visit);
 2420     }
 2421 
 2422     // Clone X+offset as it also folds into most addressing expressions
 2423     mstack.push(off, Visit);
 2424     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2425     return true;
 2426   } else if (clone_shift(off, this, mstack, address_visited)) {
 2427     address_visited.test_set(m->_idx); // Flag as address_visited
 2428     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2429     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2430     return true;
 2431   }
 2432   return false;
 2433 }
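      // Illustrative effect of the cloning above (a sketch, assuming a 16-byte array
      // base offset): an int-array access a[i] has the address shape
      //   (AddP base (AddP base (LShiftL (ConvI2L i) 2)) #16)
      // and cloning the inner AddP and shift lets each memory user fold the whole
      // expression into a single [base + i*4 + 16] operand rather than sharing one
      // precomputed address register.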
 2434 
 2435 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2436   switch (bt) {
 2437     case BoolTest::eq:
 2438       return Assembler::eq;
 2439     case BoolTest::ne:
 2440       return Assembler::neq;
 2441     case BoolTest::le:
 2442     case BoolTest::ule:
 2443       return Assembler::le;
 2444     case BoolTest::ge:
 2445     case BoolTest::uge:
 2446       return Assembler::nlt;
 2447     case BoolTest::lt:
 2448     case BoolTest::ult:
 2449       return Assembler::lt;
 2450     case BoolTest::gt:
 2451     case BoolTest::ugt:
 2452       return Assembler::nle;
 2453     default : ShouldNotReachHere(); return Assembler::_false;
 2454   }
 2455 }
 2456 
 2457 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2458   switch (bt) {
 2459   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2460   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2461   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2462   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2463   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2464   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2465   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2466   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2467   }
 2468 }
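      // For example, Float.NaN != Float.NaN is true in Java, so the NE predicate must
      // be unordered (NEQ_UQ) to return true when either operand is NaN, while the
      // ordered predicates above (EQ_OQ, LT_OQ, ...) return false on NaN inputs.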
 2469 
 2470 // Helper methods for MachSpillCopyNode::implementation().
 2471 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2472                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2473   assert(ireg == Op_VecS || // 32bit vector
 2474          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2475           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2476          "no non-adjacent vector moves" );
 2477   if (masm) {
 2478     switch (ireg) {
 2479     case Op_VecS: // copy whole register
 2480     case Op_VecD:
 2481     case Op_VecX:
 2482       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2483         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2484       } else {
 2485         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2486       }
 2487       break;
 2488     case Op_VecY:
 2489       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2490         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2491       } else {
 2492         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2493       }
 2494       break;
 2495     case Op_VecZ:
 2496       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2497       break;
 2498     default:
 2499       ShouldNotReachHere();
 2500     }
 2501 #ifndef PRODUCT
 2502   } else {
 2503     switch (ireg) {
 2504     case Op_VecS:
 2505     case Op_VecD:
 2506     case Op_VecX:
 2507       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2508       break;
 2509     case Op_VecY:
 2510     case Op_VecZ:
 2511       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2512       break;
 2513     default:
 2514       ShouldNotReachHere();
 2515     }
 2516 #endif
 2517   }
 2518 }
 2519 
 2520 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2521                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2522   if (masm) {
 2523     if (is_load) {
 2524       switch (ireg) {
 2525       case Op_VecS:
 2526         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2527         break;
 2528       case Op_VecD:
 2529         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecX:
 2532         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2533           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2534         } else {
 2535           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2536           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2537         }
 2538         break;
 2539       case Op_VecY:
 2540         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2541           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2542         } else {
 2543           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2544           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2545         }
 2546         break;
 2547       case Op_VecZ:
 2548         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2549         break;
 2550       default:
 2551         ShouldNotReachHere();
 2552       }
 2553     } else { // store
 2554       switch (ireg) {
 2555       case Op_VecS:
 2556         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2557         break;
 2558       case Op_VecD:
 2559         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecX:
 2562         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2563           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2564         } else {
 2566           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2567         }
 2568         break;
 2569       case Op_VecY:
 2570         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2571           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2572         } else {
 2574           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2575         }
 2576         break;
 2577       case Op_VecZ:
 2578         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2579         break;
 2580       default:
 2581         ShouldNotReachHere();
 2582       }
 2583     }
 2584 #ifndef PRODUCT
 2585   } else {
 2586     if (is_load) {
 2587       switch (ireg) {
 2588       case Op_VecS:
 2589         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2590         break;
 2591       case Op_VecD:
 2592         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594        case Op_VecX:
 2595         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecY:
 2598       case Op_VecZ:
 2599         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2600         break;
 2601       default:
 2602         ShouldNotReachHere();
 2603       }
 2604     } else { // store
 2605       switch (ireg) {
 2606       case Op_VecS:
 2607         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2608         break;
 2609       case Op_VecD:
 2610         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612        case Op_VecX:
 2613         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecY:
 2616       case Op_VecZ:
 2617         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2618         break;
 2619       default:
 2620         ShouldNotReachHere();
 2621       }
 2622     }
 2623 #endif
 2624   }
 2625 }
 2626 
 2627 template <class T>
 2628 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2629   int size = type2aelembytes(bt) * len;
 2630   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2631   for (int i = 0; i < len; i++) {
 2632     int offset = i * type2aelembytes(bt);
 2633     switch (bt) {
 2634       case T_BYTE: val->at(i) = con; break;
 2635       case T_SHORT: {
 2636         jshort c = con;
 2637         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2638         break;
 2639       }
 2640       case T_INT: {
 2641         jint c = con;
 2642         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2643         break;
 2644       }
 2645       case T_LONG: {
 2646         jlong c = con;
 2647         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2648         break;
 2649       }
 2650       case T_FLOAT: {
 2651         jfloat c = con;
 2652         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2653         break;
 2654       }
 2655       case T_DOUBLE: {
 2656         jdouble c = con;
 2657         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2658         break;
 2659       }
 2660       default: assert(false, "%s", type2name(bt));
 2661     }
 2662   }
 2663   return val;
 2664 }
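      // For example (little-endian sketch): vreplicate_imm(T_SHORT, (jshort)0x1234, 4)
      // returns the 8 bytes 34 12 34 12 34 12 34 12, i.e. the constant replicated
      // across each 2-byte lane of the byte array.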
 2665 
 2666 static inline jlong high_bit_set(BasicType bt) {
 2667   switch (bt) {
 2668     case T_BYTE:  return 0x8080808080808080;
 2669     case T_SHORT: return 0x8000800080008000;
 2670     case T_INT:   return 0x8000000080000000;
 2671     case T_LONG:  return 0x8000000000000000;
 2672     default:
 2673       ShouldNotReachHere();
 2674       return 0;
 2675   }
 2676 }
 2677 
 2678 #ifndef PRODUCT
 2679   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2680     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2681   }
 2682 #endif
 2683 
 2684   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2685     __ nop(_count);
 2686   }
 2687 
 2688   uint MachNopNode::size(PhaseRegAlloc*) const {
 2689     return _count;
 2690   }
 2691 
 2692 #ifndef PRODUCT
 2693   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2694     st->print("# breakpoint");
 2695   }
 2696 #endif
 2697 
 2698   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2699     __ int3();
 2700   }
 2701 
 2702   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2703     return MachNode::size(ra_);
 2704   }
 2705 
 2706 %}
 2707 
 2708 encode %{
 2709 
 2710   enc_class call_epilog %{
 2711     if (VerifyStackAtCalls) {
 2712       // Check that stack depth is unchanged: find majik cookie on stack
 2713       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2714       Label L;
 2715       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2716       __ jccb(Assembler::equal, L);
 2717       // Die if stack mismatch
 2718       __ int3();
 2719       __ bind(L);
 2720     }
 2721     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2722       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
 2723       // Search for the corresponding projection, get the register and emit code that initializes it.
 2724       uint con = (tf()->range_cc()->cnt() - 1);
 2725       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2726         ProjNode* proj = fast_out(i)->as_Proj();
 2727         if (proj->_con == con) {
 2728           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2729           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2730           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2731           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2732           __ testq(rax, rax);
 2733           __ setb(Assembler::notZero, toReg);
 2734           __ movzbl(toReg, toReg);
 2735           if (reg->is_stack()) {
 2736             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2737             __ movq(Address(rsp, st_off), toReg);
 2738           }
 2739           break;
 2740         }
 2741       }
 2742       if (return_value_is_used()) {
 2743         // An inline type is returned as fields in multiple registers.
 2744         // rax either contains an oop (if the inline type is buffered) or a pointer
 2745         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2746         // if the lowest bit is set, so that C2 can use the oop after null checking.
 2747         // rax &= (rax & 1) - 1
 2748         __ movptr(rscratch1, rax);
 2749         __ andptr(rscratch1, 0x1);
 2750         __ subptr(rscratch1, 0x1);
 2751         __ andptr(rax, rscratch1);
 2752       }
 2753     }
 2754   %}
 2755 
 2756 %}
 2757 
 2758 // Operands for bound floating point register arguments
 2759 operand rxmm0() %{
 2760   constraint(ALLOC_IN_RC(xmm0_reg));
 2761   match(VecX);
 2762   format%{%}
 2763   interface(REG_INTER);
 2764 %}
 2765 
 2766 //----------OPERANDS-----------------------------------------------------------
 2767 // Operand definitions must precede instruction definitions for correct parsing
 2768 // in the ADLC because operands constitute user defined types which are used in
 2769 // instruction definitions.
 2770 
 2771 // Vectors
 2772 
 2773 // Dummy generic vector class. Should be used for all vector operands.
 2774 // Replaced with vec[SDXYZ] during post-selection pass.
 2775 operand vec() %{
 2776   constraint(ALLOC_IN_RC(dynamic));
 2777   match(VecX);
 2778   match(VecY);
 2779   match(VecZ);
 2780   match(VecS);
 2781   match(VecD);
 2782 
 2783   format %{ %}
 2784   interface(REG_INTER);
 2785 %}
 2786 
 2787 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2788 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2789 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2790 // runtime code generation via reg_class_dynamic.
 2791 operand legVec() %{
 2792   constraint(ALLOC_IN_RC(dynamic));
 2793   match(VecX);
 2794   match(VecY);
 2795   match(VecZ);
 2796   match(VecS);
 2797   match(VecD);
 2798 
 2799   format %{ %}
 2800   interface(REG_INTER);
 2801 %}
 2802 
 2803 // Replaces vec during post-selection cleanup. See above.
 2804 operand vecS() %{
 2805   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2806   match(VecS);
 2807 
 2808   format %{ %}
 2809   interface(REG_INTER);
 2810 %}
 2811 
 2812 // Replaces legVec during post-selection cleanup. See above.
 2813 operand legVecS() %{
 2814   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2815   match(VecS);
 2816 
 2817   format %{ %}
 2818   interface(REG_INTER);
 2819 %}
 2820 
 2821 // Replaces vec during post-selection cleanup. See above.
 2822 operand vecD() %{
 2823   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2824   match(VecD);
 2825 
 2826   format %{ %}
 2827   interface(REG_INTER);
 2828 %}
 2829 
 2830 // Replaces legVec during post-selection cleanup. See above.
 2831 operand legVecD() %{
 2832   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2833   match(VecD);
 2834 
 2835   format %{ %}
 2836   interface(REG_INTER);
 2837 %}
 2838 
 2839 // Replaces vec during post-selection cleanup. See above.
 2840 operand vecX() %{
 2841   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2842   match(VecX);
 2843 
 2844   format %{ %}
 2845   interface(REG_INTER);
 2846 %}
 2847 
 2848 // Replaces legVec during post-selection cleanup. See above.
 2849 operand legVecX() %{
 2850   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2851   match(VecX);
 2852 
 2853   format %{ %}
 2854   interface(REG_INTER);
 2855 %}
 2856 
 2857 // Replaces vec during post-selection cleanup. See above.
 2858 operand vecY() %{
 2859   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2860   match(VecY);
 2861 
 2862   format %{ %}
 2863   interface(REG_INTER);
 2864 %}
 2865 
 2866 // Replaces legVec during post-selection cleanup. See above.
 2867 operand legVecY() %{
 2868   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2869   match(VecY);
 2870 
 2871   format %{ %}
 2872   interface(REG_INTER);
 2873 %}
 2874 
 2875 // Replaces vec during post-selection cleanup. See above.
 2876 operand vecZ() %{
 2877   constraint(ALLOC_IN_RC(vectorz_reg));
 2878   match(VecZ);
 2879 
 2880   format %{ %}
 2881   interface(REG_INTER);
 2882 %}
 2883 
 2884 // Replaces legVec during post-selection cleanup. See above.
 2885 operand legVecZ() %{
 2886   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2887   match(VecZ);
 2888 
 2889   format %{ %}
 2890   interface(REG_INTER);
 2891 %}
 2892 
 2893 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2894 
 2895 // ============================================================================
 2896 
 2897 instruct ShouldNotReachHere() %{
 2898   match(Halt);
 2899   format %{ "stop\t# ShouldNotReachHere" %}
 2900   ins_encode %{
 2901     if (is_reachable()) {
 2902       const char* str = __ code_string(_halt_reason);
 2903       __ stop(str);
 2904     }
 2905   %}
 2906   ins_pipe(pipe_slow);
 2907 %}
 2908 
 2909 // ============================================================================
 2910 
 2911 instruct addF_reg(regF dst, regF src) %{
 2912   predicate(UseAVX == 0);
 2913   match(Set dst (AddF dst src));
 2914 
 2915   format %{ "addss   $dst, $src" %}
 2916   ins_cost(150);
 2917   ins_encode %{
 2918     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2919   %}
 2920   ins_pipe(pipe_slow);
 2921 %}
 2922 
 2923 instruct addF_mem(regF dst, memory src) %{
 2924   predicate(UseAVX == 0);
 2925   match(Set dst (AddF dst (LoadF src)));
 2926 
 2927   format %{ "addss   $dst, $src" %}
 2928   ins_cost(150);
 2929   ins_encode %{
 2930     __ addss($dst$$XMMRegister, $src$$Address);
 2931   %}
 2932   ins_pipe(pipe_slow);
 2933 %}
 2934 
 2935 instruct addF_imm(regF dst, immF con) %{
 2936   predicate(UseAVX == 0);
 2937   match(Set dst (AddF dst con));
 2938   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2939   ins_cost(150);
 2940   ins_encode %{
 2941     __ addss($dst$$XMMRegister, $constantaddress($con));
 2942   %}
 2943   ins_pipe(pipe_slow);
 2944 %}
 2945 
 2946 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2947   predicate(UseAVX > 0);
 2948   match(Set dst (AddF src1 src2));
 2949 
 2950   format %{ "vaddss  $dst, $src1, $src2" %}
 2951   ins_cost(150);
 2952   ins_encode %{
 2953     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2954   %}
 2955   ins_pipe(pipe_slow);
 2956 %}
 2957 
 2958 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2959   predicate(UseAVX > 0);
 2960   match(Set dst (AddF src1 (LoadF src2)));
 2961 
 2962   format %{ "vaddss  $dst, $src1, $src2" %}
 2963   ins_cost(150);
 2964   ins_encode %{
 2965     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2966   %}
 2967   ins_pipe(pipe_slow);
 2968 %}
 2969 
 2970 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2971   predicate(UseAVX > 0);
 2972   match(Set dst (AddF src con));
 2973 
 2974   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2975   ins_cost(150);
 2976   ins_encode %{
 2977     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2978   %}
 2979   ins_pipe(pipe_slow);
 2980 %}
 2981 
 2982 instruct addD_reg(regD dst, regD src) %{
 2983   predicate(UseAVX == 0);
 2984   match(Set dst (AddD dst src));
 2985 
 2986   format %{ "addsd   $dst, $src" %}
 2987   ins_cost(150);
 2988   ins_encode %{
 2989     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2990   %}
 2991   ins_pipe(pipe_slow);
 2992 %}
 2993 
 2994 instruct addD_mem(regD dst, memory src) %{
 2995   predicate(UseAVX == 0);
 2996   match(Set dst (AddD dst (LoadD src)));
 2997 
 2998   format %{ "addsd   $dst, $src" %}
 2999   ins_cost(150);
 3000   ins_encode %{
 3001     __ addsd($dst$$XMMRegister, $src$$Address);
 3002   %}
 3003   ins_pipe(pipe_slow);
 3004 %}
 3005 
 3006 instruct addD_imm(regD dst, immD con) %{
 3007   predicate(UseAVX == 0);
 3008   match(Set dst (AddD dst con));
 3009   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3010   ins_cost(150);
 3011   ins_encode %{
 3012     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3013   %}
 3014   ins_pipe(pipe_slow);
 3015 %}
 3016 
 3017 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3018   predicate(UseAVX > 0);
 3019   match(Set dst (AddD src1 src2));
 3020 
 3021   format %{ "vaddsd  $dst, $src1, $src2" %}
 3022   ins_cost(150);
 3023   ins_encode %{
 3024     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3025   %}
 3026   ins_pipe(pipe_slow);
 3027 %}
 3028 
 3029 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3030   predicate(UseAVX > 0);
 3031   match(Set dst (AddD src1 (LoadD src2)));
 3032 
 3033   format %{ "vaddsd  $dst, $src1, $src2" %}
 3034   ins_cost(150);
 3035   ins_encode %{
 3036     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3037   %}
 3038   ins_pipe(pipe_slow);
 3039 %}
 3040 
 3041 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3042   predicate(UseAVX > 0);
 3043   match(Set dst (AddD src con));
 3044 
 3045   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3046   ins_cost(150);
 3047   ins_encode %{
 3048     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3049   %}
 3050   ins_pipe(pipe_slow);
 3051 %}
 3052 
 3053 instruct subF_reg(regF dst, regF src) %{
 3054   predicate(UseAVX == 0);
 3055   match(Set dst (SubF dst src));
 3056 
 3057   format %{ "subss   $dst, $src" %}
 3058   ins_cost(150);
 3059   ins_encode %{
 3060     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3061   %}
 3062   ins_pipe(pipe_slow);
 3063 %}
 3064 
 3065 instruct subF_mem(regF dst, memory src) %{
 3066   predicate(UseAVX == 0);
 3067   match(Set dst (SubF dst (LoadF src)));
 3068 
 3069   format %{ "subss   $dst, $src" %}
 3070   ins_cost(150);
 3071   ins_encode %{
 3072     __ subss($dst$$XMMRegister, $src$$Address);
 3073   %}
 3074   ins_pipe(pipe_slow);
 3075 %}
 3076 
 3077 instruct subF_imm(regF dst, immF con) %{
 3078   predicate(UseAVX == 0);
 3079   match(Set dst (SubF dst con));
 3080   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3081   ins_cost(150);
 3082   ins_encode %{
 3083     __ subss($dst$$XMMRegister, $constantaddress($con));
 3084   %}
 3085   ins_pipe(pipe_slow);
 3086 %}
 3087 
 3088 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3089   predicate(UseAVX > 0);
 3090   match(Set dst (SubF src1 src2));
 3091 
 3092   format %{ "vsubss  $dst, $src1, $src2" %}
 3093   ins_cost(150);
 3094   ins_encode %{
 3095     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3096   %}
 3097   ins_pipe(pipe_slow);
 3098 %}
 3099 
 3100 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3101   predicate(UseAVX > 0);
 3102   match(Set dst (SubF src1 (LoadF src2)));
 3103 
 3104   format %{ "vsubss  $dst, $src1, $src2" %}
 3105   ins_cost(150);
 3106   ins_encode %{
 3107     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3108   %}
 3109   ins_pipe(pipe_slow);
 3110 %}
 3111 
 3112 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3113   predicate(UseAVX > 0);
 3114   match(Set dst (SubF src con));
 3115 
 3116   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3117   ins_cost(150);
 3118   ins_encode %{
 3119     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3120   %}
 3121   ins_pipe(pipe_slow);
 3122 %}
 3123 
 3124 instruct subD_reg(regD dst, regD src) %{
 3125   predicate(UseAVX == 0);
 3126   match(Set dst (SubD dst src));
 3127 
 3128   format %{ "subsd   $dst, $src" %}
 3129   ins_cost(150);
 3130   ins_encode %{
 3131     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3132   %}
 3133   ins_pipe(pipe_slow);
 3134 %}
 3135 
 3136 instruct subD_mem(regD dst, memory src) %{
 3137   predicate(UseAVX == 0);
 3138   match(Set dst (SubD dst (LoadD src)));
 3139 
 3140   format %{ "subsd   $dst, $src" %}
 3141   ins_cost(150);
 3142   ins_encode %{
 3143     __ subsd($dst$$XMMRegister, $src$$Address);
 3144   %}
 3145   ins_pipe(pipe_slow);
 3146 %}
 3147 
 3148 instruct subD_imm(regD dst, immD con) %{
 3149   predicate(UseAVX == 0);
 3150   match(Set dst (SubD dst con));
 3151   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3152   ins_cost(150);
 3153   ins_encode %{
 3154     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3155   %}
 3156   ins_pipe(pipe_slow);
 3157 %}
 3158 
 3159 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3160   predicate(UseAVX > 0);
 3161   match(Set dst (SubD src1 src2));
 3162 
 3163   format %{ "vsubsd  $dst, $src1, $src2" %}
 3164   ins_cost(150);
 3165   ins_encode %{
 3166     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3167   %}
 3168   ins_pipe(pipe_slow);
 3169 %}
 3170 
 3171 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3172   predicate(UseAVX > 0);
 3173   match(Set dst (SubD src1 (LoadD src2)));
 3174 
 3175   format %{ "vsubsd  $dst, $src1, $src2" %}
 3176   ins_cost(150);
 3177   ins_encode %{
 3178     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3179   %}
 3180   ins_pipe(pipe_slow);
 3181 %}
 3182 
 3183 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3184   predicate(UseAVX > 0);
 3185   match(Set dst (SubD src con));
 3186 
 3187   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3188   ins_cost(150);
 3189   ins_encode %{
 3190     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3191   %}
 3192   ins_pipe(pipe_slow);
 3193 %}
 3194 
 3195 instruct mulF_reg(regF dst, regF src) %{
 3196   predicate(UseAVX == 0);
 3197   match(Set dst (MulF dst src));
 3198 
 3199   format %{ "mulss   $dst, $src" %}
 3200   ins_cost(150);
 3201   ins_encode %{
 3202     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3203   %}
 3204   ins_pipe(pipe_slow);
 3205 %}
 3206 
 3207 instruct mulF_mem(regF dst, memory src) %{
 3208   predicate(UseAVX == 0);
 3209   match(Set dst (MulF dst (LoadF src)));
 3210 
 3211   format %{ "mulss   $dst, $src" %}
 3212   ins_cost(150);
 3213   ins_encode %{
 3214     __ mulss($dst$$XMMRegister, $src$$Address);
 3215   %}
 3216   ins_pipe(pipe_slow);
 3217 %}
 3218 
 3219 instruct mulF_imm(regF dst, immF con) %{
 3220   predicate(UseAVX == 0);
 3221   match(Set dst (MulF dst con));
 3222   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3223   ins_cost(150);
 3224   ins_encode %{
 3225     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3226   %}
 3227   ins_pipe(pipe_slow);
 3228 %}
 3229 
 3230 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3231   predicate(UseAVX > 0);
 3232   match(Set dst (MulF src1 src2));
 3233 
 3234   format %{ "vmulss  $dst, $src1, $src2" %}
 3235   ins_cost(150);
 3236   ins_encode %{
 3237     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3238   %}
 3239   ins_pipe(pipe_slow);
 3240 %}
 3241 
 3242 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3243   predicate(UseAVX > 0);
 3244   match(Set dst (MulF src1 (LoadF src2)));
 3245 
 3246   format %{ "vmulss  $dst, $src1, $src2" %}
 3247   ins_cost(150);
 3248   ins_encode %{
 3249     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3250   %}
 3251   ins_pipe(pipe_slow);
 3252 %}
 3253 
 3254 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3255   predicate(UseAVX > 0);
 3256   match(Set dst (MulF src con));
 3257 
 3258   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3259   ins_cost(150);
 3260   ins_encode %{
 3261     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3262   %}
 3263   ins_pipe(pipe_slow);
 3264 %}
 3265 
 3266 instruct mulD_reg(regD dst, regD src) %{
 3267   predicate(UseAVX == 0);
 3268   match(Set dst (MulD dst src));
 3269 
 3270   format %{ "mulsd   $dst, $src" %}
 3271   ins_cost(150);
 3272   ins_encode %{
 3273     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3274   %}
 3275   ins_pipe(pipe_slow);
 3276 %}
 3277 
 3278 instruct mulD_mem(regD dst, memory src) %{
 3279   predicate(UseAVX == 0);
 3280   match(Set dst (MulD dst (LoadD src)));
 3281 
 3282   format %{ "mulsd   $dst, $src" %}
 3283   ins_cost(150);
 3284   ins_encode %{
 3285     __ mulsd($dst$$XMMRegister, $src$$Address);
 3286   %}
 3287   ins_pipe(pipe_slow);
 3288 %}
 3289 
 3290 instruct mulD_imm(regD dst, immD con) %{
 3291   predicate(UseAVX == 0);
 3292   match(Set dst (MulD dst con));
 3293   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3294   ins_cost(150);
 3295   ins_encode %{
 3296     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3297   %}
 3298   ins_pipe(pipe_slow);
 3299 %}
 3300 
 3301 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3302   predicate(UseAVX > 0);
 3303   match(Set dst (MulD src1 src2));
 3304 
 3305   format %{ "vmulsd  $dst, $src1, $src2" %}
 3306   ins_cost(150);
 3307   ins_encode %{
 3308     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3309   %}
 3310   ins_pipe(pipe_slow);
 3311 %}
 3312 
 3313 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3314   predicate(UseAVX > 0);
 3315   match(Set dst (MulD src1 (LoadD src2)));
 3316 
 3317   format %{ "vmulsd  $dst, $src1, $src2" %}
 3318   ins_cost(150);
 3319   ins_encode %{
 3320     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3321   %}
 3322   ins_pipe(pipe_slow);
 3323 %}
 3324 
 3325 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3326   predicate(UseAVX > 0);
 3327   match(Set dst (MulD src con));
 3328 
 3329   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3330   ins_cost(150);
 3331   ins_encode %{
 3332     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3333   %}
 3334   ins_pipe(pipe_slow);
 3335 %}
 3336 
 3337 instruct divF_reg(regF dst, regF src) %{
 3338   predicate(UseAVX == 0);
 3339   match(Set dst (DivF dst src));
 3340 
 3341   format %{ "divss   $dst, $src" %}
 3342   ins_cost(150);
 3343   ins_encode %{
 3344     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3345   %}
 3346   ins_pipe(pipe_slow);
 3347 %}
 3348 
 3349 instruct divF_mem(regF dst, memory src) %{
 3350   predicate(UseAVX == 0);
 3351   match(Set dst (DivF dst (LoadF src)));
 3352 
 3353   format %{ "divss   $dst, $src" %}
 3354   ins_cost(150);
 3355   ins_encode %{
 3356     __ divss($dst$$XMMRegister, $src$$Address);
 3357   %}
 3358   ins_pipe(pipe_slow);
 3359 %}
 3360 
 3361 instruct divF_imm(regF dst, immF con) %{
 3362   predicate(UseAVX == 0);
 3363   match(Set dst (DivF dst con));
 3364   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3365   ins_cost(150);
 3366   ins_encode %{
 3367     __ divss($dst$$XMMRegister, $constantaddress($con));
 3368   %}
 3369   ins_pipe(pipe_slow);
 3370 %}
 3371 
 3372 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3373   predicate(UseAVX > 0);
 3374   match(Set dst (DivF src1 src2));
 3375 
 3376   format %{ "vdivss  $dst, $src1, $src2" %}
 3377   ins_cost(150);
 3378   ins_encode %{
 3379     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3380   %}
 3381   ins_pipe(pipe_slow);
 3382 %}
 3383 
 3384 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3385   predicate(UseAVX > 0);
 3386   match(Set dst (DivF src1 (LoadF src2)));
 3387 
 3388   format %{ "vdivss  $dst, $src1, $src2" %}
 3389   ins_cost(150);
 3390   ins_encode %{
 3391     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3392   %}
 3393   ins_pipe(pipe_slow);
 3394 %}
 3395 
 3396 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3397   predicate(UseAVX > 0);
 3398   match(Set dst (DivF src con));
 3399 
 3400   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3401   ins_cost(150);
 3402   ins_encode %{
 3403     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3404   %}
 3405   ins_pipe(pipe_slow);
 3406 %}
 3407 
 3408 instruct divD_reg(regD dst, regD src) %{
 3409   predicate(UseAVX == 0);
 3410   match(Set dst (DivD dst src));
 3411 
 3412   format %{ "divsd   $dst, $src" %}
 3413   ins_cost(150);
 3414   ins_encode %{
 3415     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3416   %}
 3417   ins_pipe(pipe_slow);
 3418 %}
 3419 
 3420 instruct divD_mem(regD dst, memory src) %{
 3421   predicate(UseAVX == 0);
 3422   match(Set dst (DivD dst (LoadD src)));
 3423 
 3424   format %{ "divsd   $dst, $src" %}
 3425   ins_cost(150);
 3426   ins_encode %{
 3427     __ divsd($dst$$XMMRegister, $src$$Address);
 3428   %}
 3429   ins_pipe(pipe_slow);
 3430 %}
 3431 
 3432 instruct divD_imm(regD dst, immD con) %{
 3433   predicate(UseAVX == 0);
 3434   match(Set dst (DivD dst con));
 3435   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3436   ins_cost(150);
 3437   ins_encode %{
 3438     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3439   %}
 3440   ins_pipe(pipe_slow);
 3441 %}
 3442 
 3443 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3444   predicate(UseAVX > 0);
 3445   match(Set dst (DivD src1 src2));
 3446 
 3447   format %{ "vdivsd  $dst, $src1, $src2" %}
 3448   ins_cost(150);
 3449   ins_encode %{
 3450     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3451   %}
 3452   ins_pipe(pipe_slow);
 3453 %}
 3454 
 3455 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3456   predicate(UseAVX > 0);
 3457   match(Set dst (DivD src1 (LoadD src2)));
 3458 
 3459   format %{ "vdivsd  $dst, $src1, $src2" %}
 3460   ins_cost(150);
 3461   ins_encode %{
 3462     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3463   %}
 3464   ins_pipe(pipe_slow);
 3465 %}
 3466 
 3467 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3468   predicate(UseAVX > 0);
 3469   match(Set dst (DivD src con));
 3470 
 3471   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3472   ins_cost(150);
 3473   ins_encode %{
 3474     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3475   %}
 3476   ins_pipe(pipe_slow);
 3477 %}
 3478 
 3479 instruct absF_reg(regF dst) %{
 3480   predicate(UseAVX == 0);
 3481   match(Set dst (AbsF dst));
 3482   ins_cost(150);
 3483   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3484   ins_encode %{
 3485     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3486   %}
 3487   ins_pipe(pipe_slow);
 3488 %}
 3489 
 3490 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3491   predicate(UseAVX > 0);
 3492   match(Set dst (AbsF src));
 3493   ins_cost(150);
 3494   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3495   ins_encode %{
 3496     int vlen_enc = Assembler::AVX_128bit;
 3497     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3498               ExternalAddress(float_signmask()), vlen_enc);
 3499   %}
 3500   ins_pipe(pipe_slow);
 3501 %}
 3502 
 3503 instruct absD_reg(regD dst) %{
 3504   predicate(UseAVX == 0);
 3505   match(Set dst (AbsD dst));
 3506   ins_cost(150);
 3507   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3508             "# abs double by sign masking" %}
 3509   ins_encode %{
 3510     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3511   %}
 3512   ins_pipe(pipe_slow);
 3513 %}
 3514 
 3515 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3516   predicate(UseAVX > 0);
 3517   match(Set dst (AbsD src));
 3518   ins_cost(150);
 3519   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3520             "# abs double by sign masking" %}
 3521   ins_encode %{
 3522     int vlen_enc = Assembler::AVX_128bit;
 3523     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3524               ExternalAddress(double_signmask()), vlen_enc);
 3525   %}
 3526   ins_pipe(pipe_slow);
 3527 %}
 3528 
 3529 instruct negF_reg(regF dst) %{
 3530   predicate(UseAVX == 0);
 3531   match(Set dst (NegF dst));
 3532   ins_cost(150);
 3533   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3534   ins_encode %{
 3535     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3536   %}
 3537   ins_pipe(pipe_slow);
 3538 %}
 3539 
 3540 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3541   predicate(UseAVX > 0);
 3542   match(Set dst (NegF src));
 3543   ins_cost(150);
 3544   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3545   ins_encode %{
 3546     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3547                  ExternalAddress(float_signflip()));
 3548   %}
 3549   ins_pipe(pipe_slow);
 3550 %}
 3551 
 3552 instruct negD_reg(regD dst) %{
 3553   predicate(UseAVX == 0);
 3554   match(Set dst (NegD dst));
 3555   ins_cost(150);
 3556   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3557             "# neg double by sign flipping" %}
 3558   ins_encode %{
 3559     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3560   %}
 3561   ins_pipe(pipe_slow);
 3562 %}
 3563 
 3564 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3565   predicate(UseAVX > 0);
 3566   match(Set dst (NegD src));
 3567   ins_cost(150);
 3568   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3569             "# neg double by sign flipping" %}
 3570   ins_encode %{
 3571     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3572                  ExternalAddress(double_signflip()));
 3573   %}
 3574   ins_pipe(pipe_slow);
 3575 %}
 3576 
 3577 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3578 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3579 instruct sqrtF_reg(regF dst) %{
 3580   match(Set dst (SqrtF dst));
 3581   format %{ "sqrtss  $dst, $dst" %}
 3582   ins_encode %{
 3583     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3584   %}
 3585   ins_pipe(pipe_slow);
 3586 %}
 3587 
 3588 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3589 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3590 instruct sqrtD_reg(regD dst) %{
 3591   match(Set dst (SqrtD dst));
 3592   format %{ "sqrtsd  $dst, $dst" %}
 3593   ins_encode %{
 3594     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3595   %}
 3596   ins_pipe(pipe_slow);
 3597 %}
 3598 
 3599 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3600   effect(TEMP tmp);
 3601   match(Set dst (ConvF2HF src));
 3602   ins_cost(125);
 3603   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3604   ins_encode %{
 3605     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3606   %}
 3607   ins_pipe( pipe_slow );
 3608 %}
 3609 
 3610 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3611   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3612   effect(TEMP ktmp, TEMP rtmp);
 3613   match(Set mem (StoreC mem (ConvF2HF src)));
 3614   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3615   ins_encode %{
 3616     __ movl($rtmp$$Register, 0x1);
 3617     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3618     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3619   %}
 3620   ins_pipe( pipe_slow );
 3621 %}
 3622 
 3623 instruct vconvF2HF(vec dst, vec src) %{
 3624   match(Set dst (VectorCastF2HF src));
 3625   format %{ "vector_conv_F2HF $dst $src" %}
 3626   ins_encode %{
 3627     int vlen_enc = vector_length_encoding(this, $src);
 3628     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3629   %}
 3630   ins_pipe( pipe_slow );
 3631 %}
 3632 
 3633 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3634   predicate(n->as_StoreVector()->memory_size() >= 16);
 3635   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3636   format %{ "vcvtps2ph $mem,$src" %}
 3637   ins_encode %{
 3638     int vlen_enc = vector_length_encoding(this, $src);
 3639     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3640   %}
 3641   ins_pipe( pipe_slow );
 3642 %}
 3643 
 3644 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3645   match(Set dst (ConvHF2F src));
 3646   format %{ "vcvtph2ps $dst,$src" %}
 3647   ins_encode %{
 3648     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3649   %}
 3650   ins_pipe( pipe_slow );
 3651 %}
 3652 
 3653 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3654   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3655   format %{ "vcvtph2ps $dst,$mem" %}
 3656   ins_encode %{
 3657     int vlen_enc = vector_length_encoding(this);
 3658     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3659   %}
 3660   ins_pipe( pipe_slow );
 3661 %}
 3662 
 3663 instruct vconvHF2F(vec dst, vec src) %{
 3664   match(Set dst (VectorCastHF2F src));
 3665   ins_cost(125);
 3666   format %{ "vector_conv_HF2F $dst,$src" %}
 3667   ins_encode %{
 3668     int vlen_enc = vector_length_encoding(this);
 3669     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3670   %}
 3671   ins_pipe( pipe_slow );
 3672 %}
 3673 
 3674 // ---------------------------------------- VectorReinterpret ------------------------------------
 3675 instruct reinterpret_mask(kReg dst) %{
 3676   predicate(n->bottom_type()->isa_vectmask() &&
 3677             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3678   match(Set dst (VectorReinterpret dst));
 3679   ins_cost(125);
 3680   format %{ "vector_reinterpret $dst\t!" %}
 3681   ins_encode %{
 3682     // empty
 3683   %}
 3684   ins_pipe( pipe_slow );
 3685 %}
 3686 
 3687 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3688   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3689             n->bottom_type()->isa_vectmask() &&
 3690             n->in(1)->bottom_type()->isa_vectmask() &&
 3691             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3692             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3693   match(Set dst (VectorReinterpret src));
 3694   effect(TEMP xtmp);
 3695   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3696   ins_encode %{
 3697      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3698      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3699      assert(src_sz == dst_sz , "src and dst size mismatch");
 3700      int vlen_enc = vector_length_encoding(src_sz);
 3701      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3702      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3703   %}
 3704   ins_pipe( pipe_slow );
 3705 %}
 3706 
 3707 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3708   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3709             n->bottom_type()->isa_vectmask() &&
 3710             n->in(1)->bottom_type()->isa_vectmask() &&
 3711             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3712              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3713             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3714   match(Set dst (VectorReinterpret src));
 3715   effect(TEMP xtmp);
 3716   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3717   ins_encode %{
 3718      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3719      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3720      assert(src_sz == dst_sz , "src and dst size mismatch");
 3721      int vlen_enc = vector_length_encoding(src_sz);
 3722      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3723      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3729   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3730             n->bottom_type()->isa_vectmask() &&
 3731             n->in(1)->bottom_type()->isa_vectmask() &&
 3732             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3733              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3734             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3735   match(Set dst (VectorReinterpret src));
 3736   effect(TEMP xtmp);
 3737   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3738   ins_encode %{
 3739      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3740      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3741      assert(src_sz == dst_sz , "src and dst size mismatch");
 3742      int vlen_enc = vector_length_encoding(src_sz);
 3743      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3744      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3745   %}
 3746   ins_pipe( pipe_slow );
 3747 %}
 3748 
 3749 instruct reinterpret(vec dst) %{
 3750   predicate(!n->bottom_type()->isa_vectmask() &&
 3751             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3752   match(Set dst (VectorReinterpret dst));
 3753   ins_cost(125);
 3754   format %{ "vector_reinterpret $dst\t!" %}
 3755   ins_encode %{
 3756     // empty
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 instruct reinterpret_expand(vec dst, vec src) %{
 3762   predicate(UseAVX == 0 &&
 3763             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3764   match(Set dst (VectorReinterpret src));
 3765   ins_cost(125);
 3766   effect(TEMP dst);
 3767   format %{ "vector_reinterpret_expand $dst,$src" %}
 3768   ins_encode %{
 3769     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3770     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3771 
 3772     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3773     if (src_vlen_in_bytes == 4) {
 3774       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3775     } else {
 3776       assert(src_vlen_in_bytes == 8, "");
 3777       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3778     }
 3779     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3780   %}
 3781   ins_pipe( pipe_slow );
 3782 %}
 3783 
 3784 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3785   predicate(UseAVX > 0 &&
 3786             !n->bottom_type()->isa_vectmask() &&
 3787             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3788             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3789   match(Set dst (VectorReinterpret src));
 3790   ins_cost(125);
 3791   format %{ "vector_reinterpret_expand $dst,$src" %}
 3792   ins_encode %{
 3793     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3794   %}
 3795   ins_pipe( pipe_slow );
 3796 %}
 3797 
 3798 
 3799 instruct vreinterpret_expand(legVec dst, vec src) %{
 3800   predicate(UseAVX > 0 &&
 3801             !n->bottom_type()->isa_vectmask() &&
 3802             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3803             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3804   match(Set dst (VectorReinterpret src));
 3805   ins_cost(125);
 3806   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3807   ins_encode %{
 3808     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3809       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3810       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3811       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3812       default: ShouldNotReachHere();
 3813     }
 3814   %}
 3815   ins_pipe( pipe_slow );
 3816 %}
 3817 
 3818 instruct reinterpret_shrink(vec dst, legVec src) %{
 3819   predicate(!n->bottom_type()->isa_vectmask() &&
 3820             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3821   match(Set dst (VectorReinterpret src));
 3822   ins_cost(125);
 3823   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3824   ins_encode %{
 3825     switch (Matcher::vector_length_in_bytes(this)) {
 3826       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3827       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3828       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3829       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3830       default: ShouldNotReachHere();
 3831     }
 3832   %}
 3833   ins_pipe( pipe_slow );
 3834 %}
 3835 
 3836 // ----------------------------------------------------------------------------------------------------
 3837 
 3838 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3839   match(Set dst (RoundDoubleMode src rmode));
 3840   format %{ "roundsd $dst,$src" %}
 3841   ins_cost(150);
 3842   ins_encode %{
 3843     assert(UseSSE >= 4, "required");
 3844     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3845       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3846     }
 3847     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3848   %}
 3849   ins_pipe(pipe_slow);
 3850 %}
 3851 
 3852 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3853   match(Set dst (RoundDoubleMode con rmode));
 3854   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3855   ins_cost(150);
 3856   ins_encode %{
 3857     assert(UseSSE >= 4, "required");
 3858     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3859   %}
 3860   ins_pipe(pipe_slow);
 3861 %}
 3862 
 3863 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3864   predicate(Matcher::vector_length(n) < 8);
 3865   match(Set dst (RoundDoubleModeV src rmode));
 3866   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3867   ins_encode %{
 3868     assert(UseAVX > 0, "required");
 3869     int vlen_enc = vector_length_encoding(this);
 3870     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3871   %}
 3872   ins_pipe( pipe_slow );
 3873 %}
 3874 
 3875 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3876   predicate(Matcher::vector_length(n) == 8);
 3877   match(Set dst (RoundDoubleModeV src rmode));
 3878   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3879   ins_encode %{
 3880     assert(UseAVX > 2, "required");
 3881     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3882   %}
 3883   ins_pipe( pipe_slow );
 3884 %}
 3885 
 3886 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3887   predicate(Matcher::vector_length(n) < 8);
 3888   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3889   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3890   ins_encode %{
 3891     assert(UseAVX > 0, "required");
 3892     int vlen_enc = vector_length_encoding(this);
 3893     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3894   %}
 3895   ins_pipe( pipe_slow );
 3896 %}
 3897 
 3898 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3899   predicate(Matcher::vector_length(n) == 8);
 3900   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3901   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3902   ins_encode %{
 3903     assert(UseAVX > 2, "required");
 3904     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3905   %}
 3906   ins_pipe( pipe_slow );
 3907 %}
 3908 
 3909 instruct onspinwait() %{
 3910   match(OnSpinWait);
 3911   ins_cost(200);
 3912 
 3913   format %{
 3914     $$template
 3915     $$emit$$"pause\t! membar_onspinwait"
 3916   %}
 3917   ins_encode %{
 3918     __ pause();
 3919   %}
 3920   ins_pipe(pipe_slow);
 3921 %}
 3922 
 3923 // a * b + c
 3924 instruct fmaD_reg(regD a, regD b, regD c) %{
 3925   match(Set c (FmaD  c (Binary a b)));
 3926   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3927   ins_cost(150);
 3928   ins_encode %{
 3929     assert(UseFMA, "Needs FMA instructions support.");
 3930     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3931   %}
 3932   ins_pipe( pipe_slow );
 3933 %}
 3934 
 3935 // a * b + c
 3936 instruct fmaF_reg(regF a, regF b, regF c) %{
 3937   match(Set c (FmaF  c (Binary a b)));
 3938   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3939   ins_cost(150);
 3940   ins_encode %{
 3941     assert(UseFMA, "Needs FMA instructions support.");
 3942     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3943   %}
 3944   ins_pipe( pipe_slow );
 3945 %}
 3946 
 3947 // ====================VECTOR INSTRUCTIONS=====================================
 3948 
 3949 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3950 instruct MoveVec2Leg(legVec dst, vec src) %{
 3951   match(Set dst src);
 3952   format %{ "" %}
 3953   ins_encode %{
 3954     ShouldNotReachHere();
 3955   %}
 3956   ins_pipe( fpu_reg_reg );
 3957 %}
 3958 
 3959 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3960   match(Set dst src);
 3961   format %{ "" %}
 3962   ins_encode %{
 3963     ShouldNotReachHere();
 3964   %}
 3965   ins_pipe( fpu_reg_reg );
 3966 %}
 3967 
 3968 // ============================================================================
 3969 
 3970 // Load vectors generic operand pattern
 3971 instruct loadV(vec dst, memory mem) %{
 3972   match(Set dst (LoadVector mem));
 3973   ins_cost(125);
 3974   format %{ "load_vector $dst,$mem" %}
 3975   ins_encode %{
 3976     BasicType bt = Matcher::vector_element_basic_type(this);
 3977     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3978   %}
 3979   ins_pipe( pipe_slow );
 3980 %}
 3981 
 3982 // Store vectors generic operand pattern.
 3983 instruct storeV(memory mem, vec src) %{
 3984   match(Set mem (StoreVector mem src));
 3985   ins_cost(145);
 3986   format %{ "store_vector $mem,$src\n\t" %}
 3987   ins_encode %{
 3988     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3989       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3990       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3991       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3992       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3993       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3994       default: ShouldNotReachHere();
 3995     }
 3996   %}
 3997   ins_pipe( pipe_slow );
 3998 %}
 3999 
 4000 // ---------------------------------------- Gather ------------------------------------
 4001 
 4002 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
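//
// Roughly, an unmasked gather is the scalar loop below (a sketch, with the base
// address taken from $mem and one index per lane from $idx; masked variants
// additionally skip or zero lanes whose mask bit is clear):
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = base[idx[i]];
//   }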
 4003 
 4004 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4005   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4006             Matcher::vector_length_in_bytes(n) <= 32);
 4007   match(Set dst (LoadVectorGather mem idx));
 4008   effect(TEMP dst, TEMP tmp, TEMP mask);
 4009   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4010   ins_encode %{
 4011     int vlen_enc = vector_length_encoding(this);
 4012     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4013     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4014     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4015     __ lea($tmp$$Register, $mem$$Address);
 4016     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4017   %}
 4018   ins_pipe( pipe_slow );
 4019 %}
 4020 
 4021 
 4022 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4023   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4024             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4025   match(Set dst (LoadVectorGather mem idx));
 4026   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4027   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 4028   ins_encode %{
 4029     int vlen_enc = vector_length_encoding(this);
 4030     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4031     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4032     __ lea($tmp$$Register, $mem$$Address);
 4033     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4034   %}
 4035   ins_pipe( pipe_slow );
 4036 %}
 4037 
 4038 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4039   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4040             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4041   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4042   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4043   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4044   ins_encode %{
 4045     assert(UseAVX > 2, "sanity");
 4046     int vlen_enc = vector_length_encoding(this);
 4047     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4048     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary.
 4051     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4052     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4053     __ lea($tmp$$Register, $mem$$Address);
 4054     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4055   %}
 4056   ins_pipe( pipe_slow );
 4057 %}
 4058 
 4059 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4060   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4061   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4062   effect(TEMP tmp, TEMP rtmp);
 4063   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4064   ins_encode %{
 4065     int vlen_enc = vector_length_encoding(this);
 4066     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4067     __ lea($tmp$$Register, $mem$$Address);
 4068     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4069   %}
 4070   ins_pipe( pipe_slow );
 4071 %}
 4072 
 4073 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4074                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4075   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4076   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4077   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4078   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4079   ins_encode %{
 4080     int vlen_enc = vector_length_encoding(this);
 4081     int vector_len = Matcher::vector_length(this);
 4082     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4083     __ lea($tmp$$Register, $mem$$Address);
 4084     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4085     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4086                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4087   %}
 4088   ins_pipe( pipe_slow );
 4089 %}
 4090 
 4091 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4092   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4093   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4094   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4095   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4096   ins_encode %{
 4097     int vlen_enc = vector_length_encoding(this);
 4098     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4099     __ lea($tmp$$Register, $mem$$Address);
 4100     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4101   %}
 4102   ins_pipe( pipe_slow );
 4103 %}
 4104 
 4105 
 4106 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4107                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4108   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4109   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4110   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4111   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4112   ins_encode %{
 4113     int vlen_enc = vector_length_encoding(this);
 4114     int vector_len = Matcher::vector_length(this);
 4115     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4116     __ lea($tmp$$Register, $mem$$Address);
 4117     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4118     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4119                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4120   %}
 4121   ins_pipe( pipe_slow );
 4122 %}
 4123 
 4124 
 4125 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4126   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4127   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4128   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4129   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4130   ins_encode %{
 4131     int vlen_enc = vector_length_encoding(this);
 4132     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4133     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4134     __ lea($tmp$$Register, $mem$$Address);
 4135     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4136     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4137   %}
 4138   ins_pipe( pipe_slow );
 4139 %}
 4140 
 4141 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4142                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4143   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4144   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4145   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4146   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4147   ins_encode %{
 4148     int vlen_enc = vector_length_encoding(this);
 4149     int vector_len = Matcher::vector_length(this);
 4150     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4151     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4152     __ lea($tmp$$Register, $mem$$Address);
 4153     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4154     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4155     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4156                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4157   %}
 4158   ins_pipe( pipe_slow );
 4159 %}
 4160 
 4161 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4162   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4163   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4164   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4165   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4166   ins_encode %{
 4167     int vlen_enc = vector_length_encoding(this);
 4168     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4169     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4170     __ lea($tmp$$Register, $mem$$Address);
 4171     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4172     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4173                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4174   %}
 4175   ins_pipe( pipe_slow );
 4176 %}
 4177 
 4178 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4179                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4180   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4181   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4182   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4183   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4184   ins_encode %{
 4185     int vlen_enc = vector_length_encoding(this);
 4186     int vector_len = Matcher::vector_length(this);
 4187     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4188     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4189     __ lea($tmp$$Register, $mem$$Address);
 4190     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4191     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4192     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4193                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4194   %}
 4195   ins_pipe( pipe_slow );
 4196 %}
 4197 
 4198 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4199   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4200   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4201   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4202   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4203   ins_encode %{
 4204     int vlen_enc = vector_length_encoding(this);
 4205     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4206     __ lea($tmp$$Register, $mem$$Address);
 4207     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4208     if (elem_bt == T_SHORT) {
 4209       __ movl($mask_idx$$Register, 0x55555555);
 4210       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4211     }
 4212     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4213     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4214   %}
 4215   ins_pipe( pipe_slow );
 4216 %}
 4217 
 4218 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4219                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4220   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4221   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4222   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4223   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4224   ins_encode %{
 4225     int vlen_enc = vector_length_encoding(this);
 4226     int vector_len = Matcher::vector_length(this);
 4227     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4228     __ lea($tmp$$Register, $mem$$Address);
 4229     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4230     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4231     if (elem_bt == T_SHORT) {
 4232       __ movl($mask_idx$$Register, 0x55555555);
 4233       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4234     }
 4235     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4236     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4237                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4238   %}
 4239   ins_pipe( pipe_slow );
 4240 %}
 4241 
 4242 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4243   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4244   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4245   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4246   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4247   ins_encode %{
 4248     int vlen_enc = vector_length_encoding(this);
 4249     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4250     __ lea($tmp$$Register, $mem$$Address);
 4251     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4252     if (elem_bt == T_SHORT) {
 4253       __ movl($mask_idx$$Register, 0x55555555);
 4254       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4255     }
 4256     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4257     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4258                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4259   %}
 4260   ins_pipe( pipe_slow );
 4261 %}
 4262 
 4263 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4264                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4265   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4266   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4267   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4268   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4269   ins_encode %{
 4270     int vlen_enc = vector_length_encoding(this);
 4271     int vector_len = Matcher::vector_length(this);
 4272     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4274     __ lea($tmp$$Register, $mem$$Address);
 4275     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4276     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4277     if (elem_bt == T_SHORT) {
 4278       __ movl($mask_idx$$Register, 0x55555555);
 4279       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4280     }
 4281     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4282     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4283                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4284   %}
 4285   ins_pipe( pipe_slow );
 4286 %}
 4287 
 4288 // ====================Scatter=======================================
 4289 
 4290 // Scatter INT, LONG, FLOAT, DOUBLE
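//
// Roughly, a scatter is the scalar loop below (masked variants skip lanes whose
// mask bit is clear):
//
//   for (int i = 0; i < vlen; i++) {
//     base[idx[i]] = src[i];
//   }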
 4291 
 4292 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4293   predicate(UseAVX > 2);
 4294   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4295   effect(TEMP tmp, TEMP ktmp);
 4296   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
 4297   ins_encode %{
 4298     int vlen_enc = vector_length_encoding(this, $src);
 4299     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4300 
 4301     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4302     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4303 
 4304     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4305     __ lea($tmp$$Register, $mem$$Address);
 4306     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4307   %}
 4308   ins_pipe( pipe_slow );
 4309 %}
 4310 
 4311 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4312   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4313   effect(TEMP tmp, TEMP ktmp);
 4314   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4315   ins_encode %{
 4316     int vlen_enc = vector_length_encoding(this, $src);
 4317     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4318     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4319     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary.
 4322     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4323     __ lea($tmp$$Register, $mem$$Address);
 4324     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4325   %}
 4326   ins_pipe( pipe_slow );
 4327 %}
 4328 
 4329 // ====================REPLICATE=======================================
 4330 
 4331 // Replicate byte scalar to be vector
 4332 instruct vReplB_reg(vec dst, rRegI src) %{
 4333   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4334   match(Set dst (Replicate src));
 4335   format %{ "replicateB $dst,$src" %}
 4336   ins_encode %{
 4337     uint vlen = Matcher::vector_length(this);
 4338     if (UseAVX >= 2) {
 4339       int vlen_enc = vector_length_encoding(this);
 4340       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4341         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4342         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4343       } else {
 4344         __ movdl($dst$$XMMRegister, $src$$Register);
 4345         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4346       }
 4347     } else {
      assert(UseAVX < 2, "");
 4349       __ movdl($dst$$XMMRegister, $src$$Register);
 4350       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4351       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4352       if (vlen >= 16) {
 4353         assert(vlen == 16, "");
 4354         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4355       }
 4356     }
 4357   %}
 4358   ins_pipe( pipe_slow );
 4359 %}
 4360 
 4361 instruct ReplB_mem(vec dst, memory mem) %{
 4362   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4363   match(Set dst (Replicate (LoadB mem)));
 4364   format %{ "replicateB $dst,$mem" %}
 4365   ins_encode %{
 4366     int vlen_enc = vector_length_encoding(this);
 4367     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4368   %}
 4369   ins_pipe( pipe_slow );
 4370 %}
 4371 
 4372 // ====================ReplicateS=======================================
 4373 
 4374 instruct vReplS_reg(vec dst, rRegI src) %{
 4375   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4376   match(Set dst (Replicate src));
 4377   format %{ "replicateS $dst,$src" %}
 4378   ins_encode %{
 4379     uint vlen = Matcher::vector_length(this);
 4380     int vlen_enc = vector_length_encoding(this);
 4381     if (UseAVX >= 2) {
 4382       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4383         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4384         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4385       } else {
 4386         __ movdl($dst$$XMMRegister, $src$$Register);
 4387         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4388       }
 4389     } else {
 4390       assert(UseAVX < 2, "");
 4391       __ movdl($dst$$XMMRegister, $src$$Register);
 4392       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4393       if (vlen >= 8) {
 4394         assert(vlen == 8, "");
 4395         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4396       }
 4397     }
 4398   %}
 4399   ins_pipe( pipe_slow );
 4400 %}
 4401 
 4402 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4403   match(Set dst (Replicate con));
 4404   effect(TEMP rtmp);
 4405   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4406   ins_encode %{
 4407     int vlen_enc = vector_length_encoding(this);
 4408     BasicType bt = Matcher::vector_element_basic_type(this);
 4409     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4410     __ movl($rtmp$$Register, $con$$constant);
 4411     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 
 4416 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4417   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4418   match(Set dst (Replicate src));
 4419   effect(TEMP rtmp);
 4420   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4421   ins_encode %{
 4422     int vlen_enc = vector_length_encoding(this);
 4423     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4424     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplS_mem(vec dst, memory mem) %{
 4430   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4431   match(Set dst (Replicate (LoadS mem)));
 4432   format %{ "replicateS $dst,$mem" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4436   %}
 4437   ins_pipe( pipe_slow );
 4438 %}
 4439 
 4440 // ====================ReplicateI=======================================
 4441 
 4442 instruct ReplI_reg(vec dst, rRegI src) %{
 4443   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4444   match(Set dst (Replicate src));
 4445   format %{ "replicateI $dst,$src" %}
 4446   ins_encode %{
 4447     uint vlen = Matcher::vector_length(this);
 4448     int vlen_enc = vector_length_encoding(this);
 4449     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4450       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4451     } else if (VM_Version::supports_avx2()) {
 4452       __ movdl($dst$$XMMRegister, $src$$Register);
 4453       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4454     } else {
 4455       __ movdl($dst$$XMMRegister, $src$$Register);
 4456       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4457     }
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 instruct ReplI_mem(vec dst, memory mem) %{
 4463   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4464   match(Set dst (Replicate (LoadI mem)));
 4465   format %{ "replicateI $dst,$mem" %}
 4466   ins_encode %{
 4467     int vlen_enc = vector_length_encoding(this);
 4468     if (VM_Version::supports_avx2()) {
 4469       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4470     } else if (VM_Version::supports_avx()) {
 4471       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4472     } else {
 4473       __ movdl($dst$$XMMRegister, $mem$$Address);
 4474       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4475     }
 4476   %}
 4477   ins_pipe( pipe_slow );
 4478 %}
 4479 
 4480 instruct ReplI_imm(vec dst, immI con) %{
 4481   predicate(Matcher::is_non_long_integral_vector(n));
 4482   match(Set dst (Replicate con));
 4483   format %{ "replicateI $dst,$con" %}
 4484   ins_encode %{
 4485     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4486                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4487                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4488     BasicType bt = Matcher::vector_element_basic_type(this);
 4489     int vlen = Matcher::vector_length_in_bytes(this);
 4490     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4491   %}
 4492   ins_pipe( pipe_slow );
 4493 %}
 4494 
 4495 // Replicate scalar zero to be vector
 4496 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4497   predicate(Matcher::is_non_long_integral_vector(n));
 4498   match(Set dst (Replicate zero));
 4499   format %{ "replicateI $dst,$zero" %}
 4500   ins_encode %{
 4501     int vlen_enc = vector_length_encoding(this);
 4502     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4503       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4504     } else {
 4505       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4506     }
 4507   %}
 4508   ins_pipe( fpu_reg_reg );
 4509 %}
 4510 
 4511 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4512   predicate(Matcher::is_non_long_integral_vector(n));
 4513   match(Set dst (Replicate con));
 4514   format %{ "vallones $dst" %}
 4515   ins_encode %{
 4516     int vector_len = vector_length_encoding(this);
 4517     __ vallones($dst$$XMMRegister, vector_len);
 4518   %}
 4519   ins_pipe( pipe_slow );
 4520 %}
 4521 
 4522 // ====================ReplicateL=======================================
 4523 
 4524 // Replicate long (8 byte) scalar to be vector
 4525 instruct ReplL_reg(vec dst, rRegL src) %{
 4526   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4527   match(Set dst (Replicate src));
 4528   format %{ "replicateL $dst,$src" %}
 4529   ins_encode %{
 4530     int vlen = Matcher::vector_length(this);
 4531     int vlen_enc = vector_length_encoding(this);
 4532     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4533       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4534     } else if (VM_Version::supports_avx2()) {
 4535       __ movdq($dst$$XMMRegister, $src$$Register);
 4536       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4537     } else {
 4538       __ movdq($dst$$XMMRegister, $src$$Register);
 4539       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4540     }
 4541   %}
 4542   ins_pipe( pipe_slow );
 4543 %}
 4544 
 4545 instruct ReplL_mem(vec dst, memory mem) %{
 4546   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4547   match(Set dst (Replicate (LoadL mem)));
 4548   format %{ "replicateL $dst,$mem" %}
 4549   ins_encode %{
 4550     int vlen_enc = vector_length_encoding(this);
 4551     if (VM_Version::supports_avx2()) {
 4552       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4553     } else if (VM_Version::supports_sse3()) {
 4554       __ movddup($dst$$XMMRegister, $mem$$Address);
 4555     } else {
 4556       __ movq($dst$$XMMRegister, $mem$$Address);
 4557       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4558     }
 4559   %}
 4560   ins_pipe( pipe_slow );
 4561 %}
 4562 
 4563 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4564 instruct ReplL_imm(vec dst, immL con) %{
 4565   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4566   match(Set dst (Replicate con));
 4567   format %{ "replicateL $dst,$con" %}
 4568   ins_encode %{
 4569     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4570     int vlen = Matcher::vector_length_in_bytes(this);
 4571     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4572   %}
 4573   ins_pipe( pipe_slow );
 4574 %}
 4575 
 4576 instruct ReplL_zero(vec dst, immL0 zero) %{
 4577   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4578   match(Set dst (Replicate zero));
 4579   format %{ "replicateL $dst,$zero" %}
 4580   ins_encode %{
 4581     int vlen_enc = vector_length_encoding(this);
 4582     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4583       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4584     } else {
 4585       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4586     }
 4587   %}
 4588   ins_pipe( fpu_reg_reg );
 4589 %}
 4590 
 4591 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4592   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4593   match(Set dst (Replicate con));
 4594   format %{ "vallones $dst" %}
 4595   ins_encode %{
 4596     int vector_len = vector_length_encoding(this);
 4597     __ vallones($dst$$XMMRegister, vector_len);
 4598   %}
 4599   ins_pipe( pipe_slow );
 4600 %}
 4601 
 4602 // ====================ReplicateF=======================================
 4603 
 4604 instruct vReplF_reg(vec dst, vlRegF src) %{
 4605   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4606   match(Set dst (Replicate src));
 4607   format %{ "replicateF $dst,$src" %}
 4608   ins_encode %{
 4609     uint vlen = Matcher::vector_length(this);
 4610     int vlen_enc = vector_length_encoding(this);
 4611     if (vlen <= 4) {
 4612       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4613     } else if (VM_Version::supports_avx2()) {
 4614       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4615     } else {
 4616       assert(vlen == 8, "sanity");
 4617       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4618       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4619     }
 4620   %}
 4621   ins_pipe( pipe_slow );
 4622 %}
 4623 
 4624 instruct ReplF_reg(vec dst, vlRegF src) %{
 4625   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4626   match(Set dst (Replicate src));
 4627   format %{ "replicateF $dst,$src" %}
 4628   ins_encode %{
 4629     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4630   %}
 4631   ins_pipe( pipe_slow );
 4632 %}
 4633 
 4634 instruct ReplF_mem(vec dst, memory mem) %{
 4635   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4636   match(Set dst (Replicate (LoadF mem)));
 4637   format %{ "replicateF $dst,$mem" %}
 4638   ins_encode %{
 4639     int vlen_enc = vector_length_encoding(this);
 4640     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4641   %}
 4642   ins_pipe( pipe_slow );
 4643 %}
 4644 
 4645 // Replicate float scalar immediate to be vector by loading from const table.
 4646 instruct ReplF_imm(vec dst, immF con) %{
 4647   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4648   match(Set dst (Replicate con));
 4649   format %{ "replicateF $dst,$con" %}
 4650   ins_encode %{
 4651     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4652                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4653     int vlen = Matcher::vector_length_in_bytes(this);
 4654     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4655   %}
 4656   ins_pipe( pipe_slow );
 4657 %}
 4658 
 4659 instruct ReplF_zero(vec dst, immF0 zero) %{
 4660   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4661   match(Set dst (Replicate zero));
 4662   format %{ "replicateF $dst,$zero" %}
 4663   ins_encode %{
 4664     int vlen_enc = vector_length_encoding(this);
 4665     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4666       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4667     } else {
 4668       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4669     }
 4670   %}
 4671   ins_pipe( fpu_reg_reg );
 4672 %}
 4673 
 4674 // ====================ReplicateD=======================================
 4675 
 4676 // Replicate double (8 bytes) scalar to be vector
 4677 instruct vReplD_reg(vec dst, vlRegD src) %{
 4678   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4679   match(Set dst (Replicate src));
 4680   format %{ "replicateD $dst,$src" %}
 4681   ins_encode %{
 4682     uint vlen = Matcher::vector_length(this);
 4683     int vlen_enc = vector_length_encoding(this);
 4684     if (vlen <= 2) {
 4685       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4686     } else if (VM_Version::supports_avx2()) {
 4687       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4688     } else {
 4689       assert(vlen == 4, "sanity");
 4690       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4691       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4692     }
 4693   %}
 4694   ins_pipe( pipe_slow );
 4695 %}
 4696 
 4697 instruct ReplD_reg(vec dst, vlRegD src) %{
 4698   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4699   match(Set dst (Replicate src));
 4700   format %{ "replicateD $dst,$src" %}
 4701   ins_encode %{
 4702     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4703   %}
 4704   ins_pipe( pipe_slow );
 4705 %}
 4706 
 4707 instruct ReplD_mem(vec dst, memory mem) %{
 4708   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4709   match(Set dst (Replicate (LoadD mem)));
 4710   format %{ "replicateD $dst,$mem" %}
 4711   ins_encode %{
 4712     if (Matcher::vector_length(this) >= 4) {
 4713       int vlen_enc = vector_length_encoding(this);
 4714       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4715     } else {
 4716       __ movddup($dst$$XMMRegister, $mem$$Address);
 4717     }
 4718   %}
 4719   ins_pipe( pipe_slow );
 4720 %}
 4721 
 4722 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4723 instruct ReplD_imm(vec dst, immD con) %{
 4724   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4725   match(Set dst (Replicate con));
 4726   format %{ "replicateD $dst,$con" %}
 4727   ins_encode %{
 4728     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4729     int vlen = Matcher::vector_length_in_bytes(this);
 4730     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4731   %}
 4732   ins_pipe( pipe_slow );
 4733 %}
 4734 
 4735 instruct ReplD_zero(vec dst, immD0 zero) %{
 4736   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4737   match(Set dst (Replicate zero));
 4738   format %{ "replicateD $dst,$zero" %}
 4739   ins_encode %{
 4740     int vlen_enc = vector_length_encoding(this);
 4741     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4742       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4743     } else {
 4744       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4745     }
 4746   %}
 4747   ins_pipe( fpu_reg_reg );
 4748 %}
 4749 
 4750 // ====================VECTOR INSERT=======================================
 4751 
 4752 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4753   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4754   match(Set dst (VectorInsert (Binary dst val) idx));
 4755   format %{ "vector_insert $dst,$val,$idx" %}
 4756   ins_encode %{
 4757     assert(UseSSE >= 4, "required");
 4758     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4759 
 4760     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4761 
 4762     assert(is_integral_type(elem_bt), "");
 4763     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4764 
 4765     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4766   %}
 4767   ins_pipe( pipe_slow );
 4768 %}
 4769 
 4770 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4771   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4772   match(Set dst (VectorInsert (Binary src val) idx));
 4773   effect(TEMP vtmp);
 4774   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4775   ins_encode %{
 4776     int vlen_enc = Assembler::AVX_256bit;
 4777     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4778     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4779     int log2epr = log2(elem_per_lane);
 4780 
 4781     assert(is_integral_type(elem_bt), "sanity");
 4782     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4783 
 4784     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4785     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4786     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4787     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4788     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4789   %}
 4790   ins_pipe( pipe_slow );
 4791 %}
 4792 
 4793 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4794   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4795   match(Set dst (VectorInsert (Binary src val) idx));
 4796   effect(TEMP vtmp);
 4797   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4798   ins_encode %{
 4799     assert(UseAVX > 2, "sanity");
 4800 
 4801     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4802     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4803     int log2epr = log2(elem_per_lane);
 4804 
 4805     assert(is_integral_type(elem_bt), "");
 4806     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4807 
 4808     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4809     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4810     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4811     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4812     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4813   %}
 4814   ins_pipe( pipe_slow );
 4815 %}
 4816 
 4817 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4818   predicate(Matcher::vector_length(n) == 2);
 4819   match(Set dst (VectorInsert (Binary dst val) idx));
 4820   format %{ "vector_insert $dst,$val,$idx" %}
 4821   ins_encode %{
 4822     assert(UseSSE >= 4, "required");
 4823     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4824     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4825 
 4826     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4827   %}
 4828   ins_pipe( pipe_slow );
 4829 %}
 4830 
 4831 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4832   predicate(Matcher::vector_length(n) == 4);
 4833   match(Set dst (VectorInsert (Binary src val) idx));
 4834   effect(TEMP vtmp);
 4835   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4836   ins_encode %{
 4837     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4838     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4839 
 4840     uint x_idx = $idx$$constant & right_n_bits(1);
 4841     uint y_idx = ($idx$$constant >> 1) & 1;
 4842     int vlen_enc = Assembler::AVX_256bit;
 4843     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4844     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4845     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4846   %}
 4847   ins_pipe( pipe_slow );
 4848 %}
 4849 
 4850 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4851   predicate(Matcher::vector_length(n) == 8);
 4852   match(Set dst (VectorInsert (Binary src val) idx));
 4853   effect(TEMP vtmp);
 4854   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4855   ins_encode %{
 4856     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4857     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4858 
 4859     uint x_idx = $idx$$constant & right_n_bits(1);
 4860     uint y_idx = ($idx$$constant >> 1) & 3;
 4861     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4862     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4863     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4864   %}
 4865   ins_pipe( pipe_slow );
 4866 %}
 4867 
 4868 instruct insertF(vec dst, regF val, immU8 idx) %{
 4869   predicate(Matcher::vector_length(n) < 8);
 4870   match(Set dst (VectorInsert (Binary dst val) idx));
 4871   format %{ "vector_insert $dst,$val,$idx" %}
 4872   ins_encode %{
 4873     assert(UseSSE >= 4, "sanity");
 4874 
 4875     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4876     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4877 
 4878     uint x_idx = $idx$$constant & right_n_bits(2);
 4879     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4880   %}
 4881   ins_pipe( pipe_slow );
 4882 %}
 4883 
 4884 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4885   predicate(Matcher::vector_length(n) >= 8);
 4886   match(Set dst (VectorInsert (Binary src val) idx));
 4887   effect(TEMP vtmp);
 4888   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4889   ins_encode %{
 4890     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4891     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4892 
 4893     int vlen = Matcher::vector_length(this);
 4894     uint x_idx = $idx$$constant & right_n_bits(2);
 4895     if (vlen == 8) {
 4896       uint y_idx = ($idx$$constant >> 2) & 1;
 4897       int vlen_enc = Assembler::AVX_256bit;
 4898       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4899       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4900       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4901     } else {
 4902       assert(vlen == 16, "sanity");
 4903       uint y_idx = ($idx$$constant >> 2) & 3;
 4904       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4905       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4906       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4907     }
 4908   %}
 4909   ins_pipe( pipe_slow );
 4910 %}
 4911 
 4912 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4913   predicate(Matcher::vector_length(n) == 2);
 4914   match(Set dst (VectorInsert (Binary dst val) idx));
 4915   effect(TEMP tmp);
 4916   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4917   ins_encode %{
 4918     assert(UseSSE >= 4, "sanity");
 4919     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4920     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4921 
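    // Move the raw bits of $val into a GPR so pinsrq can write them into lane $idx.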
 4922     __ movq($tmp$$Register, $val$$XMMRegister);
 4923     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4924   %}
 4925   ins_pipe( pipe_slow );
 4926 %}
 4927 
 4928 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4929   predicate(Matcher::vector_length(n) == 4);
 4930   match(Set dst (VectorInsert (Binary src val) idx));
 4931   effect(TEMP vtmp, TEMP tmp);
 4932   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4933   ins_encode %{
 4934     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4935     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4936 
 4937     uint x_idx = $idx$$constant & right_n_bits(1);
 4938     uint y_idx = ($idx$$constant >> 1) & 1;
 4940     __ movq($tmp$$Register, $val$$XMMRegister);
 4941     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4942     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4943     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4944   %}
 4945   ins_pipe( pipe_slow );
 4946 %}
 4947 
 4948 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4949   predicate(Matcher::vector_length(n) == 8);
 4950   match(Set dst (VectorInsert (Binary src val) idx));
 4951   effect(TEMP tmp, TEMP vtmp);
 4952   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4953   ins_encode %{
 4954     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4955     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4956 
 4957     uint x_idx = $idx$$constant & right_n_bits(1);
 4958     uint y_idx = ($idx$$constant >> 1) & 3;
 4959     __ movq($tmp$$Register, $val$$XMMRegister);
 4960     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4961     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4962     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4963   %}
 4964   ins_pipe( pipe_slow );
 4965 %}
 4966 
 4967 // ====================REDUCTION ARITHMETIC=======================================
 4968 
 4969 // =======================Int Reduction==========================================
 4970 
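// In the reduction rules below the scalar src1 is the starting value: the
// macro assembler folds it together with every lane of the vector src2 and
// leaves the scalar result in dst.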
 4971 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4972   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4973   match(Set dst (AddReductionVI src1 src2));
 4974   match(Set dst (MulReductionVI src1 src2));
 4975   match(Set dst (AndReductionV  src1 src2));
 4976   match(Set dst ( OrReductionV  src1 src2));
 4977   match(Set dst (XorReductionV  src1 src2));
 4978   match(Set dst (MinReductionV  src1 src2));
 4979   match(Set dst (MaxReductionV  src1 src2));
 4980   effect(TEMP vtmp1, TEMP vtmp2);
 4981   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4982   ins_encode %{
 4983     int opcode = this->ideal_Opcode();
 4984     int vlen = Matcher::vector_length(this, $src2);
 4985     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4986   %}
 4987   ins_pipe( pipe_slow );
 4988 %}
 4989 
 4990 // =======================Long Reduction==========================================
 4991 
 4992 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4993   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4994   match(Set dst (AddReductionVL src1 src2));
 4995   match(Set dst (MulReductionVL src1 src2));
 4996   match(Set dst (AndReductionV  src1 src2));
 4997   match(Set dst ( OrReductionV  src1 src2));
 4998   match(Set dst (XorReductionV  src1 src2));
 4999   match(Set dst (MinReductionV  src1 src2));
 5000   match(Set dst (MaxReductionV  src1 src2));
 5001   effect(TEMP vtmp1, TEMP vtmp2);
 5002   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5003   ins_encode %{
 5004     int opcode = this->ideal_Opcode();
 5005     int vlen = Matcher::vector_length(this, $src2);
 5006     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5007   %}
 5008   ins_pipe( pipe_slow );
 5009 %}
 5010 
 5011 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5012   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5013   match(Set dst (AddReductionVL src1 src2));
 5014   match(Set dst (MulReductionVL src1 src2));
 5015   match(Set dst (AndReductionV  src1 src2));
 5016   match(Set dst ( OrReductionV  src1 src2));
 5017   match(Set dst (XorReductionV  src1 src2));
 5018   match(Set dst (MinReductionV  src1 src2));
 5019   match(Set dst (MaxReductionV  src1 src2));
 5020   effect(TEMP vtmp1, TEMP vtmp2);
 5021   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5022   ins_encode %{
 5023     int opcode = this->ideal_Opcode();
 5024     int vlen = Matcher::vector_length(this, $src2);
 5025     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5026   %}
 5027   ins_pipe( pipe_slow );
 5028 %}
 5029 
 5030 // =======================Float Reduction==========================================
 5031 
 5032 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5033   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5034   match(Set dst (AddReductionVF dst src));
 5035   match(Set dst (MulReductionVF dst src));
 5036   effect(TEMP dst, TEMP vtmp);
 5037   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5038   ins_encode %{
 5039     int opcode = this->ideal_Opcode();
 5040     int vlen = Matcher::vector_length(this, $src);
 5041     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5042   %}
 5043   ins_pipe( pipe_slow );
 5044 %}
 5045 
 5046 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5047   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5048   match(Set dst (AddReductionVF dst src));
 5049   match(Set dst (MulReductionVF dst src));
 5050   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5051   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5052   ins_encode %{
 5053     int opcode = this->ideal_Opcode();
 5054     int vlen = Matcher::vector_length(this, $src);
 5055     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5056   %}
 5057   ins_pipe( pipe_slow );
 5058 %}
 5059 
 5060 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5061   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5062   match(Set dst (AddReductionVF dst src));
 5063   match(Set dst (MulReductionVF dst src));
 5064   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5065   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5066   ins_encode %{
 5067     int opcode = this->ideal_Opcode();
 5068     int vlen = Matcher::vector_length(this, $src);
 5069     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5070   %}
 5071   ins_pipe( pipe_slow );
 5072 %}
 5073 
 5074 
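// Because FP add/mul are not associative, reassociating the lanes can change
// the rounding of the result. The "unordered" rules below are therefore only
// selected when the reduction node itself reports that strict (left-to-right)
// order is not required, as for Vector API reduceLanes.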
 5075 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5076   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5077   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5078   // src1 contains reduction identity
 5079   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5080   match(Set dst (AddReductionVF src1 src2));
 5081   match(Set dst (MulReductionVF src1 src2));
 5082   effect(TEMP dst);
 5083   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5084   ins_encode %{
 5085     int opcode = this->ideal_Opcode();
 5086     int vlen = Matcher::vector_length(this, $src2);
 5087     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5088   %}
 5089   ins_pipe( pipe_slow );
 5090 %}
 5091 
 5092 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5093   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5094   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5095   // src1 contains reduction identity
 5096   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5097   match(Set dst (AddReductionVF src1 src2));
 5098   match(Set dst (MulReductionVF src1 src2));
 5099   effect(TEMP dst, TEMP vtmp);
 5100   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5101   ins_encode %{
 5102     int opcode = this->ideal_Opcode();
 5103     int vlen = Matcher::vector_length(this, $src2);
 5104     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5105   %}
 5106   ins_pipe( pipe_slow );
 5107 %}
 5108 
 5109 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5110   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5111   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5112   // src1 contains reduction identity
 5113   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5114   match(Set dst (AddReductionVF src1 src2));
 5115   match(Set dst (MulReductionVF src1 src2));
 5116   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5117   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5118   ins_encode %{
 5119     int opcode = this->ideal_Opcode();
 5120     int vlen = Matcher::vector_length(this, $src2);
 5121     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5122   %}
 5123   ins_pipe( pipe_slow );
 5124 %}
 5125 
 5126 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5127   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5128   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5129   // src1 contains reduction identity
 5130   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5131   match(Set dst (AddReductionVF src1 src2));
 5132   match(Set dst (MulReductionVF src1 src2));
 5133   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5134   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5135   ins_encode %{
 5136     int opcode = this->ideal_Opcode();
 5137     int vlen = Matcher::vector_length(this, $src2);
 5138     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5139   %}
 5140   ins_pipe( pipe_slow );
 5141 %}
 5142 
 5143 // =======================Double Reduction==========================================
 5144 
 5145 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5146   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5147   match(Set dst (AddReductionVD dst src));
 5148   match(Set dst (MulReductionVD dst src));
 5149   effect(TEMP dst, TEMP vtmp);
 5150   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5151   ins_encode %{
 5152     int opcode = this->ideal_Opcode();
 5153     int vlen = Matcher::vector_length(this, $src);
 5154     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5155 %}
 5156   ins_pipe( pipe_slow );
 5157 %}
 5158 
 5159 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5160   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5161   match(Set dst (AddReductionVD dst src));
 5162   match(Set dst (MulReductionVD dst src));
 5163   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5164   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5165   ins_encode %{
 5166     int opcode = this->ideal_Opcode();
 5167     int vlen = Matcher::vector_length(this, $src);
 5168     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5169   %}
 5170   ins_pipe( pipe_slow );
 5171 %}
 5172 
 5173 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5174   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5175   match(Set dst (AddReductionVD dst src));
 5176   match(Set dst (MulReductionVD dst src));
 5177   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5178   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5179   ins_encode %{
 5180     int opcode = this->ideal_Opcode();
 5181     int vlen = Matcher::vector_length(this, $src);
 5182     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5183   %}
 5184   ins_pipe( pipe_slow );
 5185 %}
 5186 
 5187 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5188   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5189   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5190   // src1 contains reduction identity
 5191   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5192   match(Set dst (AddReductionVD src1 src2));
 5193   match(Set dst (MulReductionVD src1 src2));
 5194   effect(TEMP dst);
 5195   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5196   ins_encode %{
 5197     int opcode = this->ideal_Opcode();
 5198     int vlen = Matcher::vector_length(this, $src2);
 5199     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5200 %}
 5201   ins_pipe( pipe_slow );
 5202 %}
 5203 
 5204 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5205   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5206   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5207   // src1 contains reduction identity
 5208   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5209   match(Set dst (AddReductionVD src1 src2));
 5210   match(Set dst (MulReductionVD src1 src2));
 5211   effect(TEMP dst, TEMP vtmp);
 5212   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5213   ins_encode %{
 5214     int opcode = this->ideal_Opcode();
 5215     int vlen = Matcher::vector_length(this, $src2);
 5216     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5217   %}
 5218   ins_pipe( pipe_slow );
 5219 %}
 5220 
 5221 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5222   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5223   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5224   // src1 contains reduction identity
 5225   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5226   match(Set dst (AddReductionVD src1 src2));
 5227   match(Set dst (MulReductionVD src1 src2));
 5228   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5229   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5230   ins_encode %{
 5231     int opcode = this->ideal_Opcode();
 5232     int vlen = Matcher::vector_length(this, $src2);
 5233     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5234   %}
 5235   ins_pipe( pipe_slow );
 5236 %}
 5237 
 5238 // =======================Byte Reduction==========================================
 5239 
 5240 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5241   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5242   match(Set dst (AddReductionVI src1 src2));
 5243   match(Set dst (AndReductionV  src1 src2));
 5244   match(Set dst ( OrReductionV  src1 src2));
 5245   match(Set dst (XorReductionV  src1 src2));
 5246   match(Set dst (MinReductionV  src1 src2));
 5247   match(Set dst (MaxReductionV  src1 src2));
 5248   effect(TEMP vtmp1, TEMP vtmp2);
 5249   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5250   ins_encode %{
 5251     int opcode = this->ideal_Opcode();
 5252     int vlen = Matcher::vector_length(this, $src2);
 5253     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5254   %}
 5255   ins_pipe( pipe_slow );
 5256 %}
 5257 
 5258 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5259   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5260   match(Set dst (AddReductionVI src1 src2));
 5261   match(Set dst (AndReductionV  src1 src2));
 5262   match(Set dst ( OrReductionV  src1 src2));
 5263   match(Set dst (XorReductionV  src1 src2));
 5264   match(Set dst (MinReductionV  src1 src2));
 5265   match(Set dst (MaxReductionV  src1 src2));
 5266   effect(TEMP vtmp1, TEMP vtmp2);
 5267   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5268   ins_encode %{
 5269     int opcode = this->ideal_Opcode();
 5270     int vlen = Matcher::vector_length(this, $src2);
 5271     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5272   %}
 5273   ins_pipe( pipe_slow );
 5274 %}
 5275 
 5276 // =======================Short Reduction==========================================
 5277 
 5278 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5279   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5280   match(Set dst (AddReductionVI src1 src2));
 5281   match(Set dst (MulReductionVI src1 src2));
 5282   match(Set dst (AndReductionV  src1 src2));
 5283   match(Set dst ( OrReductionV  src1 src2));
 5284   match(Set dst (XorReductionV  src1 src2));
 5285   match(Set dst (MinReductionV  src1 src2));
 5286   match(Set dst (MaxReductionV  src1 src2));
 5287   effect(TEMP vtmp1, TEMP vtmp2);
 5288   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5289   ins_encode %{
 5290     int opcode = this->ideal_Opcode();
 5291     int vlen = Matcher::vector_length(this, $src2);
 5292     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5293   %}
 5294   ins_pipe( pipe_slow );
 5295 %}
 5296 
 5297 // =======================Mul Reduction==========================================
 5298 
 5299 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5300   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5301             Matcher::vector_length(n->in(2)) <= 32); // src2
 5302   match(Set dst (MulReductionVI src1 src2));
 5303   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5304   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5305   ins_encode %{
 5306     int opcode = this->ideal_Opcode();
 5307     int vlen = Matcher::vector_length(this, $src2);
 5308     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5309   %}
 5310   ins_pipe( pipe_slow );
 5311 %}
 5312 
 5313 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5314   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5315             Matcher::vector_length(n->in(2)) == 64); // src2
 5316   match(Set dst (MulReductionVI src1 src2));
 5317   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5318   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5319   ins_encode %{
 5320     int opcode = this->ideal_Opcode();
 5321     int vlen = Matcher::vector_length(this, $src2);
 5322     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5323   %}
 5324   ins_pipe( pipe_slow );
 5325 %}
 5326 
 5327 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
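// In the immF src1 forms the predicate requires src1 to be the reduction
// identity (+Inf for min, -Inf for max), so it cannot affect the result and
// the generated code only reduces src2.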
 5329 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5330                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5331   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5332             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5333              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5334             Matcher::vector_length(n->in(2)) == 2);
 5335   match(Set dst (MinReductionV src1 src2));
 5336   match(Set dst (MaxReductionV src1 src2));
 5337   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5338   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5339   ins_encode %{
 5340     assert(UseAVX > 0, "sanity");
 5341 
 5342     int opcode = this->ideal_Opcode();
 5343     int vlen = Matcher::vector_length(this, $src2);
 5344     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5345                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5346   %}
 5347   ins_pipe( pipe_slow );
 5348 %}
 5349 
 5350 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5351                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5352   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5353             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5354              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5355             Matcher::vector_length(n->in(2)) >= 4);
 5356   match(Set dst (MinReductionV src1 src2));
 5357   match(Set dst (MaxReductionV src1 src2));
 5358   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5359   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5360   ins_encode %{
 5361     assert(UseAVX > 0, "sanity");
 5362 
 5363     int opcode = this->ideal_Opcode();
 5364     int vlen = Matcher::vector_length(this, $src2);
 5365     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5366                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5367   %}
 5368   ins_pipe( pipe_slow );
 5369 %}
 5370 
 5371 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5372                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5373   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5374             Matcher::vector_length(n->in(2)) == 2);
 5375   match(Set dst (MinReductionV dst src));
 5376   match(Set dst (MaxReductionV dst src));
 5377   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5378   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5379   ins_encode %{
 5380     assert(UseAVX > 0, "sanity");
 5381 
 5382     int opcode = this->ideal_Opcode();
 5383     int vlen = Matcher::vector_length(this, $src);
 5384     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5385                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5386   %}
 5387   ins_pipe( pipe_slow );
 5388 %}
 5389 
 5390 
 5391 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5392                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5393   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5394             Matcher::vector_length(n->in(2)) >= 4);
 5395   match(Set dst (MinReductionV dst src));
 5396   match(Set dst (MaxReductionV dst src));
 5397   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5398   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5399   ins_encode %{
 5400     assert(UseAVX > 0, "sanity");
 5401 
 5402     int opcode = this->ideal_Opcode();
 5403     int vlen = Matcher::vector_length(this, $src);
 5404     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5405                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5406   %}
 5407   ins_pipe( pipe_slow );
 5408 %}
 5409 
 5410 
//--------------------Min/Max Double Reduction --------------------
 5412 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5413                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5414                             rFlagsReg cr) %{
 5415   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5416             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5417              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5418             Matcher::vector_length(n->in(2)) == 2);
 5419   match(Set dst (MinReductionV src1 src2));
 5420   match(Set dst (MaxReductionV src1 src2));
 5421   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5422   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5423   ins_encode %{
 5424     assert(UseAVX > 0, "sanity");
 5425 
 5426     int opcode = this->ideal_Opcode();
 5427     int vlen = Matcher::vector_length(this, $src2);
 5428     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5429                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5430   %}
 5431   ins_pipe( pipe_slow );
 5432 %}
 5433 
 5434 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5435                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5436                            rFlagsReg cr) %{
 5437   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5438             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5439              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5440             Matcher::vector_length(n->in(2)) >= 4);
 5441   match(Set dst (MinReductionV src1 src2));
 5442   match(Set dst (MaxReductionV src1 src2));
 5443   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5444   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5445   ins_encode %{
 5446     assert(UseAVX > 0, "sanity");
 5447 
 5448     int opcode = this->ideal_Opcode();
 5449     int vlen = Matcher::vector_length(this, $src2);
 5450     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5451                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5452   %}
 5453   ins_pipe( pipe_slow );
 5454 %}
 5455 
 5456 
 5457 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5458                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5459                                rFlagsReg cr) %{
 5460   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5461             Matcher::vector_length(n->in(2)) == 2);
 5462   match(Set dst (MinReductionV dst src));
 5463   match(Set dst (MaxReductionV dst src));
 5464   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5465   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5466   ins_encode %{
 5467     assert(UseAVX > 0, "sanity");
 5468 
 5469     int opcode = this->ideal_Opcode();
 5470     int vlen = Matcher::vector_length(this, $src);
 5471     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5472                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5473   %}
 5474   ins_pipe( pipe_slow );
 5475 %}
 5476 
 5477 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5478                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5479                               rFlagsReg cr) %{
 5480   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5481             Matcher::vector_length(n->in(2)) >= 4);
 5482   match(Set dst (MinReductionV dst src));
 5483   match(Set dst (MaxReductionV dst src));
 5484   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5485   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5486   ins_encode %{
 5487     assert(UseAVX > 0, "sanity");
 5488 
 5489     int opcode = this->ideal_Opcode();
 5490     int vlen = Matcher::vector_length(this, $src);
 5491     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5492                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5493   %}
 5494   ins_pipe( pipe_slow );
 5495 %}
 5496 
 5497 // ====================VECTOR ARITHMETIC=======================================
 5498 
 5499 // --------------------------------- ADD --------------------------------------
 5500 
 5501 // Bytes vector add
 5502 instruct vaddB(vec dst, vec src) %{
 5503   predicate(UseAVX == 0);
 5504   match(Set dst (AddVB dst src));
 5505   format %{ "paddb   $dst,$src\t! add packedB" %}
 5506   ins_encode %{
 5507     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5508   %}
 5509   ins_pipe( pipe_slow );
 5510 %}
 5511 
 5512 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5513   predicate(UseAVX > 0);
 5514   match(Set dst (AddVB src1 src2));
 5515   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5516   ins_encode %{
 5517     int vlen_enc = vector_length_encoding(this);
 5518     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5519   %}
 5520   ins_pipe( pipe_slow );
 5521 %}
 5522 
 5523 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5524   predicate((UseAVX > 0) &&
 5525             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5526   match(Set dst (AddVB src (LoadVector mem)));
 5527   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5528   ins_encode %{
 5529     int vlen_enc = vector_length_encoding(this);
 5530     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5531   %}
 5532   ins_pipe( pipe_slow );
 5533 %}
 5534 
 5535 // Shorts/Chars vector add
 5536 instruct vaddS(vec dst, vec src) %{
 5537   predicate(UseAVX == 0);
 5538   match(Set dst (AddVS dst src));
 5539   format %{ "paddw   $dst,$src\t! add packedS" %}
 5540   ins_encode %{
 5541     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5542   %}
 5543   ins_pipe( pipe_slow );
 5544 %}
 5545 
 5546 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5547   predicate(UseAVX > 0);
 5548   match(Set dst (AddVS src1 src2));
 5549   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5550   ins_encode %{
 5551     int vlen_enc = vector_length_encoding(this);
 5552     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5553   %}
 5554   ins_pipe( pipe_slow );
 5555 %}
 5556 
 5557 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5558   predicate((UseAVX > 0) &&
 5559             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5560   match(Set dst (AddVS src (LoadVector mem)));
 5561   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5562   ins_encode %{
 5563     int vlen_enc = vector_length_encoding(this);
 5564     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5565   %}
 5566   ins_pipe( pipe_slow );
 5567 %}
 5568 
 5569 // Integers vector add
 5570 instruct vaddI(vec dst, vec src) %{
 5571   predicate(UseAVX == 0);
 5572   match(Set dst (AddVI dst src));
 5573   format %{ "paddd   $dst,$src\t! add packedI" %}
 5574   ins_encode %{
 5575     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5576   %}
 5577   ins_pipe( pipe_slow );
 5578 %}
 5579 
 5580 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5581   predicate(UseAVX > 0);
 5582   match(Set dst (AddVI src1 src2));
 5583   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5584   ins_encode %{
 5585     int vlen_enc = vector_length_encoding(this);
 5586     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5587   %}
 5588   ins_pipe( pipe_slow );
 5589 %}
 5590 
 5591 
 5592 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5593   predicate((UseAVX > 0) &&
 5594             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5595   match(Set dst (AddVI src (LoadVector mem)));
 5596   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5597   ins_encode %{
 5598     int vlen_enc = vector_length_encoding(this);
 5599     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5600   %}
 5601   ins_pipe( pipe_slow );
 5602 %}
 5603 
 5604 // Longs vector add
 5605 instruct vaddL(vec dst, vec src) %{
 5606   predicate(UseAVX == 0);
 5607   match(Set dst (AddVL dst src));
 5608   format %{ "paddq   $dst,$src\t! add packedL" %}
 5609   ins_encode %{
 5610     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5611   %}
 5612   ins_pipe( pipe_slow );
 5613 %}
 5614 
 5615 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5616   predicate(UseAVX > 0);
 5617   match(Set dst (AddVL src1 src2));
 5618   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5619   ins_encode %{
 5620     int vlen_enc = vector_length_encoding(this);
 5621     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5622   %}
 5623   ins_pipe( pipe_slow );
 5624 %}
 5625 
 5626 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5627   predicate((UseAVX > 0) &&
 5628             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5629   match(Set dst (AddVL src (LoadVector mem)));
 5630   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5631   ins_encode %{
 5632     int vlen_enc = vector_length_encoding(this);
 5633     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5634   %}
 5635   ins_pipe( pipe_slow );
 5636 %}
 5637 
 5638 // Floats vector add
 5639 instruct vaddF(vec dst, vec src) %{
 5640   predicate(UseAVX == 0);
 5641   match(Set dst (AddVF dst src));
 5642   format %{ "addps   $dst,$src\t! add packedF" %}
 5643   ins_encode %{
 5644     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5645   %}
 5646   ins_pipe( pipe_slow );
 5647 %}
 5648 
 5649 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5650   predicate(UseAVX > 0);
 5651   match(Set dst (AddVF src1 src2));
 5652   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5653   ins_encode %{
 5654     int vlen_enc = vector_length_encoding(this);
 5655     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5656   %}
 5657   ins_pipe( pipe_slow );
 5658 %}
 5659 
 5660 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5661   predicate((UseAVX > 0) &&
 5662             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5663   match(Set dst (AddVF src (LoadVector mem)));
 5664   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5665   ins_encode %{
 5666     int vlen_enc = vector_length_encoding(this);
 5667     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5668   %}
 5669   ins_pipe( pipe_slow );
 5670 %}
 5671 
 5672 // Doubles vector add
 5673 instruct vaddD(vec dst, vec src) %{
 5674   predicate(UseAVX == 0);
 5675   match(Set dst (AddVD dst src));
 5676   format %{ "addpd   $dst,$src\t! add packedD" %}
 5677   ins_encode %{
 5678     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5679   %}
 5680   ins_pipe( pipe_slow );
 5681 %}
 5682 
 5683 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5684   predicate(UseAVX > 0);
 5685   match(Set dst (AddVD src1 src2));
 5686   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5687   ins_encode %{
 5688     int vlen_enc = vector_length_encoding(this);
 5689     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5690   %}
 5691   ins_pipe( pipe_slow );
 5692 %}
 5693 
 5694 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5695   predicate((UseAVX > 0) &&
 5696             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5697   match(Set dst (AddVD src (LoadVector mem)));
 5698   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5699   ins_encode %{
 5700     int vlen_enc = vector_length_encoding(this);
 5701     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5702   %}
 5703   ins_pipe( pipe_slow );
 5704 %}
 5705 
 5706 // --------------------------------- SUB --------------------------------------
 5707 
 5708 // Bytes vector sub
 5709 instruct vsubB(vec dst, vec src) %{
 5710   predicate(UseAVX == 0);
 5711   match(Set dst (SubVB dst src));
 5712   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5713   ins_encode %{
 5714     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5715   %}
 5716   ins_pipe( pipe_slow );
 5717 %}
 5718 
 5719 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5720   predicate(UseAVX > 0);
 5721   match(Set dst (SubVB src1 src2));
 5722   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5723   ins_encode %{
 5724     int vlen_enc = vector_length_encoding(this);
 5725     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5726   %}
 5727   ins_pipe( pipe_slow );
 5728 %}
 5729 
 5730 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5731   predicate((UseAVX > 0) &&
 5732             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5733   match(Set dst (SubVB src (LoadVector mem)));
 5734   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5735   ins_encode %{
 5736     int vlen_enc = vector_length_encoding(this);
 5737     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5738   %}
 5739   ins_pipe( pipe_slow );
 5740 %}
 5741 
 5742 // Shorts/Chars vector sub
 5743 instruct vsubS(vec dst, vec src) %{
 5744   predicate(UseAVX == 0);
 5745   match(Set dst (SubVS dst src));
 5746   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5747   ins_encode %{
 5748     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5749   %}
 5750   ins_pipe( pipe_slow );
 5751 %}
 5752 
 5753 
 5754 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5755   predicate(UseAVX > 0);
 5756   match(Set dst (SubVS src1 src2));
 5757   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5758   ins_encode %{
 5759     int vlen_enc = vector_length_encoding(this);
 5760     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5761   %}
 5762   ins_pipe( pipe_slow );
 5763 %}
 5764 
 5765 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5766   predicate((UseAVX > 0) &&
 5767             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5768   match(Set dst (SubVS src (LoadVector mem)));
 5769   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5770   ins_encode %{
 5771     int vlen_enc = vector_length_encoding(this);
 5772     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5773   %}
 5774   ins_pipe( pipe_slow );
 5775 %}
 5776 
 5777 // Integers vector sub
 5778 instruct vsubI(vec dst, vec src) %{
 5779   predicate(UseAVX == 0);
 5780   match(Set dst (SubVI dst src));
 5781   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5782   ins_encode %{
 5783     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5784   %}
 5785   ins_pipe( pipe_slow );
 5786 %}
 5787 
 5788 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5789   predicate(UseAVX > 0);
 5790   match(Set dst (SubVI src1 src2));
 5791   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5792   ins_encode %{
 5793     int vlen_enc = vector_length_encoding(this);
 5794     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5795   %}
 5796   ins_pipe( pipe_slow );
 5797 %}
 5798 
 5799 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5800   predicate((UseAVX > 0) &&
 5801             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5802   match(Set dst (SubVI src (LoadVector mem)));
 5803   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5804   ins_encode %{
 5805     int vlen_enc = vector_length_encoding(this);
 5806     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5807   %}
 5808   ins_pipe( pipe_slow );
 5809 %}
 5810 
 5811 // Longs vector sub
 5812 instruct vsubL(vec dst, vec src) %{
 5813   predicate(UseAVX == 0);
 5814   match(Set dst (SubVL dst src));
 5815   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5816   ins_encode %{
 5817     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5818   %}
 5819   ins_pipe( pipe_slow );
 5820 %}
 5821 
 5822 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5823   predicate(UseAVX > 0);
 5824   match(Set dst (SubVL src1 src2));
 5825   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5826   ins_encode %{
 5827     int vlen_enc = vector_length_encoding(this);
 5828     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5829   %}
 5830   ins_pipe( pipe_slow );
 5831 %}
 5832 
 5833 
 5834 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5835   predicate((UseAVX > 0) &&
 5836             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5837   match(Set dst (SubVL src (LoadVector mem)));
 5838   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5839   ins_encode %{
 5840     int vlen_enc = vector_length_encoding(this);
 5841     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5842   %}
 5843   ins_pipe( pipe_slow );
 5844 %}
 5845 
 5846 // Floats vector sub
 5847 instruct vsubF(vec dst, vec src) %{
 5848   predicate(UseAVX == 0);
 5849   match(Set dst (SubVF dst src));
 5850   format %{ "subps   $dst,$src\t! sub packedF" %}
 5851   ins_encode %{
 5852     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5853   %}
 5854   ins_pipe( pipe_slow );
 5855 %}
 5856 
 5857 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5858   predicate(UseAVX > 0);
 5859   match(Set dst (SubVF src1 src2));
 5860   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5861   ins_encode %{
 5862     int vlen_enc = vector_length_encoding(this);
 5863     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5864   %}
 5865   ins_pipe( pipe_slow );
 5866 %}
 5867 
 5868 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5869   predicate((UseAVX > 0) &&
 5870             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5871   match(Set dst (SubVF src (LoadVector mem)));
 5872   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5873   ins_encode %{
 5874     int vlen_enc = vector_length_encoding(this);
 5875     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5876   %}
 5877   ins_pipe( pipe_slow );
 5878 %}
 5879 
 5880 // Doubles vector sub
 5881 instruct vsubD(vec dst, vec src) %{
 5882   predicate(UseAVX == 0);
 5883   match(Set dst (SubVD dst src));
 5884   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5885   ins_encode %{
 5886     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5887   %}
 5888   ins_pipe( pipe_slow );
 5889 %}
 5890 
 5891 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5892   predicate(UseAVX > 0);
 5893   match(Set dst (SubVD src1 src2));
 5894   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5895   ins_encode %{
 5896     int vlen_enc = vector_length_encoding(this);
 5897     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5898   %}
 5899   ins_pipe( pipe_slow );
 5900 %}
 5901 
 5902 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5903   predicate((UseAVX > 0) &&
 5904             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5905   match(Set dst (SubVD src (LoadVector mem)));
 5906   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5907   ins_encode %{
 5908     int vlen_enc = vector_length_encoding(this);
 5909     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5910   %}
 5911   ins_pipe( pipe_slow );
 5912 %}
 5913 
 5914 // --------------------------------- MUL --------------------------------------
 5915 
 5916 // Byte vector mul
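// There is no packed byte multiply instruction, so the products are formed
// with 16-bit multiplies (pmullw/vpmullw) and the low bytes of the results
// are recombined.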
 5917 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5918   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5919   match(Set dst (MulVB src1 src2));
 5920   effect(TEMP dst, TEMP xtmp);
 5921   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5922   ins_encode %{
 5923     assert(UseSSE > 3, "required");
 5924     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5925     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5926     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5927     __ psllw($dst$$XMMRegister, 8);
 5928     __ psrlw($dst$$XMMRegister, 8);
 5929     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5930   %}
 5931   ins_pipe( pipe_slow );
 5932 %}
 5933 
 5934 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5935   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5936   match(Set dst (MulVB src1 src2));
 5937   effect(TEMP dst, TEMP xtmp);
 5938   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5939   ins_encode %{
 5940     assert(UseSSE > 3, "required");
 5941     // Odd-index elements
 5942     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5943     __ psrlw($dst$$XMMRegister, 8);
 5944     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5945     __ psrlw($xtmp$$XMMRegister, 8);
 5946     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5947     __ psllw($dst$$XMMRegister, 8);
 5948     // Even-index elements
 5949     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5950     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5951     __ psllw($xtmp$$XMMRegister, 8);
 5952     __ psrlw($xtmp$$XMMRegister, 8);
 5953     // Combine
 5954     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5955   %}
 5956   ins_pipe( pipe_slow );
 5957 %}
 5958 
 5959 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5960   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5961   match(Set dst (MulVB src1 src2));
 5962   effect(TEMP xtmp1, TEMP xtmp2);
 5963   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5964   ins_encode %{
 5965     int vlen_enc = vector_length_encoding(this);
 5966     // Odd-index elements
 5967     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5968     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5969     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5970     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5971     // Even-index elements
 5972     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5973     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5974     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5975     // Combine
 5976     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5977   %}
 5978   ins_pipe( pipe_slow );
 5979 %}
 5980 
 5981 // Shorts/Chars vector mul
 5982 instruct vmulS(vec dst, vec src) %{
 5983   predicate(UseAVX == 0);
 5984   match(Set dst (MulVS dst src));
 5985   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5986   ins_encode %{
 5987     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5988   %}
 5989   ins_pipe( pipe_slow );
 5990 %}
 5991 
 5992 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5993   predicate(UseAVX > 0);
 5994   match(Set dst (MulVS src1 src2));
 5995   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5996   ins_encode %{
 5997     int vlen_enc = vector_length_encoding(this);
 5998     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5999   %}
 6000   ins_pipe( pipe_slow );
 6001 %}
 6002 
 6003 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6004   predicate((UseAVX > 0) &&
 6005             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6006   match(Set dst (MulVS src (LoadVector mem)));
 6007   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6008   ins_encode %{
 6009     int vlen_enc = vector_length_encoding(this);
 6010     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6011   %}
 6012   ins_pipe( pipe_slow );
 6013 %}
 6014 
 6015 // Integers vector mul
 6016 instruct vmulI(vec dst, vec src) %{
 6017   predicate(UseAVX == 0);
 6018   match(Set dst (MulVI dst src));
 6019   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6020   ins_encode %{
 6021     assert(UseSSE > 3, "required");
 6022     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6023   %}
 6024   ins_pipe( pipe_slow );
 6025 %}
 6026 
 6027 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6028   predicate(UseAVX > 0);
 6029   match(Set dst (MulVI src1 src2));
 6030   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6031   ins_encode %{
 6032     int vlen_enc = vector_length_encoding(this);
 6033     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6034   %}
 6035   ins_pipe( pipe_slow );
 6036 %}
 6037 
 6038 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6039   predicate((UseAVX > 0) &&
 6040             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6041   match(Set dst (MulVI src (LoadVector mem)));
 6042   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6043   ins_encode %{
 6044     int vlen_enc = vector_length_encoding(this);
 6045     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6046   %}
 6047   ins_pipe( pipe_slow );
 6048 %}
 6049 
 6050 // Longs vector mul
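// With AVX-512DQ (plus VL for sub-512-bit vectors) a true 64-bit multiply is
// available as evpmullq; otherwise the low 64 bits of each product are
// composed from 32-bit multiplies (see vmulL/vmulL_reg below).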
 6051 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6052   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6053              VM_Version::supports_avx512dq()) ||
 6054             VM_Version::supports_avx512vldq());
 6055   match(Set dst (MulVL src1 src2));
 6056   ins_cost(500);
 6057   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6058   ins_encode %{
 6059     assert(UseAVX > 2, "required");
 6060     int vlen_enc = vector_length_encoding(this);
 6061     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6062   %}
 6063   ins_pipe( pipe_slow );
 6064 %}
 6065 
 6066 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6067   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6068              VM_Version::supports_avx512dq()) ||
 6069             (Matcher::vector_length_in_bytes(n) > 8 &&
 6070              VM_Version::supports_avx512vldq()));
 6071   match(Set dst (MulVL src (LoadVector mem)));
 6072   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6073   ins_cost(500);
 6074   ins_encode %{
 6075     assert(UseAVX > 2, "required");
 6076     int vlen_enc = vector_length_encoding(this);
 6077     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6078   %}
 6079   ins_pipe( pipe_slow );
 6080 %}
 6081 
 6082 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6083   predicate(UseAVX == 0);
 6084   match(Set dst (MulVL src1 src2));
 6085   ins_cost(500);
 6086   effect(TEMP dst, TEMP xtmp);
 6087   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6088   ins_encode %{
 6089     assert(VM_Version::supports_sse4_1(), "required");
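    // Per 64-bit lane, with src1 = aH:aL and src2 = bH:bL, the low 64 bits of
    // the product are aL*bL + ((aL*bH + aH*bL) << 32). pshufd(0xB1) swaps the
    // 32-bit halves of each lane so pmulld yields both cross products at once.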
    // Get the lo-hi cross products; only their lower 32 bits are needed
 6091     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6092     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6093     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6094     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6095     __ psllq($dst$$XMMRegister, 32);
 6096     // Get the lo-lo products
 6097     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6098     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6099     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6100   %}
 6101   ins_pipe( pipe_slow );
 6102 %}
 6103 
 6104 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6105   predicate(UseAVX > 0 &&
 6106             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6107               !VM_Version::supports_avx512dq()) ||
 6108              (Matcher::vector_length_in_bytes(n) < 64 &&
 6109               !VM_Version::supports_avx512vldq())));
 6110   match(Set dst (MulVL src1 src2));
 6111   effect(TEMP xtmp1, TEMP xtmp2);
 6112   ins_cost(500);
 6113   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6114   ins_encode %{
 6115     int vlen_enc = vector_length_encoding(this);
    // Get the lo-hi cross products; only their lower 32 bits are needed
 6117     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6118     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6119     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6120     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6121     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6122     // Get the lo-lo products
 6123     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6124     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6125   %}
 6126   ins_pipe( pipe_slow );
 6127 %}
 6128 
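// When both operands are known to be zero-extended 32-bit values
// (has_uint_inputs), a single vpmuludq already computes the exact 64-bit
// products; likewise vpmuldq when both are sign-extended (has_int_inputs).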
 6129 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6130   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6131   match(Set dst (MulVL src1 src2));
 6132   ins_cost(100);
 6133   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6134   ins_encode %{
 6135     int vlen_enc = vector_length_encoding(this);
 6136     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6137   %}
 6138   ins_pipe( pipe_slow );
 6139 %}
 6140 
 6141 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6142   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6143   match(Set dst (MulVL src1 src2));
 6144   ins_cost(100);
 6145   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6146   ins_encode %{
 6147     int vlen_enc = vector_length_encoding(this);
 6148     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6149   %}
 6150   ins_pipe( pipe_slow );
 6151 %}
 6152 
 6153 // Floats vector mul
 6154 instruct vmulF(vec dst, vec src) %{
 6155   predicate(UseAVX == 0);
 6156   match(Set dst (MulVF dst src));
 6157   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6158   ins_encode %{
 6159     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6160   %}
 6161   ins_pipe( pipe_slow );
 6162 %}
 6163 
 6164 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6165   predicate(UseAVX > 0);
 6166   match(Set dst (MulVF src1 src2));
 6167   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6168   ins_encode %{
 6169     int vlen_enc = vector_length_encoding(this);
 6170     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6171   %}
 6172   ins_pipe( pipe_slow );
 6173 %}
 6174 
 6175 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6176   predicate((UseAVX > 0) &&
 6177             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6178   match(Set dst (MulVF src (LoadVector mem)));
 6179   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6180   ins_encode %{
 6181     int vlen_enc = vector_length_encoding(this);
 6182     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6183   %}
 6184   ins_pipe( pipe_slow );
 6185 %}
 6186 
 6187 // Doubles vector mul
 6188 instruct vmulD(vec dst, vec src) %{
 6189   predicate(UseAVX == 0);
 6190   match(Set dst (MulVD dst src));
 6191   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6192   ins_encode %{
 6193     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6194   %}
 6195   ins_pipe( pipe_slow );
 6196 %}
 6197 
 6198 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6199   predicate(UseAVX > 0);
 6200   match(Set dst (MulVD src1 src2));
 6201   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6202   ins_encode %{
 6203     int vlen_enc = vector_length_encoding(this);
 6204     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6205   %}
 6206   ins_pipe( pipe_slow );
 6207 %}
 6208 
 6209 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6210   predicate((UseAVX > 0) &&
 6211             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6212   match(Set dst (MulVD src (LoadVector mem)));
 6213   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6214   ins_encode %{
 6215     int vlen_enc = vector_length_encoding(this);
 6216     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6217   %}
 6218   ins_pipe( pipe_slow );
 6219 %}
 6220 
 6221 // --------------------------------- DIV --------------------------------------
 6222 
 6223 // Floats vector div
 6224 instruct vdivF(vec dst, vec src) %{
 6225   predicate(UseAVX == 0);
 6226   match(Set dst (DivVF dst src));
 6227   format %{ "divps   $dst,$src\t! div packedF" %}
 6228   ins_encode %{
 6229     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6230   %}
 6231   ins_pipe( pipe_slow );
 6232 %}
 6233 
 6234 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6235   predicate(UseAVX > 0);
 6236   match(Set dst (DivVF src1 src2));
 6237   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6238   ins_encode %{
 6239     int vlen_enc = vector_length_encoding(this);
 6240     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6241   %}
 6242   ins_pipe( pipe_slow );
 6243 %}
 6244 
 6245 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6246   predicate((UseAVX > 0) &&
 6247             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6248   match(Set dst (DivVF src (LoadVector mem)));
 6249   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6250   ins_encode %{
 6251     int vlen_enc = vector_length_encoding(this);
 6252     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6253   %}
 6254   ins_pipe( pipe_slow );
 6255 %}
 6256 
 6257 // Doubles vector div
 6258 instruct vdivD(vec dst, vec src) %{
 6259   predicate(UseAVX == 0);
 6260   match(Set dst (DivVD dst src));
 6261   format %{ "divpd   $dst,$src\t! div packedD" %}
 6262   ins_encode %{
 6263     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6264   %}
 6265   ins_pipe( pipe_slow );
 6266 %}
 6267 
 6268 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6269   predicate(UseAVX > 0);
 6270   match(Set dst (DivVD src1 src2));
 6271   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6272   ins_encode %{
 6273     int vlen_enc = vector_length_encoding(this);
 6274     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6275   %}
 6276   ins_pipe( pipe_slow );
 6277 %}
 6278 
 6279 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6280   predicate((UseAVX > 0) &&
 6281             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6282   match(Set dst (DivVD src (LoadVector mem)));
 6283   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6284   ins_encode %{
 6285     int vlen_enc = vector_length_encoding(this);
 6286     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6287   %}
 6288   ins_pipe( pipe_slow );
 6289 %}
 6290 
 6291 // ------------------------------ MinMax ---------------------------------------
 6292 
 6293 // Byte, Short, Int vector Min/Max
 6294 instruct minmax_reg_sse(vec dst, vec src) %{
 6295   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6296             UseAVX == 0);
 6297   match(Set dst (MinV dst src));
 6298   match(Set dst (MaxV dst src));
 6299   format %{ "vector_minmax  $dst,$src\t!  " %}
 6300   ins_encode %{
 6301     assert(UseSSE >= 4, "required");
 6302 
 6303     int opcode = this->ideal_Opcode();
 6304     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6305     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6306   %}
 6307   ins_pipe( pipe_slow );
 6308 %}
 6309 
 6310 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6311   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6312             UseAVX > 0);
 6313   match(Set dst (MinV src1 src2));
 6314   match(Set dst (MaxV src1 src2));
 6315   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6316   ins_encode %{
 6317     int opcode = this->ideal_Opcode();
 6318     int vlen_enc = vector_length_encoding(this);
 6319     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6320 
 6321     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6322   %}
 6323   ins_pipe( pipe_slow );
 6324 %}
 6325 
 6326 // Long vector Min/Max
 6327 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6328   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6329             UseAVX == 0);
 6330   match(Set dst (MinV dst src));
 6331   match(Set dst (MaxV src dst));
 6332   effect(TEMP dst, TEMP tmp);
 6333   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6334   ins_encode %{
 6335     assert(UseSSE >= 4, "required");
 6336 
 6337     int opcode = this->ideal_Opcode();
 6338     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6339     assert(elem_bt == T_LONG, "sanity");
 6340 
 6341     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6342   %}
 6343   ins_pipe( pipe_slow );
 6344 %}
 6345 
 6346 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6347   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6348             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6349   match(Set dst (MinV src1 src2));
 6350   match(Set dst (MaxV src1 src2));
 6351   effect(TEMP dst);
 6352   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6353   ins_encode %{
 6354     int vlen_enc = vector_length_encoding(this);
 6355     int opcode = this->ideal_Opcode();
 6356     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6357     assert(elem_bt == T_LONG, "sanity");
 6358 
 6359     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6360   %}
 6361   ins_pipe( pipe_slow );
 6362 %}
 6363 
 6364 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6365   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6366             Matcher::vector_element_basic_type(n) == T_LONG);
 6367   match(Set dst (MinV src1 src2));
 6368   match(Set dst (MaxV src1 src2));
 6369   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6370   ins_encode %{
 6371     assert(UseAVX > 2, "required");
 6372 
 6373     int vlen_enc = vector_length_encoding(this);
 6374     int opcode = this->ideal_Opcode();
 6375     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6376     assert(elem_bt == T_LONG, "sanity");
 6377 
 6378     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6379   %}
 6380   ins_pipe( pipe_slow );
 6381 %}
 6382 
 6383 // Float/Double vector Min/Max
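      // The rules below need temps (and a mask register on EVEX) because Java
      // Math.min/max semantics differ from raw vminps/vminpd: a NaN in either
      // input must propagate, and -0.0 must compare smaller than +0.0, while the
      // x86 instructions return the second source operand in those cases.
      // Scalar reference of the required min semantics, assuming <math.h>
      // signbit (illustration only):
      //
      //   double java_min(double a, double b) {
      //     if (a != a) return a;                                  // NaN propagates
      //     if (b != b) return b;
      //     if (a == 0.0 && b == 0.0) {
      //       return (signbit(a) || signbit(b)) ? -0.0 : +0.0;     // -0.0 < +0.0
      //     }
      //     return (a < b) ? a : b;
      //   }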
 6384 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6385   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6386             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6387             UseAVX > 0);
 6388   match(Set dst (MinV a b));
 6389   match(Set dst (MaxV a b));
 6390   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6391   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6392   ins_encode %{
 6393     assert(UseAVX > 0, "required");
 6394 
 6395     int opcode = this->ideal_Opcode();
 6396     int vlen_enc = vector_length_encoding(this);
 6397     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6398 
 6399     __ vminmax_fp(opcode, elem_bt,
 6400                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6401                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6402   %}
 6403   ins_pipe( pipe_slow );
 6404 %}
 6405 
 6406 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6407   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6408             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6409   match(Set dst (MinV a b));
 6410   match(Set dst (MaxV a b));
 6411   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6412   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6413   ins_encode %{
 6414     assert(UseAVX > 2, "required");
 6415 
 6416     int opcode = this->ideal_Opcode();
 6417     int vlen_enc = vector_length_encoding(this);
 6418     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6419 
 6420     __ evminmax_fp(opcode, elem_bt,
 6421                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6422                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6423   %}
 6424   ins_pipe( pipe_slow );
 6425 %}
 6426 
 6427 // ------------------------------ Unsigned vector Min/Max ----------------------
 6428 
 6429 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6430   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6431   match(Set dst (UMinV a b));
 6432   match(Set dst (UMaxV a b));
 6433   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6434   ins_encode %{
 6435     int opcode = this->ideal_Opcode();
 6436     int vlen_enc = vector_length_encoding(this);
 6437     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6438     assert(is_integral_type(elem_bt), "");
 6439     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6440   %}
 6441   ins_pipe( pipe_slow );
 6442 %}
 6443 
 6444 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6445   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6446   match(Set dst (UMinV a (LoadVector b)));
 6447   match(Set dst (UMaxV a (LoadVector b)));
 6448   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6449   ins_encode %{
 6450     int opcode = this->ideal_Opcode();
 6451     int vlen_enc = vector_length_encoding(this);
 6452     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6453     assert(is_integral_type(elem_bt), "");
 6454     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6455   %}
 6456   ins_pipe( pipe_slow );
 6457 %}
 6458 
 6459 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6460   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6461   match(Set dst (UMinV a b));
 6462   match(Set dst (UMaxV a b));
 6463   effect(TEMP xtmp1, TEMP xtmp2);
 6464   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6465   ins_encode %{
 6466     int opcode = this->ideal_Opcode();
 6467     int vlen_enc = vector_length_encoding(this);
 6468     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6469   %}
 6470   ins_pipe( pipe_slow );
 6471 %}
 6472 
 6473 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6474   match(Set dst (UMinV (Binary dst src2) mask));
 6475   match(Set dst (UMaxV (Binary dst src2) mask));
 6476   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6477   ins_encode %{
 6478     int vlen_enc = vector_length_encoding(this);
 6479     BasicType bt = Matcher::vector_element_basic_type(this);
 6480     int opc = this->ideal_Opcode();
 6481     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6482                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6483   %}
 6484   ins_pipe( pipe_slow );
 6485 %}
 6486 
 6487 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6488   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6489   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6490   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6491   ins_encode %{
 6492     int vlen_enc = vector_length_encoding(this);
 6493     BasicType bt = Matcher::vector_element_basic_type(this);
 6494     int opc = this->ideal_Opcode();
 6495     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6496                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6497   %}
 6498   ins_pipe( pipe_slow );
 6499 %}
 6500 
 6501 // --------------------------------- Signum/CopySign ---------------------------
 6502 
 6503 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6504   match(Set dst (SignumF dst (Binary zero one)));
 6505   effect(KILL cr);
 6506   format %{ "signumF $dst, $dst" %}
 6507   ins_encode %{
 6508     int opcode = this->ideal_Opcode();
 6509     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6510   %}
 6511   ins_pipe( pipe_slow );
 6512 %}
 6513 
 6514 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6515   match(Set dst (SignumD dst (Binary zero one)));
 6516   effect(KILL cr);
 6517   format %{ "signumD $dst, $dst" %}
 6518   ins_encode %{
 6519     int opcode = this->ideal_Opcode();
 6520     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6521   %}
 6522   ins_pipe( pipe_slow );
 6523 %}
 6524 
 6525 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6526   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6527   match(Set dst (SignumVF src (Binary zero one)));
 6528   match(Set dst (SignumVD src (Binary zero one)));
 6529   effect(TEMP dst, TEMP xtmp1);
 6530   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6531   ins_encode %{
 6532     int opcode = this->ideal_Opcode();
 6533     int vec_enc = vector_length_encoding(this);
 6534     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6535                          $xtmp1$$XMMRegister, vec_enc);
 6536   %}
 6537   ins_pipe( pipe_slow );
 6538 %}
 6539 
 6540 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6541   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6542   match(Set dst (SignumVF src (Binary zero one)));
 6543   match(Set dst (SignumVD src (Binary zero one)));
 6544   effect(TEMP dst, TEMP ktmp1);
 6545   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6546   ins_encode %{
 6547     int opcode = this->ideal_Opcode();
 6548     int vec_enc = vector_length_encoding(this);
 6549     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6550                           $ktmp1$$KRegister, vec_enc);
 6551   %}
 6552   ins_pipe( pipe_slow );
 6553 %}
 6554 
 6555 // ---------------------------------------
 6556 // For copySign use 0xE4 as the immediate selector (truth table) for vpternlog
 6557 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6558 // A holds the magnitude input ($dst), B holds the sign input ($src),
 6559 // C (xmm2) is set to the magnitude mask 0x7FFFFFFF
 6560 // Wherever xmm2 is 0 (the sign bit), pick from B; wherever xmm2 is 1, pick from A
 6561 //
 6562 // A B C Result
 6563 // 0 0 0 0
 6564 // 0 0 1 0
 6565 // 0 1 0 1
 6566 // 0 1 1 0
 6567 // 1 0 0 0
 6568 // 1 0 1 1
 6569 // 1 1 0 1
 6570 // 1 1 1 1
 6571 //
 6572 // Reading the Result column from high bit (A=1,B=1,C=1) down to low bit gives 0b11100100 = 0xE4
 6573 // ---------------------------------------
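      // In scalar terms, the vpternlog blend below computes per 32-bit lane
      // (illustration only; the 64-bit form uses the analogous 0x7FFF... mask):
      //
      //   uint32_t copy_sign_bits(uint32_t magnitude, uint32_t sign) {
      //     const uint32_t mask = 0x7FFFFFFF;              // the C operand
      //     return (magnitude & mask) | (sign & ~mask);    // A where C=1, B where C=0
      //   }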
 6574 
 6575 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6576   match(Set dst (CopySignF dst src));
 6577   effect(TEMP tmp1, TEMP tmp2);
 6578   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6579   ins_encode %{
 6580     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6581     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6582     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6583   %}
 6584   ins_pipe( pipe_slow );
 6585 %}
 6586 
 6587 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6588   match(Set dst (CopySignD dst (Binary src zero)));
 6589   ins_cost(100);
 6590   effect(TEMP tmp1, TEMP tmp2);
 6591   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6592   ins_encode %{
 6593     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6594     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6595     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6596   %}
 6597   ins_pipe( pipe_slow );
 6598 %}
 6599 
 6600 //----------------------------- CompressBits/ExpandBits ------------------------
 6601 
 6602 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6603   predicate(n->bottom_type()->isa_int());
 6604   match(Set dst (CompressBits src mask));
 6605   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6606   ins_encode %{
 6607     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6608   %}
 6609   ins_pipe( pipe_slow );
 6610 %}
 6611 
 6612 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6613   predicate(n->bottom_type()->isa_int());
 6614   match(Set dst (ExpandBits src mask));
 6615   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6616   ins_encode %{
 6617     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6618   %}
 6619   ins_pipe( pipe_slow );
 6620 %}
 6621 
 6622 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6623   predicate(n->bottom_type()->isa_int());
 6624   match(Set dst (CompressBits src (LoadI mask)));
 6625   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6626   ins_encode %{
 6627     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6628   %}
 6629   ins_pipe( pipe_slow );
 6630 %}
 6631 
 6632 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6633   predicate(n->bottom_type()->isa_int());
 6634   match(Set dst (ExpandBits src (LoadI mask)));
 6635   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6636   ins_encode %{
 6637     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6638   %}
 6639   ins_pipe( pipe_slow );
 6640 %}
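      // Reference semantics of the BMI2 instructions used above, per 32-bit
      // value (illustration only; the 64-bit forms are analogous):
      //
      //   uint32_t pext32(uint32_t src, uint32_t mask) {   // CompressBits
      //     uint32_t dst = 0;
      //     for (int i = 0, j = 0; i < 32; i++) {
      //       if (mask & (1u << i)) dst |= ((src >> i) & 1u) << j++;
      //     }
      //     return dst;
      //   }
      //
      //   uint32_t pdep32(uint32_t src, uint32_t mask) {   // ExpandBits
      //     uint32_t dst = 0;
      //     for (int i = 0, j = 0; i < 32; i++) {
      //       if (mask & (1u << i)) dst |= ((src >> j++) & 1u) << i;
      //     }
      //     return dst;
      //   }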
 6641 
 6642 // --------------------------------- Sqrt --------------------------------------
 6643 
 6644 instruct vsqrtF_reg(vec dst, vec src) %{
 6645   match(Set dst (SqrtVF src));
 6646   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6647   ins_encode %{
 6648     assert(UseAVX > 0, "required");
 6649     int vlen_enc = vector_length_encoding(this);
 6650     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6651   %}
 6652   ins_pipe( pipe_slow );
 6653 %}
 6654 
 6655 instruct vsqrtF_mem(vec dst, memory mem) %{
 6656   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6657   match(Set dst (SqrtVF (LoadVector mem)));
 6658   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6659   ins_encode %{
 6660     assert(UseAVX > 0, "required");
 6661     int vlen_enc = vector_length_encoding(this);
 6662     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6663   %}
 6664   ins_pipe( pipe_slow );
 6665 %}
 6666 
 6667 // Doubles vector sqrt
 6668 instruct vsqrtD_reg(vec dst, vec src) %{
 6669   match(Set dst (SqrtVD src));
 6670   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6671   ins_encode %{
 6672     assert(UseAVX > 0, "required");
 6673     int vlen_enc = vector_length_encoding(this);
 6674     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6675   %}
 6676   ins_pipe( pipe_slow );
 6677 %}
 6678 
 6679 instruct vsqrtD_mem(vec dst, memory mem) %{
 6680   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6681   match(Set dst (SqrtVD (LoadVector mem)));
 6682   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6683   ins_encode %{
 6684     assert(UseAVX > 0, "required");
 6685     int vlen_enc = vector_length_encoding(this);
 6686     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6687   %}
 6688   ins_pipe( pipe_slow );
 6689 %}
 6690 
 6691 // ------------------------------ Shift ---------------------------------------
 6692 
 6693 // Left and right shift count vectors are the same on x86
 6694 // (only lowest bits of xmm reg are used for count).
 6695 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6696   match(Set dst (LShiftCntV cnt));
 6697   match(Set dst (RShiftCntV cnt));
 6698   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6699   ins_encode %{
 6700     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6701   %}
 6702   ins_pipe( pipe_slow );
 6703 %}
 6704 
 6705 // Byte vector shift
 6706 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6707   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6708   match(Set dst ( LShiftVB src shift));
 6709   match(Set dst ( RShiftVB src shift));
 6710   match(Set dst (URShiftVB src shift));
 6711   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6712   format %{"vector_byte_shift $dst,$src,$shift" %}
 6713   ins_encode %{
 6714     assert(UseSSE > 3, "required");
 6715     int opcode = this->ideal_Opcode();
 6716     bool sign = (opcode != Op_URShiftVB);
 6717     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6718     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6719     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6720     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6721     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6722   %}
 6723   ins_pipe( pipe_slow );
 6724 %}
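      // x86 has no packed byte shift, so this rule and the wider byte-shift
      // rules below all widen to 16-bit lanes, shift, mask and re-pack.
      // Per byte, conceptually (illustration only):
      //
      //   uint8_t shift_byte(uint8_t b, int cnt, int opcode) {
      //     // vextendbw: zero-extend for unsigned shifts, sign-extend otherwise
      //     int16_t w = (opcode == Op_URShiftVB) ? (int16_t)b : (int16_t)(int8_t)b;
      //     // vshiftw: shift the 16-bit lane
      //     int16_t s = (opcode == Op_LShiftVB)  ? (int16_t)(w << cnt)
      //               : (opcode == Op_RShiftVB)  ? (int16_t)(w >> cnt)
      //                                          : (int16_t)((uint16_t)w >> cnt);
      //     // pand(vector_short_to_byte_mask) + packuswb: keep the low byte
      //     return (uint8_t)(s & 0xFF);
      //   }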
 6725 
 6726 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6727   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6728             UseAVX <= 1);
 6729   match(Set dst ( LShiftVB src shift));
 6730   match(Set dst ( RShiftVB src shift));
 6731   match(Set dst (URShiftVB src shift));
 6732   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6733   format %{"vector_byte_shift $dst,$src,$shift" %}
 6734   ins_encode %{
 6735     assert(UseSSE > 3, "required");
 6736     int opcode = this->ideal_Opcode();
 6737     bool sign = (opcode != Op_URShiftVB);
 6738     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6739     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6740     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6741     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6742     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6743     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6744     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6745     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6746     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6747   %}
 6748   ins_pipe( pipe_slow );
 6749 %}
 6750 
 6751 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6752   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6753             UseAVX > 1);
 6754   match(Set dst ( LShiftVB src shift));
 6755   match(Set dst ( RShiftVB src shift));
 6756   match(Set dst (URShiftVB src shift));
 6757   effect(TEMP dst, TEMP tmp);
 6758   format %{"vector_byte_shift $dst,$src,$shift" %}
 6759   ins_encode %{
 6760     int opcode = this->ideal_Opcode();
 6761     bool sign = (opcode != Op_URShiftVB);
 6762     int vlen_enc = Assembler::AVX_256bit;
 6763     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6764     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6765     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6766     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6767     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6768   %}
 6769   ins_pipe( pipe_slow );
 6770 %}
 6771 
 6772 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6773   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6774   match(Set dst ( LShiftVB src shift));
 6775   match(Set dst ( RShiftVB src shift));
 6776   match(Set dst (URShiftVB src shift));
 6777   effect(TEMP dst, TEMP tmp);
 6778   format %{"vector_byte_shift $dst,$src,$shift" %}
 6779   ins_encode %{
 6780     assert(UseAVX > 1, "required");
 6781     int opcode = this->ideal_Opcode();
 6782     bool sign = (opcode != Op_URShiftVB);
 6783     int vlen_enc = Assembler::AVX_256bit;
 6784     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6785     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6786     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6787     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6788     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6789     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6790     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6791     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6792     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6793   %}
 6794   ins_pipe( pipe_slow );
 6795 %}
 6796 
 6797 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6798   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6799   match(Set dst ( LShiftVB src shift));
 6800   match(Set dst  (RShiftVB src shift));
 6801   match(Set dst (URShiftVB src shift));
 6802   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6803   format %{"vector_byte_shift $dst,$src,$shift" %}
 6804   ins_encode %{
 6805     assert(UseAVX > 2, "required");
 6806     int opcode = this->ideal_Opcode();
 6807     bool sign = (opcode != Op_URShiftVB);
 6808     int vlen_enc = Assembler::AVX_512bit;
 6809     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6810     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6811     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6812     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6813     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6814     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6815     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6816     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6817     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6818     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6819     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6820     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6821   %}
 6822   ins_pipe( pipe_slow );
 6823 %}
 6824 
 6825 // Shorts vector logical right shift produces an incorrect Java result
 6826 // for negative data because Java code converts a short value into an int with
 6827 // sign extension before the shift. But char vectors are fine since chars are
 6828 // unsigned values.
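      // Example: for a short element s = -1 (0xFFFF), Java evaluates
      // (short)(s >>> 3) as (short)(0xFFFFFFFF >>> 3) = (short)0x1FFFFFFF = -1,
      // whereas a 16-bit logical shift would give 0x1FFF.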
 6829 // Shorts/Chars vector left shift
 6830 instruct vshiftS(vec dst, vec src, vec shift) %{
 6831   predicate(!n->as_ShiftV()->is_var_shift());
 6832   match(Set dst ( LShiftVS src shift));
 6833   match(Set dst ( RShiftVS src shift));
 6834   match(Set dst (URShiftVS src shift));
 6835   effect(TEMP dst, USE src, USE shift);
 6836   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6837   ins_encode %{
 6838     int opcode = this->ideal_Opcode();
 6839     if (UseAVX > 0) {
 6840       int vlen_enc = vector_length_encoding(this);
 6841       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6842     } else {
 6843       int vlen = Matcher::vector_length(this);
 6844       if (vlen == 2) {
 6845         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6846         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6847       } else if (vlen == 4) {
 6848         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6849         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6850       } else {
 6851         assert (vlen == 8, "sanity");
 6852         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6853         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6854       }
 6855     }
 6856   %}
 6857   ins_pipe( pipe_slow );
 6858 %}
 6859 
 6860 // Integers vector left shift
 6861 instruct vshiftI(vec dst, vec src, vec shift) %{
 6862   predicate(!n->as_ShiftV()->is_var_shift());
 6863   match(Set dst ( LShiftVI src shift));
 6864   match(Set dst ( RShiftVI src shift));
 6865   match(Set dst (URShiftVI src shift));
 6866   effect(TEMP dst, USE src, USE shift);
 6867   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6868   ins_encode %{
 6869     int opcode = this->ideal_Opcode();
 6870     if (UseAVX > 0) {
 6871       int vlen_enc = vector_length_encoding(this);
 6872       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6873     } else {
 6874       int vlen = Matcher::vector_length(this);
 6875       if (vlen == 2) {
 6876         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6877         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6878       } else {
 6879         assert(vlen == 4, "sanity");
 6880         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6881         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6882       }
 6883     }
 6884   %}
 6885   ins_pipe( pipe_slow );
 6886 %}
 6887 
 6888 // Integers vector left constant shift
 6889 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6890   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6891   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6892   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6893   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6894   ins_encode %{
 6895     int opcode = this->ideal_Opcode();
 6896     if (UseAVX > 0) {
 6897       int vector_len = vector_length_encoding(this);
 6898       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6899     } else {
 6900       int vlen = Matcher::vector_length(this);
 6901       if (vlen == 2) {
 6902         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6903         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6904       } else {
 6905         assert(vlen == 4, "sanity");
 6906         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6907         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6908       }
 6909     }
 6910   %}
 6911   ins_pipe( pipe_slow );
 6912 %}
 6913 
 6914 // Longs vector shift
 6915 instruct vshiftL(vec dst, vec src, vec shift) %{
 6916   predicate(!n->as_ShiftV()->is_var_shift());
 6917   match(Set dst ( LShiftVL src shift));
 6918   match(Set dst (URShiftVL src shift));
 6919   effect(TEMP dst, USE src, USE shift);
 6920   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6921   ins_encode %{
 6922     int opcode = this->ideal_Opcode();
 6923     if (UseAVX > 0) {
 6924       int vlen_enc = vector_length_encoding(this);
 6925       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6926     } else {
 6927       assert(Matcher::vector_length(this) == 2, "");
 6928       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6929       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6930     }
 6931   %}
 6932   ins_pipe( pipe_slow );
 6933 %}
 6934 
 6935 // Longs vector constant shift
 6936 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6937   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6938   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6939   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6940   ins_encode %{
 6941     int opcode = this->ideal_Opcode();
 6942     if (UseAVX > 0) {
 6943       int vector_len = vector_length_encoding(this);
 6944       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6945     } else {
 6946       assert(Matcher::vector_length(this) == 2, "");
 6947       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6948       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6949     }
 6950   %}
 6951   ins_pipe( pipe_slow );
 6952 %}
 6953 
 6954 // -------------------ArithmeticRightShift -----------------------------------
 6955 // Long vector arithmetic right shift
 6956 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6957   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6958   match(Set dst (RShiftVL src shift));
 6959   effect(TEMP dst, TEMP tmp);
 6960   format %{ "vshiftq $dst,$src,$shift" %}
 6961   ins_encode %{
 6962     uint vlen = Matcher::vector_length(this);
 6963     if (vlen == 2) {
 6964       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6965       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6966       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6967       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6968       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6969       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6970     } else {
 6971       assert(vlen == 4, "sanity");
 6972       assert(UseAVX > 1, "required");
 6973       int vlen_enc = Assembler::AVX_256bit;
 6974       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6975       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6976       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6977       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6978       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6979     }
 6980   %}
 6981   ins_pipe( pipe_slow );
 6982 %}
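      // Without evpsraq (AVX-512), the rule above emulates a 64-bit arithmetic
      // right shift with a logical shift plus the classic xor/sub sign-extension
      // trick. Per lane (illustration only):
      //
      //   int64_t sra64(int64_t x, int s) {
      //     uint64_t m = 0x8000000000000000ULL >> s;  // vector_long_sign_mask, shifted logically
      //     uint64_t u = (uint64_t)x >> s;            // psrlq
      //     return (int64_t)((u ^ m) - m);            // pxor + psubq re-create the sign bits
      //   }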
 6983 
 6984 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6985   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6986   match(Set dst (RShiftVL src shift));
 6987   format %{ "vshiftq $dst,$src,$shift" %}
 6988   ins_encode %{
 6989     int vlen_enc = vector_length_encoding(this);
 6990     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6991   %}
 6992   ins_pipe( pipe_slow );
 6993 %}
 6994 
 6995 // ------------------- Variable Shift -----------------------------
 6996 // Byte variable shift
 6997 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6998   predicate(Matcher::vector_length(n) <= 8 &&
 6999             n->as_ShiftV()->is_var_shift() &&
 7000             !VM_Version::supports_avx512bw());
 7001   match(Set dst ( LShiftVB src shift));
 7002   match(Set dst ( RShiftVB src shift));
 7003   match(Set dst (URShiftVB src shift));
 7004   effect(TEMP dst, TEMP vtmp);
 7005   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7006   ins_encode %{
 7007     assert(UseAVX >= 2, "required");
 7008 
 7009     int opcode = this->ideal_Opcode();
 7010     int vlen_enc = Assembler::AVX_128bit;
 7011     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7012     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7013   %}
 7014   ins_pipe( pipe_slow );
 7015 %}
 7016 
 7017 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7018   predicate(Matcher::vector_length(n) == 16 &&
 7019             n->as_ShiftV()->is_var_shift() &&
 7020             !VM_Version::supports_avx512bw());
 7021   match(Set dst ( LShiftVB src shift));
 7022   match(Set dst ( RShiftVB src shift));
 7023   match(Set dst (URShiftVB src shift));
 7024   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7025   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7026   ins_encode %{
 7027     assert(UseAVX >= 2, "required");
 7028 
 7029     int opcode = this->ideal_Opcode();
 7030     int vlen_enc = Assembler::AVX_128bit;
 7031     // Shift lower half and get word result in dst
 7032     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7033 
 7034     // Shift upper half and get word result in vtmp1
 7035     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7036     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7037     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7038 
 7039     // Merge and down convert the two word results to byte in dst
 7040     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7041   %}
 7042   ins_pipe( pipe_slow );
 7043 %}
 7044 
 7045 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7046   predicate(Matcher::vector_length(n) == 32 &&
 7047             n->as_ShiftV()->is_var_shift() &&
 7048             !VM_Version::supports_avx512bw());
 7049   match(Set dst ( LShiftVB src shift));
 7050   match(Set dst ( RShiftVB src shift));
 7051   match(Set dst (URShiftVB src shift));
 7052   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7053   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7054   ins_encode %{
 7055     assert(UseAVX >= 2, "required");
 7056 
 7057     int opcode = this->ideal_Opcode();
 7058     int vlen_enc = Assembler::AVX_128bit;
 7059     // Process lower 128 bits and get result in dst
 7060     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7061     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7062     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7063     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7064     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7065 
 7066     // Process higher 128 bits and get result in vtmp3
 7067     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7068     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7069     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7070     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7071     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7072     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7073     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7074 
 7075     // Merge the two results in dst
 7076     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7077   %}
 7078   ins_pipe( pipe_slow );
 7079 %}
 7080 
 7081 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7082   predicate(Matcher::vector_length(n) <= 32 &&
 7083             n->as_ShiftV()->is_var_shift() &&
 7084             VM_Version::supports_avx512bw());
 7085   match(Set dst ( LShiftVB src shift));
 7086   match(Set dst ( RShiftVB src shift));
 7087   match(Set dst (URShiftVB src shift));
 7088   effect(TEMP dst, TEMP vtmp);
 7089   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7090   ins_encode %{
 7091     assert(UseAVX > 2, "required");
 7092 
 7093     int opcode = this->ideal_Opcode();
 7094     int vlen_enc = vector_length_encoding(this);
 7095     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7096   %}
 7097   ins_pipe( pipe_slow );
 7098 %}
 7099 
 7100 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7101   predicate(Matcher::vector_length(n) == 64 &&
 7102             n->as_ShiftV()->is_var_shift() &&
 7103             VM_Version::supports_avx512bw());
 7104   match(Set dst ( LShiftVB src shift));
 7105   match(Set dst ( RShiftVB src shift));
 7106   match(Set dst (URShiftVB src shift));
 7107   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7108   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7109   ins_encode %{
 7110     assert(UseAVX > 2, "required");
 7111 
 7112     int opcode = this->ideal_Opcode();
 7113     int vlen_enc = Assembler::AVX_256bit;
 7114     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7115     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7116     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7117     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7118     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7119   %}
 7120   ins_pipe( pipe_slow );
 7121 %}
 7122 
 7123 // Short variable shift
 7124 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7125   predicate(Matcher::vector_length(n) <= 8 &&
 7126             n->as_ShiftV()->is_var_shift() &&
 7127             !VM_Version::supports_avx512bw());
 7128   match(Set dst ( LShiftVS src shift));
 7129   match(Set dst ( RShiftVS src shift));
 7130   match(Set dst (URShiftVS src shift));
 7131   effect(TEMP dst, TEMP vtmp);
 7132   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7133   ins_encode %{
 7134     assert(UseAVX >= 2, "required");
 7135 
 7136     int opcode = this->ideal_Opcode();
 7137     bool sign = (opcode != Op_URShiftVS);
 7138     int vlen_enc = Assembler::AVX_256bit;
 7139     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7140     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7141     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7142     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7143     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7144     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7145   %}
 7146   ins_pipe( pipe_slow );
 7147 %}
 7148 
 7149 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7150   predicate(Matcher::vector_length(n) == 16 &&
 7151             n->as_ShiftV()->is_var_shift() &&
 7152             !VM_Version::supports_avx512bw());
 7153   match(Set dst ( LShiftVS src shift));
 7154   match(Set dst ( RShiftVS src shift));
 7155   match(Set dst (URShiftVS src shift));
 7156   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7157   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7158   ins_encode %{
 7159     assert(UseAVX >= 2, "required");
 7160 
 7161     int opcode = this->ideal_Opcode();
 7162     bool sign = (opcode != Op_URShiftVS);
 7163     int vlen_enc = Assembler::AVX_256bit;
 7164     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7165     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7166     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7167     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7168     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7169 
 7170     // Shift upper half, with result in dst using vtmp1 as TEMP
 7171     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7172     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7173     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7174     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7175     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7176     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7177 
 7178     // Merge lower and upper half result into dst
 7179     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7180     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7181   %}
 7182   ins_pipe( pipe_slow );
 7183 %}
 7184 
 7185 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7186   predicate(n->as_ShiftV()->is_var_shift() &&
 7187             VM_Version::supports_avx512bw());
 7188   match(Set dst ( LShiftVS src shift));
 7189   match(Set dst ( RShiftVS src shift));
 7190   match(Set dst (URShiftVS src shift));
 7191   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7192   ins_encode %{
 7193     assert(UseAVX > 2, "required");
 7194 
 7195     int opcode = this->ideal_Opcode();
 7196     int vlen_enc = vector_length_encoding(this);
 7197     if (!VM_Version::supports_avx512vl()) {
 7198       vlen_enc = Assembler::AVX_512bit;
 7199     }
 7200     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7201   %}
 7202   ins_pipe( pipe_slow );
 7203 %}
 7204 
 7205 // Integer variable shift
 7206 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7207   predicate(n->as_ShiftV()->is_var_shift());
 7208   match(Set dst ( LShiftVI src shift));
 7209   match(Set dst ( RShiftVI src shift));
 7210   match(Set dst (URShiftVI src shift));
 7211   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7212   ins_encode %{
 7213     assert(UseAVX >= 2, "required");
 7214 
 7215     int opcode = this->ideal_Opcode();
 7216     int vlen_enc = vector_length_encoding(this);
 7217     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7218   %}
 7219   ins_pipe( pipe_slow );
 7220 %}
 7221 
 7222 // Long variable shift
 7223 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7224   predicate(n->as_ShiftV()->is_var_shift());
 7225   match(Set dst ( LShiftVL src shift));
 7226   match(Set dst (URShiftVL src shift));
 7227   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7228   ins_encode %{
 7229     assert(UseAVX >= 2, "required");
 7230 
 7231     int opcode = this->ideal_Opcode();
 7232     int vlen_enc = vector_length_encoding(this);
 7233     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7234   %}
 7235   ins_pipe( pipe_slow );
 7236 %}
 7237 
 7238 // Long variable arithmetic right shift
 7239 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7240   predicate(Matcher::vector_length(n) <= 4 &&
 7241             n->as_ShiftV()->is_var_shift() &&
 7242             UseAVX == 2);
 7243   match(Set dst (RShiftVL src shift));
 7244   effect(TEMP dst, TEMP vtmp);
 7245   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7246   ins_encode %{
 7247     int opcode = this->ideal_Opcode();
 7248     int vlen_enc = vector_length_encoding(this);
 7249     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7250                  $vtmp$$XMMRegister);
 7251   %}
 7252   ins_pipe( pipe_slow );
 7253 %}
 7254 
 7255 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7256   predicate(n->as_ShiftV()->is_var_shift() &&
 7257             UseAVX > 2);
 7258   match(Set dst (RShiftVL src shift));
 7259   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7260   ins_encode %{
 7261     int opcode = this->ideal_Opcode();
 7262     int vlen_enc = vector_length_encoding(this);
 7263     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7264   %}
 7265   ins_pipe( pipe_slow );
 7266 %}
 7267 
 7268 // --------------------------------- AND --------------------------------------
 7269 
 7270 instruct vand(vec dst, vec src) %{
 7271   predicate(UseAVX == 0);
 7272   match(Set dst (AndV dst src));
 7273   format %{ "pand    $dst,$src\t! and vectors" %}
 7274   ins_encode %{
 7275     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7276   %}
 7277   ins_pipe( pipe_slow );
 7278 %}
 7279 
 7280 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7281   predicate(UseAVX > 0);
 7282   match(Set dst (AndV src1 src2));
 7283   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7284   ins_encode %{
 7285     int vlen_enc = vector_length_encoding(this);
 7286     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7287   %}
 7288   ins_pipe( pipe_slow );
 7289 %}
 7290 
 7291 instruct vand_mem(vec dst, vec src, memory mem) %{
 7292   predicate((UseAVX > 0) &&
 7293             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7294   match(Set dst (AndV src (LoadVector mem)));
 7295   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7296   ins_encode %{
 7297     int vlen_enc = vector_length_encoding(this);
 7298     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7299   %}
 7300   ins_pipe( pipe_slow );
 7301 %}
 7302 
 7303 // --------------------------------- OR ---------------------------------------
 7304 
 7305 instruct vor(vec dst, vec src) %{
 7306   predicate(UseAVX == 0);
 7307   match(Set dst (OrV dst src));
 7308   format %{ "por     $dst,$src\t! or vectors" %}
 7309   ins_encode %{
 7310     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7311   %}
 7312   ins_pipe( pipe_slow );
 7313 %}
 7314 
 7315 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7316   predicate(UseAVX > 0);
 7317   match(Set dst (OrV src1 src2));
 7318   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7319   ins_encode %{
 7320     int vlen_enc = vector_length_encoding(this);
 7321     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7322   %}
 7323   ins_pipe( pipe_slow );
 7324 %}
 7325 
 7326 instruct vor_mem(vec dst, vec src, memory mem) %{
 7327   predicate((UseAVX > 0) &&
 7328             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7329   match(Set dst (OrV src (LoadVector mem)));
 7330   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7331   ins_encode %{
 7332     int vlen_enc = vector_length_encoding(this);
 7333     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7334   %}
 7335   ins_pipe( pipe_slow );
 7336 %}
 7337 
 7338 // --------------------------------- XOR --------------------------------------
 7339 
 7340 instruct vxor(vec dst, vec src) %{
 7341   predicate(UseAVX == 0);
 7342   match(Set dst (XorV dst src));
 7343   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7344   ins_encode %{
 7345     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7346   %}
 7347   ins_pipe( pipe_slow );
 7348 %}
 7349 
 7350 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7351   predicate(UseAVX > 0);
 7352   match(Set dst (XorV src1 src2));
 7353   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7354   ins_encode %{
 7355     int vlen_enc = vector_length_encoding(this);
 7356     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7357   %}
 7358   ins_pipe( pipe_slow );
 7359 %}
 7360 
 7361 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7362   predicate((UseAVX > 0) &&
 7363             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7364   match(Set dst (XorV src (LoadVector mem)));
 7365   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7366   ins_encode %{
 7367     int vlen_enc = vector_length_encoding(this);
 7368     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7369   %}
 7370   ins_pipe( pipe_slow );
 7371 %}
 7372 
 7373 // --------------------------------- VectorCast --------------------------------------
 7374 
 7375 instruct vcastBtoX(vec dst, vec src) %{
 7376   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7377   match(Set dst (VectorCastB2X src));
 7378   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7379   ins_encode %{
 7380     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7381     int vlen_enc = vector_length_encoding(this);
 7382     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7383   %}
 7384   ins_pipe( pipe_slow );
 7385 %}
 7386 
 7387 instruct vcastBtoD(legVec dst, legVec src) %{
 7388   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7389   match(Set dst (VectorCastB2X src));
 7390   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7391   ins_encode %{
 7392     int vlen_enc = vector_length_encoding(this);
 7393     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7394   %}
 7395   ins_pipe( pipe_slow );
 7396 %}
 7397 
 7398 instruct castStoX(vec dst, vec src) %{
 7399   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7400             Matcher::vector_length(n->in(1)) <= 8 && // src
 7401             Matcher::vector_element_basic_type(n) == T_BYTE);
 7402   match(Set dst (VectorCastS2X src));
 7403   format %{ "vector_cast_s2x $dst,$src" %}
 7404   ins_encode %{
 7405     assert(UseAVX > 0, "required");
 7406 
 7407     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7408     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7409   %}
 7410   ins_pipe( pipe_slow );
 7411 %}
 7412 
 7413 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7414   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7415             Matcher::vector_length(n->in(1)) == 16 && // src
 7416             Matcher::vector_element_basic_type(n) == T_BYTE);
 7417   effect(TEMP dst, TEMP vtmp);
 7418   match(Set dst (VectorCastS2X src));
 7419   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7420   ins_encode %{
 7421     assert(UseAVX > 0, "required");
 7422 
 7423     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7424     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7425     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7426     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7427   %}
 7428   ins_pipe( pipe_slow );
 7429 %}
 7430 
 7431 instruct vcastStoX_evex(vec dst, vec src) %{
 7432   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7433             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7434   match(Set dst (VectorCastS2X src));
 7435   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7436   ins_encode %{
 7437     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7438     int src_vlen_enc = vector_length_encoding(this, $src);
 7439     int vlen_enc = vector_length_encoding(this);
 7440     switch (to_elem_bt) {
 7441       case T_BYTE:
 7442         if (!VM_Version::supports_avx512vl()) {
 7443           vlen_enc = Assembler::AVX_512bit;
 7444         }
 7445         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7446         break;
 7447       case T_INT:
 7448         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7449         break;
 7450       case T_FLOAT:
 7451         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7452         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7453         break;
 7454       case T_LONG:
 7455         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7456         break;
 7457       case T_DOUBLE: {
 7458         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7459         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7460         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7461         break;
 7462       }
 7463       default:
 7464         ShouldNotReachHere();
 7465     }
 7466   %}
 7467   ins_pipe( pipe_slow );
 7468 %}
 7469 
 7470 instruct castItoX(vec dst, vec src) %{
 7471   predicate(UseAVX <= 2 &&
 7472             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7473             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7474   match(Set dst (VectorCastI2X src));
 7475   format %{ "vector_cast_i2x $dst,$src" %}
 7476   ins_encode %{
 7477     assert(UseAVX > 0, "required");
 7478 
 7479     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7480     int vlen_enc = vector_length_encoding(this, $src);
 7481 
 7482     if (to_elem_bt == T_BYTE) {
 7483       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7484       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7485       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7486     } else {
 7487       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7488       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7489       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7490     }
 7491   %}
 7492   ins_pipe( pipe_slow );
 7493 %}
 7494 
 7495 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7496   predicate(UseAVX <= 2 &&
 7497             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7498             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7499   match(Set dst (VectorCastI2X src));
 7500   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7501   effect(TEMP dst, TEMP vtmp);
 7502   ins_encode %{
 7503     assert(UseAVX > 0, "required");
 7504 
 7505     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7506     int vlen_enc = vector_length_encoding(this, $src);
 7507 
 7508     if (to_elem_bt == T_BYTE) {
 7509       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7510       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7511       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7512       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7513     } else {
 7514       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7515       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7516       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7517       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7518     }
 7519   %}
 7520   ins_pipe( pipe_slow );
 7521 %}
 7522 
 7523 instruct vcastItoX_evex(vec dst, vec src) %{
 7524   predicate(UseAVX > 2 ||
 7525             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7526   match(Set dst (VectorCastI2X src));
 7527   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7528   ins_encode %{
 7529     assert(UseAVX > 0, "required");
 7530 
 7531     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7532     int src_vlen_enc = vector_length_encoding(this, $src);
 7533     int dst_vlen_enc = vector_length_encoding(this);
 7534     switch (dst_elem_bt) {
 7535       case T_BYTE:
 7536         if (!VM_Version::supports_avx512vl()) {
 7537           src_vlen_enc = Assembler::AVX_512bit;
 7538         }
 7539         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7540         break;
 7541       case T_SHORT:
 7542         if (!VM_Version::supports_avx512vl()) {
 7543           src_vlen_enc = Assembler::AVX_512bit;
 7544         }
 7545         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7546         break;
 7547       case T_FLOAT:
 7548         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7549         break;
 7550       case T_LONG:
 7551         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7552         break;
 7553       case T_DOUBLE:
 7554         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7555         break;
 7556       default:
 7557         ShouldNotReachHere();
 7558     }
 7559   %}
 7560   ins_pipe( pipe_slow );
 7561 %}
 7562 
 7563 instruct vcastLtoBS(vec dst, vec src) %{
 7564   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7565             UseAVX <= 2);
 7566   match(Set dst (VectorCastL2X src));
 7567   format %{ "vector_cast_l2x  $dst,$src" %}
 7568   ins_encode %{
 7569     assert(UseAVX > 0, "required");
 7570 
 7571     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7572     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7573     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7574                                                       : ExternalAddress(vector_int_to_short_mask());
 7575     if (vlen <= 16) {
 7576       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7577       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7578       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7579     } else {
 7580       assert(vlen <= 32, "required");
 7581       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7582       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7583       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7584       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7585     }
 7586     if (to_elem_bt == T_BYTE) {
 7587       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7588     }
 7589   %}
 7590   ins_pipe( pipe_slow );
 7591 %}
 7592 
 7593 instruct vcastLtoX_evex(vec dst, vec src) %{
 7594   predicate(UseAVX > 2 ||
 7595             (Matcher::vector_element_basic_type(n) == T_INT ||
 7596              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7597              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7598   match(Set dst (VectorCastL2X src));
 7599   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7600   ins_encode %{
 7601     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7602     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7603     int vlen_enc = vector_length_encoding(this, $src);
 7604     switch (to_elem_bt) {
 7605       case T_BYTE:
 7606         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7607           vlen_enc = Assembler::AVX_512bit;
 7608         }
 7609         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7610         break;
 7611       case T_SHORT:
 7612         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7613           vlen_enc = Assembler::AVX_512bit;
 7614         }
 7615         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7616         break;
 7617       case T_INT:
 7618         if (vlen == 8) {
 7619           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7620             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7621           }
 7622         } else if (vlen == 16) {
 7623           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7624         } else if (vlen == 32) {
 7625           if (UseAVX > 2) {
 7626             if (!VM_Version::supports_avx512vl()) {
 7627               vlen_enc = Assembler::AVX_512bit;
 7628             }
 7629             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7630           } else {
 7631             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7632             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7633           }
 7634         } else { // vlen == 64
 7635           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7636         }
 7637         break;
 7638       case T_FLOAT:
 7639         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7640         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7641         break;
 7642       case T_DOUBLE:
 7643         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7644         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7645         break;
 7646 
 7647       default: assert(false, "%s", type2name(to_elem_bt));
 7648     }
 7649   %}
 7650   ins_pipe( pipe_slow );
 7651 %}
 7652 
 7653 instruct vcastFtoD_reg(vec dst, vec src) %{
 7654   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7655   match(Set dst (VectorCastF2X src));
 7656   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7657   ins_encode %{
 7658     int vlen_enc = vector_length_encoding(this);
 7659     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7660   %}
 7661   ins_pipe( pipe_slow );
 7662 %}
 7663 
 7664 
 7665 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7666   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7667             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7668   match(Set dst (VectorCastF2X src));
 7669   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7670   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7671   ins_encode %{
 7672     int vlen_enc = vector_length_encoding(this, $src);
 7673     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register to load addresses wider
    // than 32 bits in register-indirect addressing mode: stub constants live in the code
    // cache, and ReservedCodeCacheSize is currently capped at 2G.
    // Targets are free to raise that limit, but a code cache larger than 2G is unrealistic
    // in practice; on the flip side, with the given cap we avoid allocating a temporary
    // register, which in the limiting case can prevent spilling in blocks with high
    // register pressure.
 7681     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7682                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7683                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7684   %}
 7685   ins_pipe( pipe_slow );
 7686 %}
 7687 
 7688 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7689   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7690             is_integral_type(Matcher::vector_element_basic_type(n)));
 7691   match(Set dst (VectorCastF2X src));
 7692   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7693   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7694   ins_encode %{
 7695     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7696     if (to_elem_bt == T_LONG) {
 7697       int vlen_enc = vector_length_encoding(this);
 7698       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7699                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7700                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7701     } else {
 7702       int vlen_enc = vector_length_encoding(this, $src);
 7703       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7704                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7705                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7706     }
 7707   %}
 7708   ins_pipe( pipe_slow );
 7709 %}
 7710 
 7711 instruct vcastDtoF_reg(vec dst, vec src) %{
 7712   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7713   match(Set dst (VectorCastD2X src));
 7714   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7715   ins_encode %{
 7716     int vlen_enc = vector_length_encoding(this, $src);
 7717     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7718   %}
 7719   ins_pipe( pipe_slow );
 7720 %}
 7721 
 7722 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7723   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7724             is_integral_type(Matcher::vector_element_basic_type(n)));
 7725   match(Set dst (VectorCastD2X src));
 7726   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7727   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7728   ins_encode %{
 7729     int vlen_enc = vector_length_encoding(this, $src);
 7730     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7731     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7732                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7733                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7734   %}
 7735   ins_pipe( pipe_slow );
 7736 %}
 7737 
 7738 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7739   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7740             is_integral_type(Matcher::vector_element_basic_type(n)));
 7741   match(Set dst (VectorCastD2X src));
 7742   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7743   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7744   ins_encode %{
 7745     int vlen_enc = vector_length_encoding(this, $src);
 7746     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7747     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7748                               ExternalAddress(vector_float_signflip());
 7749     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7750                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7751   %}
 7752   ins_pipe( pipe_slow );
 7753 %}
 7754 
 7755 instruct vucast(vec dst, vec src) %{
 7756   match(Set dst (VectorUCastB2X src));
 7757   match(Set dst (VectorUCastS2X src));
 7758   match(Set dst (VectorUCastI2X src));
 7759   format %{ "vector_ucast $dst,$src\t!" %}
 7760   ins_encode %{
 7761     assert(UseAVX > 0, "required");
 7762 
 7763     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7764     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7765     int vlen_enc = vector_length_encoding(this);
 7766     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7767   %}
 7768   ins_pipe( pipe_slow );
 7769 %}
 7770 
 7771 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7772   predicate(!VM_Version::supports_avx512vl() &&
 7773             Matcher::vector_length_in_bytes(n) < 64 &&
 7774             Matcher::vector_element_basic_type(n) == T_INT);
 7775   match(Set dst (RoundVF src));
 7776   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7777   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7778   ins_encode %{
 7779     int vlen_enc = vector_length_encoding(this);
 7780     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7781     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7782                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7783                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7784   %}
 7785   ins_pipe( pipe_slow );
 7786 %}
 7787 
 7788 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7789   predicate((VM_Version::supports_avx512vl() ||
 7790              Matcher::vector_length_in_bytes(n) == 64) &&
 7791              Matcher::vector_element_basic_type(n) == T_INT);
 7792   match(Set dst (RoundVF src));
 7793   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7794   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7795   ins_encode %{
 7796     int vlen_enc = vector_length_encoding(this);
 7797     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7798     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7799                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7800                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7801   %}
 7802   ins_pipe( pipe_slow );
 7803 %}
 7804 
 7805 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7806   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7807   match(Set dst (RoundVD src));
 7808   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7809   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7810   ins_encode %{
 7811     int vlen_enc = vector_length_encoding(this);
 7812     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7813     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7814                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7815                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7816   %}
 7817   ins_pipe( pipe_slow );
 7818 %}
 7819 
 7820 // --------------------------------- VectorMaskCmp --------------------------------------
 7821 
 7822 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7823   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7824             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7825             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7826             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7827   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7828   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7829   ins_encode %{
 7830     int vlen_enc = vector_length_encoding(this, $src1);
 7831     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7832     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7833       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7834     } else {
 7835       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7836     }
 7837   %}
 7838   ins_pipe( pipe_slow );
 7839 %}
 7840 
 7841 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7842   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7843             n->bottom_type()->isa_vectmask() == nullptr &&
 7844             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7845   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7846   effect(TEMP ktmp);
 7847   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7848   ins_encode %{
 7849     int vlen_enc = Assembler::AVX_512bit;
 7850     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7851     KRegister mask = k0; // The comparison itself is not being masked.
 7852     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7853       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7854       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7855     } else {
 7856       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7857       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7858     }
 7859   %}
 7860   ins_pipe( pipe_slow );
 7861 %}
 7862 
 7863 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7864   predicate(n->bottom_type()->isa_vectmask() &&
 7865             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7866   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7867   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7868   ins_encode %{
 7869     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7870     int vlen_enc = vector_length_encoding(this, $src1);
 7871     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7872     KRegister mask = k0; // The comparison itself is not being masked.
 7873     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7874       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7875     } else {
 7876       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7877     }
 7878   %}
 7879   ins_pipe( pipe_slow );
 7880 %}
 7881 
 7882 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7883   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7884             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7885             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7886             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7887             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7888             (n->in(2)->get_int() == BoolTest::eq ||
 7889              n->in(2)->get_int() == BoolTest::lt ||
 7890              n->in(2)->get_int() == BoolTest::gt)); // cond
 7891   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7892   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7893   ins_encode %{
 7894     int vlen_enc = vector_length_encoding(this, $src1);
 7895     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7896     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7897     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7898   %}
 7899   ins_pipe( pipe_slow );
 7900 %}
 7901 
 7902 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7903   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7904             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7905             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7906             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7907             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7908             (n->in(2)->get_int() == BoolTest::ne ||
 7909              n->in(2)->get_int() == BoolTest::le ||
 7910              n->in(2)->get_int() == BoolTest::ge)); // cond
 7911   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7912   effect(TEMP dst, TEMP xtmp);
 7913   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7914   ins_encode %{
 7915     int vlen_enc = vector_length_encoding(this, $src1);
 7916     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7917     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7918     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7919   %}
 7920   ins_pipe( pipe_slow );
 7921 %}
 7922 
 7923 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7924   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7925             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7926             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7927             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7928             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7929   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7930   effect(TEMP dst, TEMP xtmp);
 7931   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7932   ins_encode %{
 7933     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7934     int vlen_enc = vector_length_encoding(this, $src1);
 7935     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7936     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7937 
 7938     if (vlen_enc == Assembler::AVX_128bit) {
 7939       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7940     } else {
 7941       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7942     }
 7943     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7944     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7945     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7946   %}
 7947   ins_pipe( pipe_slow );
 7948 %}
 7949 
 7950 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7951   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7952              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7953              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7954   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7955   effect(TEMP ktmp);
 7956   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7957   ins_encode %{
 7958     assert(UseAVX > 2, "required");
 7959 
 7960     int vlen_enc = vector_length_encoding(this, $src1);
 7961     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7962     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7963     KRegister mask = k0; // The comparison itself is not being masked.
 7964     bool merge = false;
 7965     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7966 
 7967     switch (src1_elem_bt) {
 7968       case T_INT: {
 7969         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7970         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7971         break;
 7972       }
 7973       case T_LONG: {
 7974         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7975         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7976         break;
 7977       }
 7978       default: assert(false, "%s", type2name(src1_elem_bt));
 7979     }
 7980   %}
 7981   ins_pipe( pipe_slow );
 7982 %}
 7983 
 7984 
 7985 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7986   predicate(n->bottom_type()->isa_vectmask() &&
 7987             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7988   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7989   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7990   ins_encode %{
 7991     assert(UseAVX > 2, "required");
 7992     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7993 
 7994     int vlen_enc = vector_length_encoding(this, $src1);
 7995     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7996     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7997     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7998 
    // Element-wise comparison into the destination mask register.
 8000     switch (src1_elem_bt) {
 8001       case T_BYTE: {
 8002         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8003         break;
 8004       }
 8005       case T_SHORT: {
 8006         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8007         break;
 8008       }
 8009       case T_INT: {
 8010         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8011         break;
 8012       }
 8013       case T_LONG: {
 8014         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8015         break;
 8016       }
 8017       default: assert(false, "%s", type2name(src1_elem_bt));
 8018     }
 8019   %}
 8020   ins_pipe( pipe_slow );
 8021 %}
 8022 
 8023 // Extract
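// For vectors wider than 128 bits, extraction is split in two: get_lane first isolates the
// 128-bit lane containing the element into a temporary, then get_elem pulls the element out
// of that lane.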
 8024 
 8025 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8026   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8027   match(Set dst (ExtractI src idx));
 8028   match(Set dst (ExtractS src idx));
 8029   match(Set dst (ExtractB src idx));
 8030   format %{ "extractI $dst,$src,$idx\t!" %}
 8031   ins_encode %{
 8032     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8033 
 8034     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8035     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8036   %}
 8037   ins_pipe( pipe_slow );
 8038 %}
 8039 
 8040 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8041   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8042             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8043   match(Set dst (ExtractI src idx));
 8044   match(Set dst (ExtractS src idx));
 8045   match(Set dst (ExtractB src idx));
 8046   effect(TEMP vtmp);
 8047   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8048   ins_encode %{
 8049     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8050 
 8051     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8052     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8053     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8054   %}
 8055   ins_pipe( pipe_slow );
 8056 %}
 8057 
 8058 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8059   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8060   match(Set dst (ExtractL src idx));
 8061   format %{ "extractL $dst,$src,$idx\t!" %}
 8062   ins_encode %{
 8063     assert(UseSSE >= 4, "required");
 8064     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8065 
 8066     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8067   %}
 8068   ins_pipe( pipe_slow );
 8069 %}
 8070 
 8071 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8072   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8073             Matcher::vector_length(n->in(1)) == 8);  // src
 8074   match(Set dst (ExtractL src idx));
 8075   effect(TEMP vtmp);
 8076   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8077   ins_encode %{
 8078     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8079 
 8080     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8081     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8082   %}
 8083   ins_pipe( pipe_slow );
 8084 %}
 8085 
 8086 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8087   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8088   match(Set dst (ExtractF src idx));
 8089   effect(TEMP dst, TEMP vtmp);
 8090   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8091   ins_encode %{
 8092     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8093 
 8094     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8095   %}
 8096   ins_pipe( pipe_slow );
 8097 %}
 8098 
 8099 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8100   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8101             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8102   match(Set dst (ExtractF src idx));
 8103   effect(TEMP vtmp);
 8104   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8105   ins_encode %{
 8106     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8107 
 8108     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8109     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8110   %}
 8111   ins_pipe( pipe_slow );
 8112 %}
 8113 
 8114 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8115   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8116   match(Set dst (ExtractD src idx));
 8117   format %{ "extractD $dst,$src,$idx\t!" %}
 8118   ins_encode %{
 8119     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8120 
 8121     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8122   %}
 8123   ins_pipe( pipe_slow );
 8124 %}
 8125 
 8126 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8127   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8128             Matcher::vector_length(n->in(1)) == 8);  // src
 8129   match(Set dst (ExtractD src idx));
 8130   effect(TEMP vtmp);
 8131   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8132   ins_encode %{
 8133     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8134 
 8135     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8136     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8137   %}
 8138   ins_pipe( pipe_slow );
 8139 %}
 8140 
 8141 // --------------------------------- Vector Blend --------------------------------------
 8142 
 8143 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8144   predicate(UseAVX == 0);
 8145   match(Set dst (VectorBlend (Binary dst src) mask));
 8146   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8147   effect(TEMP tmp);
 8148   ins_encode %{
 8149     assert(UseSSE >= 4, "required");
 8150 
 8151     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8152       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8153     }
 8154     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8155   %}
 8156   ins_pipe( pipe_slow );
 8157 %}
 8158 
 8159 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8160   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8161             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8162             Matcher::vector_length_in_bytes(n) <= 32 &&
 8163             is_integral_type(Matcher::vector_element_basic_type(n)));
 8164   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8165   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8166   ins_encode %{
 8167     int vlen_enc = vector_length_encoding(this);
 8168     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8169   %}
 8170   ins_pipe( pipe_slow );
 8171 %}
 8172 
 8173 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8174   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8175             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8176             Matcher::vector_length_in_bytes(n) <= 32 &&
 8177             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8178   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8179   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8180   ins_encode %{
 8181     int vlen_enc = vector_length_encoding(this);
 8182     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8183   %}
 8184   ins_pipe( pipe_slow );
 8185 %}
 8186 
 8187 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8188   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8189             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8190             Matcher::vector_length_in_bytes(n) <= 32);
 8191   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8192   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8193   effect(TEMP vtmp, TEMP dst);
 8194   ins_encode %{
 8195     int vlen_enc = vector_length_encoding(this);
 8196     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8197     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8198     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8199   %}
 8200   ins_pipe( pipe_slow );
 8201 %}
 8202 
 8203 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8204   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8205             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8206   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8207   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8208   effect(TEMP ktmp);
 8209   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8212     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8213     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8214   %}
 8215   ins_pipe( pipe_slow );
 8216 %}
 8217 
 8218 
 8219 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8220   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8221             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8222              VM_Version::supports_avx512bw()));
 8223   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8224   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8225   ins_encode %{
 8226     int vlen_enc = vector_length_encoding(this);
 8227     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8228     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8229   %}
 8230   ins_pipe( pipe_slow );
 8231 %}
 8232 
 8233 // --------------------------------- ABS --------------------------------------
 8234 // a = |a|
 8235 instruct vabsB_reg(vec dst, vec src) %{
 8236   match(Set dst (AbsVB  src));
 8237   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8238   ins_encode %{
 8239     uint vlen = Matcher::vector_length(this);
 8240     if (vlen <= 16) {
 8241       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8242     } else {
 8243       int vlen_enc = vector_length_encoding(this);
 8244       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8245     }
 8246   %}
 8247   ins_pipe( pipe_slow );
 8248 %}
 8249 
 8250 instruct vabsS_reg(vec dst, vec src) %{
 8251   match(Set dst (AbsVS  src));
 8252   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8253   ins_encode %{
 8254     uint vlen = Matcher::vector_length(this);
 8255     if (vlen <= 8) {
 8256       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8257     } else {
 8258       int vlen_enc = vector_length_encoding(this);
 8259       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8260     }
 8261   %}
 8262   ins_pipe( pipe_slow );
 8263 %}
 8264 
 8265 instruct vabsI_reg(vec dst, vec src) %{
 8266   match(Set dst (AbsVI  src));
 8267   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8268   ins_encode %{
 8269     uint vlen = Matcher::vector_length(this);
 8270     if (vlen <= 4) {
 8271       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8272     } else {
 8273       int vlen_enc = vector_length_encoding(this);
 8274       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8275     }
 8276   %}
 8277   ins_pipe( pipe_slow );
 8278 %}
 8279 
 8280 instruct vabsL_reg(vec dst, vec src) %{
 8281   match(Set dst (AbsVL  src));
 8282   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8283   ins_encode %{
 8284     assert(UseAVX > 2, "required");
 8285     int vlen_enc = vector_length_encoding(this);
 8286     if (!VM_Version::supports_avx512vl()) {
 8287       vlen_enc = Assembler::AVX_512bit;
 8288     }
 8289     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8290   %}
 8291   ins_pipe( pipe_slow );
 8292 %}
 8293 
 8294 // --------------------------------- ABSNEG --------------------------------------
 8295 
 8296 instruct vabsnegF(vec dst, vec src) %{
 8297   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8298   match(Set dst (AbsVF src));
 8299   match(Set dst (NegVF src));
 8300   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8301   ins_cost(150);
 8302   ins_encode %{
 8303     int opcode = this->ideal_Opcode();
 8304     int vlen = Matcher::vector_length(this);
 8305     if (vlen == 2) {
 8306       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8307     } else {
 8308       assert(vlen == 8 || vlen == 16, "required");
 8309       int vlen_enc = vector_length_encoding(this);
 8310       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8311     }
 8312   %}
 8313   ins_pipe( pipe_slow );
 8314 %}
 8315 
 8316 instruct vabsneg4F(vec dst) %{
 8317   predicate(Matcher::vector_length(n) == 4);
 8318   match(Set dst (AbsVF dst));
 8319   match(Set dst (NegVF dst));
 8320   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8321   ins_cost(150);
 8322   ins_encode %{
 8323     int opcode = this->ideal_Opcode();
 8324     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8325   %}
 8326   ins_pipe( pipe_slow );
 8327 %}
 8328 
 8329 instruct vabsnegD(vec dst, vec src) %{
 8330   match(Set dst (AbsVD  src));
 8331   match(Set dst (NegVD  src));
 8332   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8333   ins_encode %{
 8334     int opcode = this->ideal_Opcode();
 8335     uint vlen = Matcher::vector_length(this);
 8336     if (vlen == 2) {
 8337       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8338     } else {
 8339       int vlen_enc = vector_length_encoding(this);
 8340       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8341     }
 8342   %}
 8343   ins_pipe( pipe_slow );
 8344 %}
 8345 
 8346 //------------------------------------- VectorTest --------------------------------------------
 8347 
 8348 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8349   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8350   match(Set cr (VectorTest src1 src2));
 8351   effect(TEMP vtmp);
 8352   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8353   ins_encode %{
 8354     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8355     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8356     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8357   %}
 8358   ins_pipe( pipe_slow );
 8359 %}
 8360 
 8361 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8362   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8363   match(Set cr (VectorTest src1 src2));
 8364   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8365   ins_encode %{
 8366     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8367     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8368     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8369   %}
 8370   ins_pipe( pipe_slow );
 8371 %}
 8372 
 8373 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8374   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8375              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8376             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8377   match(Set cr (VectorTest src1 src2));
 8378   effect(TEMP tmp);
 8379   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8380   ins_encode %{
 8381     uint masklen = Matcher::vector_length(this, $src1);
 8382     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8383     __ andl($tmp$$Register, (1 << masklen) - 1);
 8384     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8385   %}
 8386   ins_pipe( pipe_slow );
 8387 %}
 8388 
 8389 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8390   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8391              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8392             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8393   match(Set cr (VectorTest src1 src2));
 8394   effect(TEMP tmp);
 8395   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8396   ins_encode %{
 8397     uint masklen = Matcher::vector_length(this, $src1);
 8398     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8399     __ andl($tmp$$Register, (1 << masklen) - 1);
 8400   %}
 8401   ins_pipe( pipe_slow );
 8402 %}
 8403 
 8404 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8405   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8406             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8407   match(Set cr (VectorTest src1 src2));
 8408   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8409   ins_encode %{
 8410     uint masklen = Matcher::vector_length(this, $src1);
 8411     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8412   %}
 8413   ins_pipe( pipe_slow );
 8414 %}
 8415 
 8416 //------------------------------------- LoadMask --------------------------------------------
 8417 
 8418 instruct loadMask(legVec dst, legVec src) %{
 8419   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8420   match(Set dst (VectorLoadMask src));
 8421   effect(TEMP dst);
 8422   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8423   ins_encode %{
 8424     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8425     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8426     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8427   %}
 8428   ins_pipe( pipe_slow );
 8429 %}
 8430 
 8431 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8432   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8433   match(Set dst (VectorLoadMask src));
 8434   effect(TEMP xtmp);
 8435   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8436   ins_encode %{
 8437     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8438                         true, Assembler::AVX_512bit);
 8439   %}
 8440   ins_pipe( pipe_slow );
 8441 %}
 8442 
 8443 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8444   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8445   match(Set dst (VectorLoadMask src));
 8446   effect(TEMP xtmp);
 8447   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8448   ins_encode %{
 8449     int vlen_enc = vector_length_encoding(in(1));
 8450     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8451                         false, vlen_enc);
 8452   %}
 8453   ins_pipe( pipe_slow );
 8454 %}
 8455 
 8456 //------------------------------------- StoreMask --------------------------------------------
 8457 
 8458 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8459   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8460   match(Set dst (VectorStoreMask src size));
 8461   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8462   ins_encode %{
 8463     int vlen = Matcher::vector_length(this);
 8464     if (vlen <= 16 && UseAVX <= 2) {
 8465       assert(UseSSE >= 3, "required");
 8466       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8467     } else {
 8468       assert(UseAVX > 0, "required");
 8469       int src_vlen_enc = vector_length_encoding(this, $src);
 8470       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8471     }
 8472   %}
 8473   ins_pipe( pipe_slow );
 8474 %}
 8475 
 8476 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8477   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8478   match(Set dst (VectorStoreMask src size));
 8479   effect(TEMP_DEF dst, TEMP xtmp);
 8480   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8481   ins_encode %{
 8482     int vlen_enc = Assembler::AVX_128bit;
 8483     int vlen = Matcher::vector_length(this);
 8484     if (vlen <= 8) {
 8485       assert(UseSSE >= 3, "required");
 8486       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8487       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8488       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8489     } else {
 8490       assert(UseAVX > 0, "required");
 8491       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8492       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8493       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8494     }
 8495   %}
 8496   ins_pipe( pipe_slow );
 8497 %}
 8498 
 8499 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8500   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8501   match(Set dst (VectorStoreMask src size));
 8502   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8503   effect(TEMP_DEF dst, TEMP xtmp);
 8504   ins_encode %{
 8505     int vlen_enc = Assembler::AVX_128bit;
 8506     int vlen = Matcher::vector_length(this);
 8507     if (vlen <= 4) {
 8508       assert(UseSSE >= 3, "required");
 8509       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8510       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8511       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8512       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8513     } else {
 8514       assert(UseAVX > 0, "required");
 8515       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8516       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8517       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8518       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8519       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8520     }
 8521   %}
 8522   ins_pipe( pipe_slow );
 8523 %}
 8524 
 8525 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8526   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8527   match(Set dst (VectorStoreMask src size));
 8528   effect(TEMP_DEF dst, TEMP xtmp);
 8529   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8530   ins_encode %{
 8531     assert(UseSSE >= 3, "required");
 8532     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8533     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8534     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8535     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8536     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8537   %}
 8538   ins_pipe( pipe_slow );
 8539 %}
 8540 
 8541 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8542   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8543   match(Set dst (VectorStoreMask src size));
 8544   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8545   effect(TEMP_DEF dst, TEMP vtmp);
 8546   ins_encode %{
 8547     int vlen_enc = Assembler::AVX_128bit;
 8548     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8549     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8550     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8551     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8552     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8553     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8554     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8555   %}
 8556   ins_pipe( pipe_slow );
 8557 %}
 8558 
 8559 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8560   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8561   match(Set dst (VectorStoreMask src size));
 8562   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8563   ins_encode %{
 8564     int src_vlen_enc = vector_length_encoding(this, $src);
 8565     int dst_vlen_enc = vector_length_encoding(this);
 8566     if (!VM_Version::supports_avx512vl()) {
 8567       src_vlen_enc = Assembler::AVX_512bit;
 8568     }
 8569     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8570     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8571   %}
 8572   ins_pipe( pipe_slow );
 8573 %}
 8574 
 8575 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8576   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8577   match(Set dst (VectorStoreMask src size));
 8578   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8579   ins_encode %{
 8580     int src_vlen_enc = vector_length_encoding(this, $src);
 8581     int dst_vlen_enc = vector_length_encoding(this);
 8582     if (!VM_Version::supports_avx512vl()) {
 8583       src_vlen_enc = Assembler::AVX_512bit;
 8584     }
 8585     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8586     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8587   %}
 8588   ins_pipe( pipe_slow );
 8589 %}
 8590 
 8591 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8592   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8593   match(Set dst (VectorStoreMask mask size));
 8594   effect(TEMP_DEF dst);
 8595   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8596   ins_encode %{
 8597     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8598     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8599                  false, Assembler::AVX_512bit, noreg);
 8600     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8601   %}
 8602   ins_pipe( pipe_slow );
 8603 %}
 8604 
 8605 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8606   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8607   match(Set dst (VectorStoreMask mask size));
 8608   effect(TEMP_DEF dst);
 8609   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8610   ins_encode %{
 8611     int dst_vlen_enc = vector_length_encoding(this);
 8612     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8613     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8614   %}
 8615   ins_pipe( pipe_slow );
 8616 %}
 8617 
 8618 instruct vmaskcast_evex(kReg dst) %{
 8619   match(Set dst (VectorMaskCast dst));
 8620   ins_cost(0);
 8621   format %{ "vector_mask_cast $dst" %}
 8622   ins_encode %{
 8623     // empty
 8624   %}
 8625   ins_pipe(empty);
 8626 %}
 8627 
 8628 instruct vmaskcast(vec dst) %{
 8629   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8630   match(Set dst (VectorMaskCast dst));
 8631   ins_cost(0);
 8632   format %{ "vector_mask_cast $dst" %}
 8633   ins_encode %{
 8634     // empty
 8635   %}
 8636   ins_pipe(empty);
 8637 %}
 8638 
 8639 instruct vmaskcast_avx(vec dst, vec src) %{
 8640   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8641   match(Set dst (VectorMaskCast src));
 8642   format %{ "vector_mask_cast $dst, $src" %}
 8643   ins_encode %{
 8644     int vlen = Matcher::vector_length(this);
 8645     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8646     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8647     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8648   %}
 8649   ins_pipe(pipe_slow);
 8650 %}
 8651 
 8652 //-------------------------------- Load Iota Indices ----------------------------------
 8653 
 8654 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8655   match(Set dst (VectorLoadConst src));
 8656   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8657   ins_encode %{
 8658      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8659      BasicType bt = Matcher::vector_element_basic_type(this);
 8660      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8661   %}
 8662   ins_pipe( pipe_slow );
 8663 %}
 8664 
 8665 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8666   match(Set dst (PopulateIndex src1 src2));
 8667   effect(TEMP dst, TEMP vtmp);
 8668   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8669   ins_encode %{
 8670      assert($src2$$constant == 1, "required");
 8671      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8672      int vlen_enc = vector_length_encoding(this);
 8673      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8674      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8675      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8676      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8677   %}
 8678   ins_pipe( pipe_slow );
 8679 %}
 8680 
 8681 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8682   match(Set dst (PopulateIndex src1 src2));
 8683   effect(TEMP dst, TEMP vtmp);
 8684   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8685   ins_encode %{
 8686      assert($src2$$constant == 1, "required");
 8687      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8688      int vlen_enc = vector_length_encoding(this);
 8689      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8690      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8691      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8692      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8693   %}
 8694   ins_pipe( pipe_slow );
 8695 %}
 8696 
 8697 //-------------------------------- Rearrange ----------------------------------
 8698 
 8699 // LoadShuffle/Rearrange for Byte
 8700 instruct rearrangeB(vec dst, vec shuffle) %{
 8701   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8702             Matcher::vector_length(n) < 32);
 8703   match(Set dst (VectorRearrange dst shuffle));
 8704   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8705   ins_encode %{
 8706     assert(UseSSE >= 4, "required");
 8707     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8708   %}
 8709   ins_pipe( pipe_slow );
 8710 %}
 8711 
 8712 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8713   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8714             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8715   match(Set dst (VectorRearrange src shuffle));
 8716   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8717   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8718   ins_encode %{
 8719     assert(UseAVX >= 2, "required");
 8720     // Swap src into vtmp1
 8721     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of every shuffle entry that selects the other lane
 8727     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8728     // Perform the blend
 8729     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8730   %}
 8731   ins_pipe( pipe_slow );
 8732 %}
 8733 
 8734 
 8735 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8736   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8737             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8738   match(Set dst (VectorRearrange src shuffle));
 8739   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8740   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8741   ins_encode %{
 8742     int vlen_enc = vector_length_encoding(this);
 8743     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8744                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8745                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8746   %}
 8747   ins_pipe( pipe_slow );
 8748 %}
 8749 
 8750 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8751   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8752             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8753   match(Set dst (VectorRearrange src shuffle));
 8754   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8755   ins_encode %{
 8756     int vlen_enc = vector_length_encoding(this);
 8757     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8758   %}
 8759   ins_pipe( pipe_slow );
 8760 %}
 8761 
 8762 // LoadShuffle/Rearrange for Short
 8763 
 8764 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8765   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8766             !VM_Version::supports_avx512bw());
 8767   match(Set dst (VectorLoadShuffle src));
 8768   effect(TEMP dst, TEMP vtmp);
 8769   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8770   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask,
    // since only a byte shuffle instruction is available on these platforms
 8773     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8774     if (UseAVX == 0) {
 8775       assert(vlen_in_bytes <= 16, "required");
 8776       // Multiply each shuffle by two to get byte index
 8777       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8778       __ psllw($vtmp$$XMMRegister, 1);
 8779 
 8780       // Duplicate to create 2 copies of byte index
 8781       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8782       __ psllw($dst$$XMMRegister, 8);
 8783       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8784 
 8785       // Add one to get alternate byte index
 8786       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8787       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8788     } else {
 8789       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8790       int vlen_enc = vector_length_encoding(this);
 8791       // Multiply each shuffle by two to get byte index
 8792       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8793 
 8794       // Duplicate to create 2 copies of byte index
 8795       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8796       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8797 
 8798       // Add one to get alternate byte index
 8799       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8800     }
 8801   %}
 8802   ins_pipe( pipe_slow );
 8803 %}
 8804 
 8805 instruct rearrangeS(vec dst, vec shuffle) %{
 8806   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8807             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8808   match(Set dst (VectorRearrange dst shuffle));
 8809   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8810   ins_encode %{
 8811     assert(UseSSE >= 4, "required");
 8812     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8813   %}
 8814   ins_pipe( pipe_slow );
 8815 %}
 8816 
 8817 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8818   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8819             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8820   match(Set dst (VectorRearrange src shuffle));
 8821   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8822   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8823   ins_encode %{
 8824     assert(UseAVX >= 2, "required");
 8825     // Swap src into vtmp1
 8826     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of every shuffle entry that selects the other lane
 8832     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8833     // Perform the blend
 8834     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8835   %}
 8836   ins_pipe( pipe_slow );
 8837 %}
 8838 
 8839 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8840   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8841             VM_Version::supports_avx512bw());
 8842   match(Set dst (VectorRearrange src shuffle));
 8843   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8844   ins_encode %{
 8845     int vlen_enc = vector_length_encoding(this);
 8846     if (!VM_Version::supports_avx512vl()) {
 8847       vlen_enc = Assembler::AVX_512bit;
 8848     }
 8849     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8850   %}
 8851   ins_pipe( pipe_slow );
 8852 %}
 8853 
 8854 // LoadShuffle/Rearrange for Integer and Float
 8855 
 8856 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8857   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8858             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8859   match(Set dst (VectorLoadShuffle src));
 8860   effect(TEMP dst, TEMP vtmp);
 8861   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8862   ins_encode %{
 8863     assert(UseSSE >= 4, "required");
 8864 
    // Create a byte shuffle mask from the int shuffle mask,
    // since only a byte shuffle instruction is available on these platforms
 8867 
 8868     // Duplicate and multiply each shuffle by 4
 8869     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8870     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8871     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8872     __ psllw($vtmp$$XMMRegister, 2);
 8873 
 8874     // Duplicate again to create 4 copies of byte index
 8875     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8876     __ psllw($dst$$XMMRegister, 8);
 8877     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8878 
 8879     // Add 3,2,1,0 to get alternate byte index
 8880     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8881     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8882   %}
 8883   ins_pipe( pipe_slow );
 8884 %}
 8885 
 8886 instruct rearrangeI(vec dst, vec shuffle) %{
 8887   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8888             UseAVX == 0);
 8889   match(Set dst (VectorRearrange dst shuffle));
 8890   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8891   ins_encode %{
 8892     assert(UseSSE >= 4, "required");
 8893     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8894   %}
 8895   ins_pipe( pipe_slow );
 8896 %}
 8897 
 8898 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8899   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8900             UseAVX > 0);
 8901   match(Set dst (VectorRearrange src shuffle));
 8902   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8903   ins_encode %{
 8904     int vlen_enc = vector_length_encoding(this);
 8905     BasicType bt = Matcher::vector_element_basic_type(this);
 8906     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8907   %}
 8908   ins_pipe( pipe_slow );
 8909 %}
 8910 
 8911 // LoadShuffle/Rearrange for Long and Double
 8912 
 8913 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8914   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8915             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8916   match(Set dst (VectorLoadShuffle src));
 8917   effect(TEMP dst, TEMP vtmp);
 8918   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8919   ins_encode %{
 8920     assert(UseAVX >= 2, "required");
 8921 
 8922     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask,
    // since only a double word shuffle instruction is available on these platforms
 8925 
 8926     // Multiply each shuffle by two to get double word index
 8927     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8928 
 8929     // Duplicate each double word shuffle
 8930     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8931     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8932 
 8933     // Add one to get alternate double word index
 8934     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8935   %}
 8936   ins_pipe( pipe_slow );
 8937 %}
 8938 
 8939 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8940   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8941             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8942   match(Set dst (VectorRearrange src shuffle));
 8943   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8944   ins_encode %{
 8945     assert(UseAVX >= 2, "required");
 8946 
 8947     int vlen_enc = vector_length_encoding(this);
 8948     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8949   %}
 8950   ins_pipe( pipe_slow );
 8951 %}
 8952 
 8953 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8954   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8955             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8956   match(Set dst (VectorRearrange src shuffle));
 8957   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8958   ins_encode %{
 8959     assert(UseAVX > 2, "required");
 8960 
 8961     int vlen_enc = vector_length_encoding(this);
 8962     if (vlen_enc == Assembler::AVX_128bit) {
 8963       vlen_enc = Assembler::AVX_256bit;
 8964     }
 8965     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8966   %}
 8967   ins_pipe( pipe_slow );
 8968 %}
 8969 
 8970 // --------------------------------- FMA --------------------------------------
 8971 // a * b + c
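// Each lane computes c[i] = a[i] * b[i] + c[i] with a single (fused) rounding,
// which is what the FmaVF/FmaVD ideal nodes require.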
 8972 
 8973 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8974   match(Set c (FmaVF  c (Binary a b)));
 8975   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8976   ins_cost(150);
 8977   ins_encode %{
 8978     assert(UseFMA, "not enabled");
 8979     int vlen_enc = vector_length_encoding(this);
 8980     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8981   %}
 8982   ins_pipe( pipe_slow );
 8983 %}
 8984 
 8985 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8986   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8987   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8988   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8989   ins_cost(150);
 8990   ins_encode %{
 8991     assert(UseFMA, "not enabled");
 8992     int vlen_enc = vector_length_encoding(this);
 8993     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8994   %}
 8995   ins_pipe( pipe_slow );
 8996 %}
 8997 
 8998 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8999   match(Set c (FmaVD  c (Binary a b)));
 9000   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9001   ins_cost(150);
 9002   ins_encode %{
 9003     assert(UseFMA, "not enabled");
 9004     int vlen_enc = vector_length_encoding(this);
 9005     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9006   %}
 9007   ins_pipe( pipe_slow );
 9008 %}
 9009 
 9010 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9011   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9012   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9013   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9014   ins_cost(150);
 9015   ins_encode %{
 9016     assert(UseFMA, "not enabled");
 9017     int vlen_enc = vector_length_encoding(this);
 9018     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9019   %}
 9020   ins_pipe( pipe_slow );
 9021 %}
 9022 
 9023 // --------------------------------- Vector Multiply Add --------------------------------------
 9024 
 9025 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9026   predicate(UseAVX == 0);
 9027   match(Set dst (MulAddVS2VI dst src1));
 9028   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9029   ins_encode %{
 9030     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9031   %}
 9032   ins_pipe( pipe_slow );
 9033 %}
 9034 
 9035 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9036   predicate(UseAVX > 0);
 9037   match(Set dst (MulAddVS2VI src1 src2));
 9038   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9039   ins_encode %{
 9040     int vlen_enc = vector_length_encoding(this);
 9041     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9042   %}
 9043   ins_pipe( pipe_slow );
 9044 %}
 9045 
 9046 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9047 
 9048 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9049   predicate(VM_Version::supports_avx512_vnni());
 9050   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9051   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9052   ins_encode %{
 9053     assert(UseAVX > 2, "required");
 9054     int vlen_enc = vector_length_encoding(this);
 9055     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9056   %}
 9057   ins_pipe( pipe_slow );
 9058   ins_cost(10);
 9059 %}
 9060 
 9061 // --------------------------------- PopCount --------------------------------------
 9062 
 9063 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9064   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9065   match(Set dst (PopCountVI src));
 9066   match(Set dst (PopCountVL src));
 9067   format %{ "vector_popcount_integral $dst, $src" %}
 9068   ins_encode %{
 9069     int opcode = this->ideal_Opcode();
 9070     int vlen_enc = vector_length_encoding(this, $src);
 9071     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9072     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9073   %}
 9074   ins_pipe( pipe_slow );
 9075 %}
 9076 
 9077 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9078   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9079   match(Set dst (PopCountVI src mask));
 9080   match(Set dst (PopCountVL src mask));
 9081   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9082   ins_encode %{
 9083     int vlen_enc = vector_length_encoding(this, $src);
 9084     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9085     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9086     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9087   %}
 9088   ins_pipe( pipe_slow );
 9089 %}
 9090 
 9091 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9092   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9093   match(Set dst (PopCountVI src));
 9094   match(Set dst (PopCountVL src));
 9095   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9096   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9097   ins_encode %{
 9098     int opcode = this->ideal_Opcode();
 9099     int vlen_enc = vector_length_encoding(this, $src);
 9100     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9101     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9102                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9103   %}
 9104   ins_pipe( pipe_slow );
 9105 %}
 9106 
 9107 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9108 
 9109 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9110   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9111                                               Matcher::vector_length_in_bytes(n->in(1))));
 9112   match(Set dst (CountTrailingZerosV src));
 9113   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9114   ins_cost(400);
 9115   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9116   ins_encode %{
 9117     int vlen_enc = vector_length_encoding(this, $src);
 9118     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9119     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9120                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9126   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9127             VM_Version::supports_avx512cd() &&
 9128             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9129   match(Set dst (CountTrailingZerosV src));
 9130   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9131   ins_cost(400);
 9132   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9133   ins_encode %{
 9134     int vlen_enc = vector_length_encoding(this, $src);
 9135     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9136     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9137                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9138   %}
 9139   ins_pipe( pipe_slow );
 9140 %}
 9141 
 9142 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9143   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9144   match(Set dst (CountTrailingZerosV src));
 9145   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9146   ins_cost(400);
 9147   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9148   ins_encode %{
 9149     int vlen_enc = vector_length_encoding(this, $src);
 9150     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9151     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9152                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9153                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9154   %}
 9155   ins_pipe( pipe_slow );
 9156 %}
 9157 
 9158 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9159   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9160   match(Set dst (CountTrailingZerosV src));
 9161   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9162   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9163   ins_encode %{
 9164     int vlen_enc = vector_length_encoding(this, $src);
 9165     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9166     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9167                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9168   %}
 9169   ins_pipe( pipe_slow );
 9170 %}
 9171 
 9172 
 9173 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9174 
 9175 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9176   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9177   effect(TEMP dst);
 9178   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9179   ins_encode %{
 9180     int vector_len = vector_length_encoding(this);
 9181     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9182   %}
 9183   ins_pipe( pipe_slow );
 9184 %}
 9185 
 9186 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9187   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9188   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9189   effect(TEMP dst);
 9190   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9191   ins_encode %{
 9192     int vector_len = vector_length_encoding(this);
 9193     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9194   %}
 9195   ins_pipe( pipe_slow );
 9196 %}
 9197 
 9198 // --------------------------------- Rotation Operations ----------------------------------
 9199 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9200   match(Set dst (RotateLeftV src shift));
 9201   match(Set dst (RotateRightV src shift));
 9202   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9203   ins_encode %{
 9204     int opcode      = this->ideal_Opcode();
 9205     int vector_len  = vector_length_encoding(this);
 9206     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9207     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9208   %}
 9209   ins_pipe( pipe_slow );
 9210 %}
 9211 
instruct vprotate_var(vec dst, vec src, vec shift) %{
 9213   match(Set dst (RotateLeftV src shift));
 9214   match(Set dst (RotateRightV src shift));
 9215   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9216   ins_encode %{
 9217     int opcode      = this->ideal_Opcode();
 9218     int vector_len  = vector_length_encoding(this);
 9219     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9220     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9221   %}
 9222   ins_pipe( pipe_slow );
 9223 %}
 9224 
 9225 // ---------------------------------- Masked Operations ------------------------------------
 9226 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9227   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9228   match(Set dst (LoadVectorMasked mem mask));
 9229   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9230   ins_encode %{
 9231     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9232     int vlen_enc = vector_length_encoding(this);
 9233     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9234   %}
 9235   ins_pipe( pipe_slow );
 9236 %}
 9237 
 9238 
 9239 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9240   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9241   match(Set dst (LoadVectorMasked mem mask));
 9242   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9243   ins_encode %{
 9244     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9245     int vector_len = vector_length_encoding(this);
 9246     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9247   %}
 9248   ins_pipe( pipe_slow );
 9249 %}
 9250 
 9251 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9252   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9253   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9254   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9255   ins_encode %{
 9256     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9257     int vlen_enc = vector_length_encoding(src_node);
 9258     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9259     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9260   %}
 9261   ins_pipe( pipe_slow );
 9262 %}
 9263 
 9264 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9265   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9266   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9267   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9268   ins_encode %{
 9269     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9270     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9271     int vlen_enc = vector_length_encoding(src_node);
 9272     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9273   %}
 9274   ins_pipe( pipe_slow );
 9275 %}
 9276 
 9277 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9278   match(Set addr (VerifyVectorAlignment addr mask));
 9279   effect(KILL cr);
 9280   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9281   ins_encode %{
 9282     Label Lskip;
 9283     // check if masked bits of addr are zero
 9284     __ testq($addr$$Register, $mask$$constant);
 9285     __ jccb(Assembler::equal, Lskip);
 9286     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9287     __ bind(Lskip);
 9288   %}
 9289   ins_pipe(pipe_slow);
 9290 %}
 9291 
 9292 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9293   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9294   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9295   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9296   ins_encode %{
 9297     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9298     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9299 
 9300     Label DONE;
 9301     int vlen_enc = vector_length_encoding(this, $src1);
 9302     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9303 
 9304     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9305     __ mov64($dst$$Register, -1L);
 9306     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9307     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9308     __ jccb(Assembler::carrySet, DONE);
 9309     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9310     __ notq($dst$$Register);
 9311     __ tzcntq($dst$$Register, $dst$$Register);
 9312     __ bind(DONE);
 9313   %}
 9314   ins_pipe( pipe_slow );
 9315 %}
 9316 
 9317 
 9318 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9319   match(Set dst (VectorMaskGen len));
 9320   effect(TEMP temp, KILL cr);
 9321   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9322   ins_encode %{
 9323     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9324   %}
 9325   ins_pipe( pipe_slow );
 9326 %}
 9327 
 9328 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9329   match(Set dst (VectorMaskGen len));
 9330   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9331   effect(TEMP temp);
 9332   ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9334     __ kmovql($dst$$KRegister, $temp$$Register);
 9335   %}
 9336   ins_pipe( pipe_slow );
 9337 %}
 9338 
 9339 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9340   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9341   match(Set dst (VectorMaskToLong mask));
 9342   effect(TEMP dst, KILL cr);
 9343   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9344   ins_encode %{
 9345     int opcode = this->ideal_Opcode();
 9346     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9347     int mask_len = Matcher::vector_length(this, $mask);
 9348     int mask_size = mask_len * type2aelembytes(mbt);
 9349     int vlen_enc = vector_length_encoding(this, $mask);
 9350     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9351                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9352   %}
 9353   ins_pipe( pipe_slow );
 9354 %}
 9355 
 9356 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9357   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9358   match(Set dst (VectorMaskToLong mask));
 9359   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9360   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9361   ins_encode %{
 9362     int opcode = this->ideal_Opcode();
 9363     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9364     int mask_len = Matcher::vector_length(this, $mask);
 9365     int vlen_enc = vector_length_encoding(this, $mask);
 9366     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9367                              $dst$$Register, mask_len, mbt, vlen_enc);
 9368   %}
 9369   ins_pipe( pipe_slow );
 9370 %}
 9371 
 9372 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9373   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9374   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9375   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9376   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9377   ins_encode %{
 9378     int opcode = this->ideal_Opcode();
 9379     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9380     int mask_len = Matcher::vector_length(this, $mask);
 9381     int vlen_enc = vector_length_encoding(this, $mask);
 9382     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9383                              $dst$$Register, mask_len, mbt, vlen_enc);
 9384   %}
 9385   ins_pipe( pipe_slow );
 9386 %}
 9387 
 9388 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9389   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9390   match(Set dst (VectorMaskTrueCount mask));
 9391   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9392   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9393   ins_encode %{
 9394     int opcode = this->ideal_Opcode();
 9395     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9396     int mask_len = Matcher::vector_length(this, $mask);
 9397     int mask_size = mask_len * type2aelembytes(mbt);
 9398     int vlen_enc = vector_length_encoding(this, $mask);
 9399     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9400                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9401   %}
 9402   ins_pipe( pipe_slow );
 9403 %}
 9404 
 9405 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9406   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9407   match(Set dst (VectorMaskTrueCount mask));
 9408   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9409   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9410   ins_encode %{
 9411     int opcode = this->ideal_Opcode();
 9412     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9413     int mask_len = Matcher::vector_length(this, $mask);
 9414     int vlen_enc = vector_length_encoding(this, $mask);
 9415     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9416                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9417   %}
 9418   ins_pipe( pipe_slow );
 9419 %}
 9420 
 9421 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9422   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9423   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9424   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9425   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9426   ins_encode %{
 9427     int opcode = this->ideal_Opcode();
 9428     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9429     int mask_len = Matcher::vector_length(this, $mask);
 9430     int vlen_enc = vector_length_encoding(this, $mask);
 9431     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9432                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9433   %}
 9434   ins_pipe( pipe_slow );
 9435 %}
 9436 
 9437 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9438   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9439   match(Set dst (VectorMaskFirstTrue mask));
 9440   match(Set dst (VectorMaskLastTrue mask));
 9441   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9442   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9443   ins_encode %{
 9444     int opcode = this->ideal_Opcode();
 9445     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9446     int mask_len = Matcher::vector_length(this, $mask);
 9447     int mask_size = mask_len * type2aelembytes(mbt);
 9448     int vlen_enc = vector_length_encoding(this, $mask);
 9449     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9450                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9451   %}
 9452   ins_pipe( pipe_slow );
 9453 %}
 9454 
 9455 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9456   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9457   match(Set dst (VectorMaskFirstTrue mask));
 9458   match(Set dst (VectorMaskLastTrue mask));
 9459   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9460   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9461   ins_encode %{
 9462     int opcode = this->ideal_Opcode();
 9463     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9464     int mask_len = Matcher::vector_length(this, $mask);
 9465     int vlen_enc = vector_length_encoding(this, $mask);
 9466     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9467                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9468   %}
 9469   ins_pipe( pipe_slow );
 9470 %}
 9471 
 9472 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9473   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9474   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9475   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9476   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9477   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9478   ins_encode %{
 9479     int opcode = this->ideal_Opcode();
 9480     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9481     int mask_len = Matcher::vector_length(this, $mask);
 9482     int vlen_enc = vector_length_encoding(this, $mask);
 9483     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9484                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9485   %}
 9486   ins_pipe( pipe_slow );
 9487 %}
 9488 
 9489 // --------------------------------- Compress/Expand Operations ---------------------------
 9490 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9491   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9492   match(Set dst (CompressV src mask));
 9493   match(Set dst (ExpandV src mask));
 9494   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9495   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9496   ins_encode %{
 9497     int opcode = this->ideal_Opcode();
 9498     int vlen_enc = vector_length_encoding(this);
 9499     BasicType bt  = Matcher::vector_element_basic_type(this);
 9500     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9501                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9502   %}
 9503   ins_pipe( pipe_slow );
 9504 %}
 9505 
 9506 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9507   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9508   match(Set dst (CompressV src mask));
 9509   match(Set dst (ExpandV src mask));
 9510   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9511   ins_encode %{
 9512     int opcode = this->ideal_Opcode();
 9513     int vector_len = vector_length_encoding(this);
 9514     BasicType bt  = Matcher::vector_element_basic_type(this);
 9515     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9520 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9521   match(Set dst (CompressM mask));
 9522   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9523   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9524   ins_encode %{
 9525     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9526     int mask_len = Matcher::vector_length(this);
 9527     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9528   %}
 9529   ins_pipe( pipe_slow );
 9530 %}
 9531 
 9532 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9533 
 9534 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9535   predicate(!VM_Version::supports_gfni());
 9536   match(Set dst (ReverseV src));
 9537   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9538   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9539   ins_encode %{
 9540     int vec_enc = vector_length_encoding(this);
 9541     BasicType bt = Matcher::vector_element_basic_type(this);
 9542     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9543                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9544   %}
 9545   ins_pipe( pipe_slow );
 9546 %}
 9547 
 9548 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9549   predicate(VM_Version::supports_gfni());
 9550   match(Set dst (ReverseV src));
 9551   effect(TEMP dst, TEMP xtmp);
 9552   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9553   ins_encode %{
 9554     int vec_enc = vector_length_encoding(this);
 9555     BasicType bt  = Matcher::vector_element_basic_type(this);
 9556     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9557     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9558                                $xtmp$$XMMRegister);
 9559   %}
 9560   ins_pipe( pipe_slow );
 9561 %}
 9562 
 9563 instruct vreverse_byte_reg(vec dst, vec src) %{
 9564   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9565   match(Set dst (ReverseBytesV src));
 9566   effect(TEMP dst);
 9567   format %{ "vector_reverse_byte $dst, $src" %}
 9568   ins_encode %{
 9569     int vec_enc = vector_length_encoding(this);
 9570     BasicType bt = Matcher::vector_element_basic_type(this);
 9571     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9572   %}
 9573   ins_pipe( pipe_slow );
 9574 %}
 9575 
 9576 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9577   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9578   match(Set dst (ReverseBytesV src));
 9579   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9580   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9581   ins_encode %{
 9582     int vec_enc = vector_length_encoding(this);
 9583     BasicType bt = Matcher::vector_element_basic_type(this);
 9584     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9585                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9586   %}
 9587   ins_pipe( pipe_slow );
 9588 %}
 9589 
 9590 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9591 
 9592 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9593   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9594                                               Matcher::vector_length_in_bytes(n->in(1))));
 9595   match(Set dst (CountLeadingZerosV src));
 9596   format %{ "vector_count_leading_zeros $dst, $src" %}
 9597   ins_encode %{
 9598      int vlen_enc = vector_length_encoding(this, $src);
 9599      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9600      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9601                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9602   %}
 9603   ins_pipe( pipe_slow );
 9604 %}
 9605 
 9606 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9607   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9608                                               Matcher::vector_length_in_bytes(n->in(1))));
 9609   match(Set dst (CountLeadingZerosV src mask));
 9610   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9611   ins_encode %{
 9612     int vlen_enc = vector_length_encoding(this, $src);
 9613     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9614     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9615     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9616                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9617   %}
 9618   ins_pipe( pipe_slow );
 9619 %}
 9620 
 9621 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9622   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9623             VM_Version::supports_avx512cd() &&
 9624             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9625   match(Set dst (CountLeadingZerosV src));
 9626   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9627   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9628   ins_encode %{
 9629     int vlen_enc = vector_length_encoding(this, $src);
 9630     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9631     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9632                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9633   %}
 9634   ins_pipe( pipe_slow );
 9635 %}
 9636 
 9637 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9638   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9639   match(Set dst (CountLeadingZerosV src));
 9640   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9641   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9642   ins_encode %{
 9643     int vlen_enc = vector_length_encoding(this, $src);
 9644     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9645     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9646                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9647                                        $rtmp$$Register, true, vlen_enc);
 9648   %}
 9649   ins_pipe( pipe_slow );
 9650 %}
 9651 
 9652 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9653   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9654             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9655   match(Set dst (CountLeadingZerosV src));
 9656   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9657   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9658   ins_encode %{
 9659     int vlen_enc = vector_length_encoding(this, $src);
 9660     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9661     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9662                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9663   %}
 9664   ins_pipe( pipe_slow );
 9665 %}
 9666 
 9667 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9668   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9669             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9670   match(Set dst (CountLeadingZerosV src));
 9671   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9672   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9673   ins_encode %{
 9674     int vlen_enc = vector_length_encoding(this, $src);
 9675     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9676     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9677                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9678   %}
 9679   ins_pipe( pipe_slow );
 9680 %}
 9681 
 9682 // ---------------------------------- Vector Masked Operations ------------------------------------
 9683 
 9684 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9685   match(Set dst (AddVB (Binary dst src2) mask));
 9686   match(Set dst (AddVS (Binary dst src2) mask));
 9687   match(Set dst (AddVI (Binary dst src2) mask));
 9688   match(Set dst (AddVL (Binary dst src2) mask));
 9689   match(Set dst (AddVF (Binary dst src2) mask));
 9690   match(Set dst (AddVD (Binary dst src2) mask));
 9691   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9692   ins_encode %{
 9693     int vlen_enc = vector_length_encoding(this);
 9694     BasicType bt = Matcher::vector_element_basic_type(this);
 9695     int opc = this->ideal_Opcode();
 9696     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9697                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9698   %}
 9699   ins_pipe( pipe_slow );
 9700 %}
 9701 
 9702 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9703   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9704   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9705   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9706   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9707   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9708   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9709   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9710   ins_encode %{
 9711     int vlen_enc = vector_length_encoding(this);
 9712     BasicType bt = Matcher::vector_element_basic_type(this);
 9713     int opc = this->ideal_Opcode();
 9714     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9715                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9716   %}
 9717   ins_pipe( pipe_slow );
 9718 %}
 9719 
 9720 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9721   match(Set dst (XorV (Binary dst src2) mask));
 9722   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9723   ins_encode %{
 9724     int vlen_enc = vector_length_encoding(this);
 9725     BasicType bt = Matcher::vector_element_basic_type(this);
 9726     int opc = this->ideal_Opcode();
 9727     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9728                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9729   %}
 9730   ins_pipe( pipe_slow );
 9731 %}
 9732 
 9733 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9734   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9735   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9736   ins_encode %{
 9737     int vlen_enc = vector_length_encoding(this);
 9738     BasicType bt = Matcher::vector_element_basic_type(this);
 9739     int opc = this->ideal_Opcode();
 9740     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9741                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9742   %}
 9743   ins_pipe( pipe_slow );
 9744 %}
 9745 
 9746 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9747   match(Set dst (OrV (Binary dst src2) mask));
 9748   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9749   ins_encode %{
 9750     int vlen_enc = vector_length_encoding(this);
 9751     BasicType bt = Matcher::vector_element_basic_type(this);
 9752     int opc = this->ideal_Opcode();
 9753     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9754                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9755   %}
 9756   ins_pipe( pipe_slow );
 9757 %}
 9758 
 9759 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9760   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9761   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9762   ins_encode %{
 9763     int vlen_enc = vector_length_encoding(this);
 9764     BasicType bt = Matcher::vector_element_basic_type(this);
 9765     int opc = this->ideal_Opcode();
 9766     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9767                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9768   %}
 9769   ins_pipe( pipe_slow );
 9770 %}
 9771 
 9772 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9773   match(Set dst (AndV (Binary dst src2) mask));
 9774   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9775   ins_encode %{
 9776     int vlen_enc = vector_length_encoding(this);
 9777     BasicType bt = Matcher::vector_element_basic_type(this);
 9778     int opc = this->ideal_Opcode();
 9779     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9780                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9781   %}
 9782   ins_pipe( pipe_slow );
 9783 %}
 9784 
 9785 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9786   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9787   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9788   ins_encode %{
 9789     int vlen_enc = vector_length_encoding(this);
 9790     BasicType bt = Matcher::vector_element_basic_type(this);
 9791     int opc = this->ideal_Opcode();
 9792     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9793                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9794   %}
 9795   ins_pipe( pipe_slow );
 9796 %}
 9797 
 9798 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9799   match(Set dst (SubVB (Binary dst src2) mask));
 9800   match(Set dst (SubVS (Binary dst src2) mask));
 9801   match(Set dst (SubVI (Binary dst src2) mask));
 9802   match(Set dst (SubVL (Binary dst src2) mask));
 9803   match(Set dst (SubVF (Binary dst src2) mask));
 9804   match(Set dst (SubVD (Binary dst src2) mask));
 9805   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9806   ins_encode %{
 9807     int vlen_enc = vector_length_encoding(this);
 9808     BasicType bt = Matcher::vector_element_basic_type(this);
 9809     int opc = this->ideal_Opcode();
 9810     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9811                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9812   %}
 9813   ins_pipe( pipe_slow );
 9814 %}
 9815 
 9816 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9817   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9818   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9819   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9820   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9821   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9822   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9823   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9824   ins_encode %{
 9825     int vlen_enc = vector_length_encoding(this);
 9826     BasicType bt = Matcher::vector_element_basic_type(this);
 9827     int opc = this->ideal_Opcode();
 9828     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9829                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9830   %}
 9831   ins_pipe( pipe_slow );
 9832 %}
 9833 
 9834 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9835   match(Set dst (MulVS (Binary dst src2) mask));
 9836   match(Set dst (MulVI (Binary dst src2) mask));
 9837   match(Set dst (MulVL (Binary dst src2) mask));
 9838   match(Set dst (MulVF (Binary dst src2) mask));
 9839   match(Set dst (MulVD (Binary dst src2) mask));
 9840   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9841   ins_encode %{
 9842     int vlen_enc = vector_length_encoding(this);
 9843     BasicType bt = Matcher::vector_element_basic_type(this);
 9844     int opc = this->ideal_Opcode();
 9845     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9846                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9847   %}
 9848   ins_pipe( pipe_slow );
 9849 %}
 9850 
 9851 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9852   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9853   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9854   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9855   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9856   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9857   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9858   ins_encode %{
 9859     int vlen_enc = vector_length_encoding(this);
 9860     BasicType bt = Matcher::vector_element_basic_type(this);
 9861     int opc = this->ideal_Opcode();
 9862     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9863                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9864   %}
 9865   ins_pipe( pipe_slow );
 9866 %}
 9867 
 9868 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9869   match(Set dst (SqrtVF dst mask));
 9870   match(Set dst (SqrtVD dst mask));
 9871   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9872   ins_encode %{
 9873     int vlen_enc = vector_length_encoding(this);
 9874     BasicType bt = Matcher::vector_element_basic_type(this);
 9875     int opc = this->ideal_Opcode();
 9876     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9877                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9878   %}
 9879   ins_pipe( pipe_slow );
 9880 %}
 9881 
 9882 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9883   match(Set dst (DivVF (Binary dst src2) mask));
 9884   match(Set dst (DivVD (Binary dst src2) mask));
 9885   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9886   ins_encode %{
 9887     int vlen_enc = vector_length_encoding(this);
 9888     BasicType bt = Matcher::vector_element_basic_type(this);
 9889     int opc = this->ideal_Opcode();
 9890     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9891                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9892   %}
 9893   ins_pipe( pipe_slow );
 9894 %}
 9895 
 9896 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9897   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9898   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9899   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9900   ins_encode %{
 9901     int vlen_enc = vector_length_encoding(this);
 9902     BasicType bt = Matcher::vector_element_basic_type(this);
 9903     int opc = this->ideal_Opcode();
 9904     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9905                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9906   %}
 9907   ins_pipe( pipe_slow );
 9908 %}
 9909 
 9910 
 9911 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9912   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9913   match(Set dst (RotateRightV (Binary dst shift) mask));
 9914   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9915   ins_encode %{
 9916     int vlen_enc = vector_length_encoding(this);
 9917     BasicType bt = Matcher::vector_element_basic_type(this);
 9918     int opc = this->ideal_Opcode();
 9919     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9920                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9921   %}
 9922   ins_pipe( pipe_slow );
 9923 %}
 9924 
 9925 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9926   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9927   match(Set dst (RotateRightV (Binary dst src2) mask));
 9928   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9929   ins_encode %{
 9930     int vlen_enc = vector_length_encoding(this);
 9931     BasicType bt = Matcher::vector_element_basic_type(this);
 9932     int opc = this->ideal_Opcode();
 9933     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9934                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9935   %}
 9936   ins_pipe( pipe_slow );
 9937 %}
 9938 
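      // Masked shifts come in three flavours: an immediate count, a uniform count
      // broadcast into a vector register, and a per-lane variable count. The
      // is_var_shift() predicate separates the last two; the extra trailing boolean
      // passed to evmasked_op selects the variable-shift (vpsllv/vpsrlv/vpsrav style)
      // encodings.
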
 9939 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9940   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9941   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9942   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9943   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9944   ins_encode %{
 9945     int vlen_enc = vector_length_encoding(this);
 9946     BasicType bt = Matcher::vector_element_basic_type(this);
 9947     int opc = this->ideal_Opcode();
 9948     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9949                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9950   %}
 9951   ins_pipe( pipe_slow );
 9952 %}
 9953 
 9954 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9955   predicate(!n->as_ShiftV()->is_var_shift());
 9956   match(Set dst (LShiftVS (Binary dst src2) mask));
 9957   match(Set dst (LShiftVI (Binary dst src2) mask));
 9958   match(Set dst (LShiftVL (Binary dst src2) mask));
 9959   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9960   ins_encode %{
 9961     int vlen_enc = vector_length_encoding(this);
 9962     BasicType bt = Matcher::vector_element_basic_type(this);
 9963     int opc = this->ideal_Opcode();
 9964     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9965                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9966   %}
 9967   ins_pipe( pipe_slow );
 9968 %}
 9969 
 9970 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9971   predicate(n->as_ShiftV()->is_var_shift());
 9972   match(Set dst (LShiftVS (Binary dst src2) mask));
 9973   match(Set dst (LShiftVI (Binary dst src2) mask));
 9974   match(Set dst (LShiftVL (Binary dst src2) mask));
 9975   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9976   ins_encode %{
 9977     int vlen_enc = vector_length_encoding(this);
 9978     BasicType bt = Matcher::vector_element_basic_type(this);
 9979     int opc = this->ideal_Opcode();
 9980     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9981                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9982   %}
 9983   ins_pipe( pipe_slow );
 9984 %}
 9985 
 9986 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9987   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9988   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9989   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9990   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9991   ins_encode %{
 9992     int vlen_enc = vector_length_encoding(this);
 9993     BasicType bt = Matcher::vector_element_basic_type(this);
 9994     int opc = this->ideal_Opcode();
 9995     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9996                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9997   %}
 9998   ins_pipe( pipe_slow );
 9999 %}
10000 
10001 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10002   predicate(!n->as_ShiftV()->is_var_shift());
10003   match(Set dst (RShiftVS (Binary dst src2) mask));
10004   match(Set dst (RShiftVI (Binary dst src2) mask));
10005   match(Set dst (RShiftVL (Binary dst src2) mask));
10006   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10007   ins_encode %{
10008     int vlen_enc = vector_length_encoding(this);
10009     BasicType bt = Matcher::vector_element_basic_type(this);
10010     int opc = this->ideal_Opcode();
10011     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10012                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10013   %}
10014   ins_pipe( pipe_slow );
10015 %}
10016 
10017 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10018   predicate(n->as_ShiftV()->is_var_shift());
10019   match(Set dst (RShiftVS (Binary dst src2) mask));
10020   match(Set dst (RShiftVI (Binary dst src2) mask));
10021   match(Set dst (RShiftVL (Binary dst src2) mask));
10022   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10023   ins_encode %{
10024     int vlen_enc = vector_length_encoding(this);
10025     BasicType bt = Matcher::vector_element_basic_type(this);
10026     int opc = this->ideal_Opcode();
10027     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10028                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10029   %}
10030   ins_pipe( pipe_slow );
10031 %}
10032 
10033 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10034   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10035   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10036   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10037   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10038   ins_encode %{
10039     int vlen_enc = vector_length_encoding(this);
10040     BasicType bt = Matcher::vector_element_basic_type(this);
10041     int opc = this->ideal_Opcode();
10042     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10043                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10049   predicate(!n->as_ShiftV()->is_var_shift());
10050   match(Set dst (URShiftVS (Binary dst src2) mask));
10051   match(Set dst (URShiftVI (Binary dst src2) mask));
10052   match(Set dst (URShiftVL (Binary dst src2) mask));
10053   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10054   ins_encode %{
10055     int vlen_enc = vector_length_encoding(this);
10056     BasicType bt = Matcher::vector_element_basic_type(this);
10057     int opc = this->ideal_Opcode();
10058     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10059                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10060   %}
10061   ins_pipe( pipe_slow );
10062 %}
10063 
10064 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10065   predicate(n->as_ShiftV()->is_var_shift());
10066   match(Set dst (URShiftVS (Binary dst src2) mask));
10067   match(Set dst (URShiftVI (Binary dst src2) mask));
10068   match(Set dst (URShiftVL (Binary dst src2) mask));
10069   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10070   ins_encode %{
10071     int vlen_enc = vector_length_encoding(this);
10072     BasicType bt = Matcher::vector_element_basic_type(this);
10073     int opc = this->ideal_Opcode();
10074     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10075                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10076   %}
10077   ins_pipe( pipe_slow );
10078 %}
10079 
10080 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10081   match(Set dst (MaxV (Binary dst src2) mask));
10082   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10083   ins_encode %{
10084     int vlen_enc = vector_length_encoding(this);
10085     BasicType bt = Matcher::vector_element_basic_type(this);
10086     int opc = this->ideal_Opcode();
10087     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10088                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10089   %}
10090   ins_pipe( pipe_slow );
10091 %}
10092 
10093 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10094   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10095   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10096   ins_encode %{
10097     int vlen_enc = vector_length_encoding(this);
10098     BasicType bt = Matcher::vector_element_basic_type(this);
10099     int opc = this->ideal_Opcode();
10100     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10101                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10102   %}
10103   ins_pipe( pipe_slow );
10104 %}
10105 
10106 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10107   match(Set dst (MinV (Binary dst src2) mask));
10108   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10109   ins_encode %{
10110     int vlen_enc = vector_length_encoding(this);
10111     BasicType bt = Matcher::vector_element_basic_type(this);
10112     int opc = this->ideal_Opcode();
10113     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10114                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10115   %}
10116   ins_pipe( pipe_slow );
10117 %}
10118 
10119 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10120   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10121   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10122   ins_encode %{
10123     int vlen_enc = vector_length_encoding(this);
10124     BasicType bt = Matcher::vector_element_basic_type(this);
10125     int opc = this->ideal_Opcode();
10126     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10127                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10128   %}
10129   ins_pipe( pipe_slow );
10130 %}
10131 
10132 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10133   match(Set dst (VectorRearrange (Binary dst src2) mask));
10134   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10135   ins_encode %{
10136     int vlen_enc = vector_length_encoding(this);
10137     BasicType bt = Matcher::vector_element_basic_type(this);
10138     int opc = this->ideal_Opcode();
10139     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10140                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10141   %}
10142   ins_pipe( pipe_slow );
10143 %}
10144 
10145 instruct vabs_masked(vec dst, kReg mask) %{
10146   match(Set dst (AbsVB dst mask));
10147   match(Set dst (AbsVS dst mask));
10148   match(Set dst (AbsVI dst mask));
10149   match(Set dst (AbsVL dst mask));
10150   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10151   ins_encode %{
10152     int vlen_enc = vector_length_encoding(this);
10153     BasicType bt = Matcher::vector_element_basic_type(this);
10154     int opc = this->ideal_Opcode();
10155     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10156                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10157   %}
10158   ins_pipe( pipe_slow );
10159 %}
10160 
10161 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10162   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10163   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10164   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10165   ins_encode %{
10166     assert(UseFMA, "Needs FMA instructions support.");
10167     int vlen_enc = vector_length_encoding(this);
10168     BasicType bt = Matcher::vector_element_basic_type(this);
10169     int opc = this->ideal_Opcode();
10170     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10171                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10172   %}
10173   ins_pipe( pipe_slow );
10174 %}
10175 
10176 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10177   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10178   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10179   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10180   ins_encode %{
10181     assert(UseFMA, "Needs FMA instructions support.");
10182     int vlen_enc = vector_length_encoding(this);
10183     BasicType bt = Matcher::vector_element_basic_type(this);
10184     int opc = this->ideal_Opcode();
10185     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10186                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10187   %}
10188   ins_pipe( pipe_slow );
10189 %}
10190 
10191 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10192   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10193   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10194   ins_encode %{
10195     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10196     int vlen_enc = vector_length_encoding(this, $src1);
10197     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10198 
10199     // Dispatch the masked compare on the element type of src1.
10200     switch (src1_elem_bt) {
10201       case T_BYTE: {
10202         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10203         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10204         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10205         break;
10206       }
10207       case T_SHORT: {
10208         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10209         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10210         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10211         break;
10212       }
10213       case T_INT: {
10214         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10215         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10216         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10217         break;
10218       }
10219       case T_LONG: {
10220         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10221         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10222         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10223         break;
10224       }
10225       case T_FLOAT: {
10226         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10227         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10228         break;
10229       }
10230       case T_DOUBLE: {
10231         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10232         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10233         break;
10234       }
10235       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10236     }
10237   %}
10238   ins_pipe( pipe_slow );
10239 %}
10240 
10241 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10242   predicate(Matcher::vector_length(n) <= 32);
10243   match(Set dst (MaskAll src));
10244   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10245   ins_encode %{
10246     int mask_len = Matcher::vector_length(this);
10247     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10248   %}
10249   ins_pipe( pipe_slow );
10250 %}
10251 
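      // Mask negation: C2 expresses "not" of a vector mask as XorVMask with an
      // all-ones mask (MaskAll -1), which lowers to a knot of the appropriate width.
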
10252 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10253   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10254   match(Set dst (XorVMask src (MaskAll cnt)));
10255   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10256   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10257   ins_encode %{
10258     uint masklen = Matcher::vector_length(this);
10259     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10260   %}
10261   ins_pipe( pipe_slow );
10262 %}
10263 
10264 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10265   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10266             (Matcher::vector_length(n) == 16) ||
10267             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10268   match(Set dst (XorVMask src (MaskAll cnt)));
10269   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10270   ins_encode %{
10271     uint masklen = Matcher::vector_length(this);
10272     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10273   %}
10274   ins_pipe( pipe_slow );
10275 %}
10276 
10277 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10278   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10279   match(Set dst (VectorLongToMask src));
10280   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10281   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10282   ins_encode %{
10283     int mask_len = Matcher::vector_length(this);
10284     int vec_enc  = vector_length_encoding(mask_len);
10285     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10286                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10287   %}
10288   ins_pipe( pipe_slow );
10289 %}
10290 
10291 
10292 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10293   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10294   match(Set dst (VectorLongToMask src));
10295   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10296   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10297   ins_encode %{
10298     int mask_len = Matcher::vector_length(this);
10299     assert(mask_len <= 32, "invalid mask length");
10300     int vec_enc  = vector_length_encoding(mask_len);
10301     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10302                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10303   %}
10304   ins_pipe( pipe_slow );
10305 %}
10306 
10307 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10308   predicate(n->bottom_type()->isa_vectmask());
10309   match(Set dst (VectorLongToMask src));
10310   format %{ "long_to_mask_evex $dst, $src\t!" %}
10311   ins_encode %{
10312     __ kmov($dst$$KRegister, $src$$Register);
10313   %}
10314   ins_pipe( pipe_slow );
10315 %}
10316 
10317 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10318   match(Set dst (AndVMask src1 src2));
10319   match(Set dst (OrVMask src1 src2));
10320   match(Set dst (XorVMask src1 src2));
10321   effect(TEMP kscratch);
10322   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10323   ins_encode %{
10324     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10325     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10326     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10327     uint masklen = Matcher::vector_length(this);
10328     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10329     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10330   %}
10331   ins_pipe( pipe_slow );
10332 %}
10333 
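      // MacroLogicV is a three-input Boolean function; the 8-bit immediate $func is
      // the truth table consumed by vpternlogd/vpternlogq.
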
10334 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10335   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10336   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10337   ins_encode %{
10338     int vlen_enc = vector_length_encoding(this);
10339     BasicType bt = Matcher::vector_element_basic_type(this);
10340     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10341                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10342   %}
10343   ins_pipe( pipe_slow );
10344 %}
10345 
10346 instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10347   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10348   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10349   ins_encode %{
10350     int vlen_enc = vector_length_encoding(this);
10351     BasicType bt = Matcher::vector_element_basic_type(this);
10352     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10353                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10354   %}
10355   ins_pipe( pipe_slow );
10356 %}
10357 
10358 instruct castMM(kReg dst)
10359 %{
10360   match(Set dst (CastVV dst));
10361 
10362   size(0);
10363   format %{ "# castVV of $dst" %}
10364   ins_encode(/* empty encoding */);
10365   ins_cost(0);
10366   ins_pipe(empty);
10367 %}
10368 
10369 instruct castVV(vec dst)
10370 %{
10371   match(Set dst (CastVV dst));
10372 
10373   size(0);
10374   format %{ "# castVV of $dst" %}
10375   ins_encode(/* empty encoding */);
10376   ins_cost(0);
10377   ins_pipe(empty);
10378 %}
10379 
10380 instruct castVVLeg(legVec dst)
10381 %{
10382   match(Set dst (CastVV dst));
10383 
10384   size(0);
10385   format %{ "# castVV of $dst" %}
10386   ins_encode(/* empty encoding */);
10387   ins_cost(0);
10388   ins_pipe(empty);
10389 %}
10390 
10391 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10392 %{
10393   match(Set dst (IsInfiniteF src));
10394   effect(TEMP ktmp, KILL cr);
10395   format %{ "float_class_check $dst, $src" %}
10396   ins_encode %{
10397     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10398     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10399   %}
10400   ins_pipe(pipe_slow);
10401 %}
10402 
10403 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10404 %{
10405   match(Set dst (IsInfiniteD src));
10406   effect(TEMP ktmp, KILL cr);
10407   format %{ "double_class_check $dst, $src" %}
10408   ins_encode %{
10409     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10410     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10411   %}
10412   ins_pipe(pipe_slow);
10413 %}
10414 
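      // Saturating add/sub: byte and short lanes map directly onto the hardware
      // saturating instructions (vpadds*/vpaddus*/vpsubs*/vpsubus*). Int and long
      // lanes have no such instructions, so they are expanded through the
      // vector_*_dq_saturating_* helper sequences, which need the listed temporaries.
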
10415 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10416 %{
10417   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10418             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10419   match(Set dst (SaturatingAddV src1 src2));
10420   match(Set dst (SaturatingSubV src1 src2));
10421   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10422   ins_encode %{
10423     int vlen_enc = vector_length_encoding(this);
10424     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10425     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10426                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10427   %}
10428   ins_pipe(pipe_slow);
10429 %}
10430 
10431 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10432 %{
10433   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10434             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10435   match(Set dst (SaturatingAddV src1 src2));
10436   match(Set dst (SaturatingSubV src1 src2));
10437   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10438   ins_encode %{
10439     int vlen_enc = vector_length_encoding(this);
10440     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10441     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10442                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10443   %}
10444   ins_pipe(pipe_slow);
10445 %}
10446 
10447 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10448 %{
10449   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10450             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10451             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10452   match(Set dst (SaturatingAddV src1 src2));
10453   match(Set dst (SaturatingSubV src1 src2));
10454   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10455   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10456   ins_encode %{
10457     int vlen_enc = vector_length_encoding(this);
10458     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10459     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10460                                         $src1$$XMMRegister, $src2$$XMMRegister,
10461                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10462                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10463   %}
10464   ins_pipe(pipe_slow);
10465 %}
10466 
10467 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10468 %{
10469   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10470             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10471             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10472   match(Set dst (SaturatingAddV src1 src2));
10473   match(Set dst (SaturatingSubV src1 src2));
10474   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10475   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10476   ins_encode %{
10477     int vlen_enc = vector_length_encoding(this);
10478     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10479     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10480                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10481                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10482   %}
10483   ins_pipe(pipe_slow);
10484 %}
10485 
10486 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10487 %{
10488   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10489             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10490             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10491   match(Set dst (SaturatingAddV src1 src2));
10492   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10493   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10494   ins_encode %{
10495     int vlen_enc = vector_length_encoding(this);
10496     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10497     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10498                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10499   %}
10500   ins_pipe(pipe_slow);
10501 %}
10502 
10503 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10504 %{
10505   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10506             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10507             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10508   match(Set dst (SaturatingAddV src1 src2));
10509   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10510   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10511   ins_encode %{
10512     int vlen_enc = vector_length_encoding(this);
10513     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10514     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10515                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10516   %}
10517   ins_pipe(pipe_slow);
10518 %}
10519 
10520 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10521 %{
10522   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10523             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10524             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10525   match(Set dst (SaturatingSubV src1 src2));
10526   effect(TEMP ktmp);
10527   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10528   ins_encode %{
10529     int vlen_enc = vector_length_encoding(this);
10530     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10531     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10532                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10533   %}
10534   ins_pipe(pipe_slow);
10535 %}
10536 
10537 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10538 %{
10539   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10540             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10541             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10542   match(Set dst (SaturatingSubV src1 src2));
10543   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10544   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10545   ins_encode %{
10546     int vlen_enc = vector_length_encoding(this);
10547     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10548     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10549                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10550   %}
10551   ins_pipe(pipe_slow);
10552 %}
10553 
10554 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10555 %{
10556   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10557             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10558   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10559   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10560   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10561   ins_encode %{
10562     int vlen_enc = vector_length_encoding(this);
10563     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10564     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10565                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10566   %}
10567   ins_pipe(pipe_slow);
10568 %}
10569 
10570 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10571 %{
10572   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10573             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10574   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10575   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10576   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10577   ins_encode %{
10578     int vlen_enc = vector_length_encoding(this);
10579     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10580     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10581                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10582   %}
10583   ins_pipe(pipe_slow);
10584 %}
10585 
10586 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10587   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10588             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10589   match(Set dst (SaturatingAddV (Binary dst src) mask));
10590   match(Set dst (SaturatingSubV (Binary dst src) mask));
10591   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10592   ins_encode %{
10593     int vlen_enc = vector_length_encoding(this);
10594     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10595     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10596                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10597   %}
10598   ins_pipe( pipe_slow );
10599 %}
10600 
10601 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10602   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10603             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10604   match(Set dst (SaturatingAddV (Binary dst src) mask));
10605   match(Set dst (SaturatingSubV (Binary dst src) mask));
10606   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10607   ins_encode %{
10608     int vlen_enc = vector_length_encoding(this);
10609     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10610     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10611                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10612   %}
10613   ins_pipe( pipe_slow );
10614 %}
10615 
10616 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10617   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10618             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10619   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10620   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10621   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10622   ins_encode %{
10623     int vlen_enc = vector_length_encoding(this);
10624     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10625     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10626                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10627   %}
10628   ins_pipe( pipe_slow );
10629 %}
10630 
10631 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10632   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10633             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10634   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10635   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10636   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10637   ins_encode %{
10638     int vlen_enc = vector_length_encoding(this);
10639     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10640     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10641                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10642   %}
10643   ins_pipe( pipe_slow );
10644 %}
10645 
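      // SelectFromTwoVector picks each lane from the concatenation of src1 and src2
      // under the control of $index; on EVEX targets this maps onto the two-table
      // permute (vpermi2*/vpermt2*) instructions, see
      // C2_MacroAssembler::select_from_two_vectors_evex.
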
10646 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10647 %{
10648   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10649   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10650   ins_encode %{
10651     int vlen_enc = vector_length_encoding(this);
10652     BasicType bt = Matcher::vector_element_basic_type(this);
10653     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10654   %}
10655   ins_pipe(pipe_slow);
10656 %}
10657 
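      // ---------------------------------- Float16 (Half-Precision) Operations ------------------------------
      // Scalar Float16 values live in the low 16 bits of an XMM register; vmovw moves
      // them between a GPR and an XMM register, and the remaining rules use the
      // AVX512-FP16 scalar (*sh) and packed (*ph) instruction forms.
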
10658 instruct reinterpretS2HF(regF dst, rRegI src)
10659 %{
10660   match(Set dst (ReinterpretS2HF src));
10661   format %{ "vmovw $dst, $src" %}
10662   ins_encode %{
10663     __ vmovw($dst$$XMMRegister, $src$$Register);
10664   %}
10665   ins_pipe(pipe_slow);
10666 %}
10667 
10668 instruct reinterpretHF2S(rRegI dst, regF src)
10669 %{
10670   match(Set dst (ReinterpretHF2S src));
10671   format %{ "vmovw $dst, $src" %}
10672   ins_encode %{
10673     __ vmovw($dst$$Register, $src$$XMMRegister);
10674   %}
10675   ins_pipe(pipe_slow);
10676 %}
10677 
10678 instruct convF2HFAndS2HF(regF dst, regF src)
10679 %{
10680   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10681   format %{ "convF2HFAndS2HF $dst, $src" %}
10682   ins_encode %{
10683     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10684   %}
10685   ins_pipe(pipe_slow);
10686 %}
10687 
10688 instruct convHF2SAndHF2F(regF dst, regF src)
10689 %{
10690   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10691   format %{ "convHF2SAndHF2F $dst, $src" %}
10692   ins_encode %{
10693     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10694   %}
10695   ins_pipe(pipe_slow);
10696 %}
10697 
10698 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10699 %{
10700   match(Set dst (SqrtHF src));
10701   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10702   ins_encode %{
10703     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10704   %}
10705   ins_pipe(pipe_slow);
10706 %}
10707 
10708 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10709 %{
10710   match(Set dst (AddHF src1 src2));
10711   match(Set dst (DivHF src1 src2));
10712   match(Set dst (MulHF src1 src2));
10713   match(Set dst (SubHF src1 src2));
10714   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10715   ins_encode %{
10716     int opcode = this->ideal_Opcode();
10717     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10718   %}
10719   ins_pipe(pipe_slow);
10720 %}
10721 
10722 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10723 %{
10724   match(Set dst (MaxHF src1 src2));
10725   match(Set dst (MinHF src1 src2));
10726   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10727   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10728   ins_encode %{
10729     int opcode = this->ideal_Opcode();
10730     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10731                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10732   %}
10733   ins_pipe( pipe_slow );
10734 %}
10735 
10736 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10737 %{
10738   match(Set dst (FmaHF  src2 (Binary dst src1)));
10739   effect(DEF dst);
10740   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10741   ins_encode %{
10742     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10743   %}
10744   ins_pipe( pipe_slow );
10745 %}
10746 
10747 
10748 instruct vector_sqrt_HF_reg(vec dst, vec src)
10749 %{
10750   match(Set dst (SqrtVHF src));
10751   format %{ "vector_sqrt_fp16 $dst, $src" %}
10752   ins_encode %{
10753     int vlen_enc = vector_length_encoding(this);
10754     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10755   %}
10756   ins_pipe(pipe_slow);
10757 %}
10758 
10759 instruct vector_sqrt_HF_mem(vec dst, memory src)
10760 %{
10761   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10762   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10763   ins_encode %{
10764     int vlen_enc = vector_length_encoding(this);
10765     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10766   %}
10767   ins_pipe(pipe_slow);
10768 %}
10769 
10770 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10771 %{
10772   match(Set dst (AddVHF src1 src2));
10773   match(Set dst (DivVHF src1 src2));
10774   match(Set dst (MulVHF src1 src2));
10775   match(Set dst (SubVHF src1 src2));
10776   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10777   ins_encode %{
10778     int vlen_enc = vector_length_encoding(this);
10779     int opcode = this->ideal_Opcode();
10780     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10781   %}
10782   ins_pipe(pipe_slow);
10783 %}
10784 
10785 
10786 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10787 %{
10788   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10789   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10790   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10791   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10792   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10793   ins_encode %{
10794     int vlen_enc = vector_length_encoding(this);
10795     int opcode = this->ideal_Opcode();
10796     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10797   %}
10798   ins_pipe(pipe_slow);
10799 %}
10800 
10801 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10802 %{
10803   match(Set dst (FmaVHF src2 (Binary dst src1)));
10804   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10805   ins_encode %{
10806     int vlen_enc = vector_length_encoding(this);
10807     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10808   %}
10809   ins_pipe( pipe_slow );
10810 %}
10811 
10812 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10813 %{
10814   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10815   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10816   ins_encode %{
10817     int vlen_enc = vector_length_encoding(this);
10818     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10819   %}
10820   ins_pipe( pipe_slow );
10821 %}
10822 
10823 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10824 %{
10825   match(Set dst (MinVHF src1 src2));
10826   match(Set dst (MaxVHF src1 src2));
10827   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10828   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10829   ins_encode %{
10830     int vlen_enc = vector_length_encoding(this);
10831     int opcode = this->ideal_Opcode();
10832     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10833                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10834   %}
10835   ins_pipe( pipe_slow );
10836 %}