//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding, concrete register );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

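//
// For example, the first definition below,
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// reads: the 32-bit slot XMM0 is save-on-call under both the VM and the C
// calling conventions, spills as a float (Op_RegF), has hardware encoding 0,
// and is backed by the first VMReg slot of xmm0.
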
// XMM registers.  512-bit registers of 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words a and b hold a Double.
// The whole registers are used by the SSE4.2 intrinsics, the array copy
// stubs and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   no XMM registers are preserved across function calls;
//              XMM0-XMM7 may hold parameters.
// Windows ABI: XMM6-XMM15 are preserved across function calls;
//              XMM0-XMM3 may hold parameters.

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
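// Each opmask register is described as two 32-bit halves (Kn and Kn_H).
// Note that k0 is omitted: an EVEX opmask specifier of 0 means "no masking",
// so k0 cannot be allocated as a write-mask predicate.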
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);
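// Single-register mask classes; these allow an operand or instruct pattern
// to require one particular opmask register (K1-K7) rather than any member
// of vectmask_reg.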

// The flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
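// Note: a reg_class_dynamic selects between its two constituent classes at
// run time; the first class is used when the predicate holds.  For example,
// float_reg resolves to float_reg_evex when VM_Version::supports_evex() is
// true and to float_reg_legacy otherwise.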
  764 
  765 // Class for pre evex double registers
  766 reg_class double_reg_legacy(XMM0,  XMM0b,
  767                      XMM1,  XMM1b,
  768                      XMM2,  XMM2b,
  769                      XMM3,  XMM3b,
  770                      XMM4,  XMM4b,
  771                      XMM5,  XMM5b,
  772                      XMM6,  XMM6b,
  773                      XMM7,  XMM7b,
  774                      XMM8,  XMM8b,
  775                      XMM9,  XMM9b,
  776                      XMM10, XMM10b,
  777                      XMM11, XMM11b,
  778                      XMM12, XMM12b,
  779                      XMM13, XMM13b,
  780                      XMM14, XMM14b,
  781                      XMM15, XMM15b);
  782 
  783 // Class for evex double registers
  784 reg_class double_reg_evex(XMM0,  XMM0b,
  785                      XMM1,  XMM1b,
  786                      XMM2,  XMM2b,
  787                      XMM3,  XMM3b,
  788                      XMM4,  XMM4b,
  789                      XMM5,  XMM5b,
  790                      XMM6,  XMM6b,
  791                      XMM7,  XMM7b,
  792                      XMM8,  XMM8b,
  793                      XMM9,  XMM9b,
  794                      XMM10, XMM10b,
  795                      XMM11, XMM11b,
  796                      XMM12, XMM12b,
  797                      XMM13, XMM13b,
  798                      XMM14, XMM14b,
  799                      XMM15, XMM15b,
  800                      XMM16, XMM16b,
  801                      XMM17, XMM17b,
  802                      XMM18, XMM18b,
  803                      XMM19, XMM19b,
  804                      XMM20, XMM20b,
  805                      XMM21, XMM21b,
  806                      XMM22, XMM22b,
  807                      XMM23, XMM23b,
  808                      XMM24, XMM24b,
  809                      XMM25, XMM25b,
  810                      XMM26, XMM26b,
  811                      XMM27, XMM27b,
  812                      XMM28, XMM28b,
  813                      XMM29, XMM29b,
  814                      XMM30, XMM30b,
  815                      XMM31, XMM31b);
  816 
  817 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
  818 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
  819 
  820 // Class for pre evex 32bit vector registers
  821 reg_class vectors_reg_legacy(XMM0,
  822                       XMM1,
  823                       XMM2,
  824                       XMM3,
  825                       XMM4,
  826                       XMM5,
  827                       XMM6,
  828                       XMM7,
  829                       XMM8,
  830                       XMM9,
  831                       XMM10,
  832                       XMM11,
  833                       XMM12,
  834                       XMM13,
  835                       XMM14,
  836                       XMM15);
  837 
  838 // Class for evex 32bit vector registers
  839 reg_class vectors_reg_evex(XMM0,
  840                       XMM1,
  841                       XMM2,
  842                       XMM3,
  843                       XMM4,
  844                       XMM5,
  845                       XMM6,
  846                       XMM7,
  847                       XMM8,
  848                       XMM9,
  849                       XMM10,
  850                       XMM11,
  851                       XMM12,
  852                       XMM13,
  853                       XMM14,
  854                       XMM15,
  855                       XMM16,
  856                       XMM17,
  857                       XMM18,
  858                       XMM19,
  859                       XMM20,
  860                       XMM21,
  861                       XMM22,
  862                       XMM23,
  863                       XMM24,
  864                       XMM25,
  865                       XMM26,
  866                       XMM27,
  867                       XMM28,
  868                       XMM29,
  869                       XMM30,
  870                       XMM31);
  871 
  872 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
  873 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  874 
  875 // Class for all 64bit vector registers
  876 reg_class vectord_reg_legacy(XMM0,  XMM0b,
  877                       XMM1,  XMM1b,
  878                       XMM2,  XMM2b,
  879                       XMM3,  XMM3b,
  880                       XMM4,  XMM4b,
  881                       XMM5,  XMM5b,
  882                       XMM6,  XMM6b,
  883                       XMM7,  XMM7b,
  884                       XMM8,  XMM8b,
  885                       XMM9,  XMM9b,
  886                       XMM10, XMM10b,
  887                       XMM11, XMM11b,
  888                       XMM12, XMM12b,
  889                       XMM13, XMM13b,
  890                       XMM14, XMM14b,
  891                       XMM15, XMM15b);
  892 
  893 // Class for all 64bit vector registers
  894 reg_class vectord_reg_evex(XMM0,  XMM0b,
  895                       XMM1,  XMM1b,
  896                       XMM2,  XMM2b,
  897                       XMM3,  XMM3b,
  898                       XMM4,  XMM4b,
  899                       XMM5,  XMM5b,
  900                       XMM6,  XMM6b,
  901                       XMM7,  XMM7b,
  902                       XMM8,  XMM8b,
  903                       XMM9,  XMM9b,
  904                       XMM10, XMM10b,
  905                       XMM11, XMM11b,
  906                       XMM12, XMM12b,
  907                       XMM13, XMM13b,
  908                       XMM14, XMM14b,
  909                       XMM15, XMM15b,
  910                       XMM16, XMM16b,
  911                       XMM17, XMM17b,
  912                       XMM18, XMM18b,
  913                       XMM19, XMM19b,
  914                       XMM20, XMM20b,
  915                       XMM21, XMM21b,
  916                       XMM22, XMM22b,
  917                       XMM23, XMM23b,
  918                       XMM24, XMM24b,
  919                       XMM25, XMM25b,
  920                       XMM26, XMM26b,
  921                       XMM27, XMM27b,
  922                       XMM28, XMM28b,
  923                       XMM29, XMM29b,
  924                       XMM30, XMM30b,
  925                       XMM31, XMM31b);
  926 
  927 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
  928 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  929 
  930 // Class for all 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
// Class for evex 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
// Class for evex 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
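// 128-bit view of XMM0 only; for use where an instruction encoding requires XMM0 as an
// implicit operand (for example, the non-AVX blendv forms use XMM0 as the mask register).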
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
// we switch between source %{ ... %} and source_hpp %{ ... %} blocks freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5 byte instructions plus one move for unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
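// Map a vector size in bytes to the assembler's AVX vector-length encoding.
// Vectors smaller than 16 bytes (4 and 8 bytes) are encoded as 128-bit operations.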
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
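// True when a native vector popcount instruction is available for the given element type:
// AVX512_BITALG covers byte/short elements, AVX512_VPOPCNTDQ covers int/long elements.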
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
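// True when vplzcntd/vplzcntq (AVX512CD) can be used for int/long elements: either
// AVX512VL is available or the vector is a full 512 bits.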
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
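// Platform-dependent node flags, extending the shared Node flag set.
// Flag_intel_jcc_erratum marks nodes that may need padding to avoid the Intel JCC erratum
// (see MachNode::compute_padding() below); the remaining flags record which EFLAGS
// condition codes a node sets or clears.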
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
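// Identify CPU feature requirements for individual match rules, independent of vector
// length. Vector-length-dependent restrictions are handled in match_rule_supported_vector()
// below.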
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
// Identify extra cases in which we want to provide match rules for vector nodes and
// other intrinsics, guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
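  // For example, under these rules an AVX1-only machine rejects a 256bit INT vector here,
  // while a 256bit FLOAT vector is accepted (subject to MaxVectorSize).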
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
 1613       }
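      // fallthrough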
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
      if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types happen after an
        // intermediate conversion to int, and the special handling code needs the AVX2
        // vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128) {
 1835          return false;
 1836        }
 1837        if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1838          return false;
 1839        }
 1840        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1841          return false;
 1842        }
 1843        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1844          return false;
 1845        }
 1846        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1847          return false;
 1848        }
 1849        break;
 1850     case Op_MaskAll:
 1851       if (!VM_Version::supports_evex()) {
 1852         return false;
 1853       }
 1854       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1855         return false;
 1856       }
 1857       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1858         return false;
 1859       }
 1860       break;
 1861     case Op_VectorMaskCmp:
 1862       if (vlen < 2 || size_in_bits < 32) {
 1863         return false;
 1864       }
 1865       break;
 1866     case Op_CompressM:
 1867       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_CompressV:
 1872     case Op_ExpandV:
 1873       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1874         return false;
 1875       }
      if (size_in_bits < 128) {
 1877         return false;
 1878       }
 1879     case Op_VectorLongToMask:
 1880       if (UseAVX < 1) {
 1881         return false;
 1882       }
 1883       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1884         return false;
 1885       }
 1886       break;
 1887     case Op_SignumVD:
 1888     case Op_SignumVF:
 1889       if (UseAVX < 1) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_PopCountVI:
 1894     case Op_PopCountVL: {
 1895         if (!is_pop_count_instr_target(bt) &&
 1896             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1897           return false;
 1898         }
 1899       }
 1900       break;
 1901     case Op_ReverseV:
 1902     case Op_ReverseBytesV:
 1903       if (UseAVX < 2) {
 1904         return false;
 1905       }
 1906       break;
 1907     case Op_CountTrailingZerosV:
 1908     case Op_CountLeadingZerosV:
 1909       if (UseAVX < 2) {
 1910         return false;
 1911       }
 1912       break;
 1913   }
  return true;  // By default, match rules are supported.
 1915 }
 1916 
 1917 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode alone. Most unary/binary/ternary masked operations share the
  // IR nodes of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine therefore does a strict check for masked operation patterns: it returns
  // false by default and true only for the opcodes whose masked instruction patterns are
  // defined in this file.
 1924   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1925     return false;
 1926   }
 1927 
 1928   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1929   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1930     return false;
 1931   }
 1932   switch(opcode) {
 1933     // Unary masked operations
 1934     case Op_AbsVB:
 1935     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 1937         return false;  // Implementation limitation
 1938       }
 1939     case Op_AbsVI:
 1940     case Op_AbsVL:
 1941       return true;
 1942 
 1943     // Ternary masked operations
 1944     case Op_FmaVF:
 1945     case Op_FmaVD:
 1946       return true;
 1947 
 1948     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 1950         return false;
 1951       }
 1952       return true;
 1953 
 1954     // Binary masked operations
 1955     case Op_AddVB:
 1956     case Op_AddVS:
 1957     case Op_SubVB:
 1958     case Op_SubVS:
 1959     case Op_MulVS:
 1960     case Op_LShiftVS:
 1961     case Op_RShiftVS:
 1962     case Op_URShiftVS:
 1963       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1964       if (!VM_Version::supports_avx512bw()) {
 1965         return false;  // Implementation limitation
 1966       }
 1967       return true;
 1968 
 1969     case Op_MulVL:
 1970       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1971       if (!VM_Version::supports_avx512dq()) {
 1972         return false;  // Implementation limitation
 1973       }
 1974       return true;
 1975 
 1976     case Op_AndV:
 1977     case Op_OrV:
 1978     case Op_XorV:
 1979     case Op_RotateRightV:
 1980     case Op_RotateLeftV:
 1981       if (bt != T_INT && bt != T_LONG) {
 1982         return false; // Implementation limitation
 1983       }
 1984       return true;
 1985 
 1986     case Op_VectorLoadMask:
 1987       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1988       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1989         return false;
 1990       }
 1991       return true;
 1992 
 1993     case Op_AddVI:
 1994     case Op_AddVL:
 1995     case Op_AddVF:
 1996     case Op_AddVD:
 1997     case Op_SubVI:
 1998     case Op_SubVL:
 1999     case Op_SubVF:
 2000     case Op_SubVD:
 2001     case Op_MulVI:
 2002     case Op_MulVF:
 2003     case Op_MulVD:
 2004     case Op_DivVF:
 2005     case Op_DivVD:
 2006     case Op_SqrtVF:
 2007     case Op_SqrtVD:
 2008     case Op_LShiftVI:
 2009     case Op_LShiftVL:
 2010     case Op_RShiftVI:
 2011     case Op_RShiftVL:
 2012     case Op_URShiftVI:
 2013     case Op_URShiftVL:
 2014     case Op_LoadVectorMasked:
 2015     case Op_StoreVectorMasked:
 2016     case Op_LoadVectorGatherMasked:
 2017     case Op_StoreVectorScatterMasked:
 2018       return true;
 2019 
 2020     case Op_UMinV:
 2021     case Op_UMaxV:
 2022       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2023         return false;
 2024       } // fallthrough
 2025     case Op_MaxV:
 2026     case Op_MinV:
 2027       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2031         return false; // Implementation limitation
 2032       }
 2033       return true;
 2034     case Op_SaturatingAddV:
 2035     case Op_SaturatingSubV:
 2036       if (!is_subword_type(bt)) {
 2037         return false;
 2038       }
 2039       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2040         return false; // Implementation limitation
 2041       }
 2042       return true;
 2043 
 2044     case Op_VectorMaskCmp:
 2045       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2046         return false; // Implementation limitation
 2047       }
 2048       return true;
 2049 
 2050     case Op_VectorRearrange:
 2051       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2052         return false; // Implementation limitation
 2053       }
 2054       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2055         return false; // Implementation limitation
 2056       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2057         return false; // Implementation limitation
 2058       }
 2059       return true;
 2060 
 2061     // Binary Logical operations
 2062     case Op_AndVMask:
 2063     case Op_OrVMask:
 2064     case Op_XorVMask:
 2065       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_PopCountVI:
 2071     case Op_PopCountVL:
 2072       if (!is_pop_count_instr_target(bt)) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_MaskAll:
 2078       return true;
 2079 
 2080     case Op_CountLeadingZerosV:
 2081       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2082         return true;
 2083       }
 2084     default:
 2085       return false;
 2086   }
 2087 }
 2088 
 2089 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2090   return false;
 2091 }
 2092 
 2093 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2094 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2095   switch (elem_bt) {
 2096     case T_BYTE:  return false;
 2097     case T_SHORT: return !VM_Version::supports_avx512bw();
 2098     case T_INT:   return !VM_Version::supports_avx();
 2099     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2100     default:
 2101       ShouldNotReachHere();
 2102       return false;
 2103   }
 2104 }
 2105 
 2106 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2107   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2108   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2109   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2110       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2111     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2112     return new legVecZOper();
 2113   }
 2114   if (legacy) {
 2115     switch (ideal_reg) {
 2116       case Op_VecS: return new legVecSOper();
 2117       case Op_VecD: return new legVecDOper();
 2118       case Op_VecX: return new legVecXOper();
 2119       case Op_VecY: return new legVecYOper();
 2120       case Op_VecZ: return new legVecZOper();
 2121     }
 2122   } else {
 2123     switch (ideal_reg) {
 2124       case Op_VecS: return new vecSOper();
 2125       case Op_VecD: return new vecDOper();
 2126       case Op_VecX: return new vecXOper();
 2127       case Op_VecY: return new vecYOper();
 2128       case Op_VecZ: return new vecZOper();
 2129     }
 2130   }
 2131   ShouldNotReachHere();
 2132   return nullptr;
 2133 }
 2134 
 2135 bool Matcher::is_reg2reg_move(MachNode* m) {
 2136   switch (m->rule()) {
 2137     case MoveVec2Leg_rule:
 2138     case MoveLeg2Vec_rule:
 2139     case MoveF2VL_rule:
 2140     case MoveF2LEG_rule:
 2141     case MoveVL2F_rule:
 2142     case MoveLEG2F_rule:
 2143     case MoveD2VL_rule:
 2144     case MoveD2LEG_rule:
 2145     case MoveVL2D_rule:
 2146     case MoveLEG2D_rule:
 2147       return true;
 2148     default:
 2149       return false;
 2150   }
 2151 }
 2152 
 2153 bool Matcher::is_generic_vector(MachOper* opnd) {
 2154   switch (opnd->opcode()) {
 2155     case VEC:
 2156     case LEGVEC:
 2157       return true;
 2158     default:
 2159       return false;
 2160   }
 2161 }
 2162 
 2163 //------------------------------------------------------------------------
 2164 
 2165 const RegMask* Matcher::predicate_reg_mask(void) {
 2166   return &_VECTMASK_REG_mask;
 2167 }
 2168 
 2169 // Max vector size in bytes. 0 if not supported.
 2170 int Matcher::vector_width_in_bytes(BasicType bt) {
 2171   assert(is_java_primitive(bt), "only primitive type vectors");
 2172   // SSE2 supports 128bit vectors for all types.
 2173   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2175   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
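  // e.g. UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (256bit), UseAVX == 3 gives 64 bytes (512bit).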
 2176   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2177   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2178     size = (UseAVX > 2) ? 64 : 32;
 2179   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2180     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2181   // Use flag to limit vector size.
 2182   size = MIN2(size,(int)MaxVectorSize);
 2183   // Minimum 2 values in vector (or 4 for bytes).
 2184   switch (bt) {
 2185   case T_DOUBLE:
 2186   case T_LONG:
 2187     if (size < 16) return 0;
 2188     break;
 2189   case T_FLOAT:
 2190   case T_INT:
 2191     if (size < 8) return 0;
 2192     break;
 2193   case T_BOOLEAN:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_CHAR:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_BYTE:
 2200     if (size < 4) return 0;
 2201     break;
 2202   case T_SHORT:
 2203     if (size < 4) return 0;
 2204     break;
 2205   default:
 2206     ShouldNotReachHere();
 2207   }
 2208   return size;
 2209 }
 2210 
 2211 // Limits on vector size (number of elements) loaded into vector.
 2212 int Matcher::max_vector_size(const BasicType bt) {
 2213   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2214 }
 2215 int Matcher::min_vector_size(const BasicType bt) {
 2216   int max_size = max_vector_size(bt);
  // The minimum vector that can be loaded is 4 bytes: 4 elements for byte types, 2 elements otherwise.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors to support calling SVML double64 routines.
 2220   if (bt == T_DOUBLE) {
 2221     size = 1;
 2222   }
 2223   return MIN2(size,max_size);
 2224 }
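// For illustration, the smallest vectors accepted by min_vector_size() are:
//   T_BYTE/T_BOOLEAN -> 4 elements, T_SHORT/T_CHAR/T_INT/T_FLOAT/T_LONG -> 2 elements,
//   and T_DOUBLE -> 1 element (the single-element case exists for SVML double64 calls),
// always capped by max_vector_size() for the platform.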
 2225 
 2226 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2227   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2228   // by default on Cascade Lake
 2229   if (VM_Version::is_default_intel_cascade_lake()) {
 2230     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2231   }
 2232   return Matcher::max_vector_size(bt);
 2233 }
 2234 
 2235 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2236   return -1;
 2237 }
 2238 
 2239 // Vector ideal reg corresponding to specified size in bytes
 2240 uint Matcher::vector_ideal_reg(int size) {
 2241   assert(MaxVectorSize >= size, "");
 2242   switch(size) {
 2243     case  4: return Op_VecS;
 2244     case  8: return Op_VecD;
 2245     case 16: return Op_VecX;
 2246     case 32: return Op_VecY;
 2247     case 64: return Op_VecZ;
 2248   }
 2249   ShouldNotReachHere();
 2250   return 0;
 2251 }
 2252 
 2253 // Check for shift by small constant as well
 2254 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2255   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2256       shift->in(2)->get_int() <= 3 &&
 2257       // Are there other uses besides address expressions?
 2258       !matcher->is_visited(shift)) {
 2259     address_visited.set(shift->_idx); // Flag as address_visited
 2260     mstack.push(shift->in(2), Matcher::Visit);
 2261     Node *conv = shift->in(1);
 2262     // Allow the Matcher to match the rule that bypasses the
 2263     // ConvI2L operation for an array index on LP64 when the
 2264     // index value is known to be non-negative.
 2265     if (conv->Opcode() == Op_ConvI2L &&
 2266         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2267         // Are there other uses besides address expressions?
 2268         !matcher->is_visited(conv)) {
 2269       address_visited.set(conv->_idx); // Flag as address_visited
 2270       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2271     } else {
 2272       mstack.push(conv, Matcher::Pre_Visit);
 2273     }
 2274     return true;
 2275   }
 2276   return false;
 2277 }
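// For example, an array address of the form
//   (AddP base (AddP base (LShiftX (ConvI2L index) 2)) offset)
// gets its shift (and, when the index is known to be non-negative, the ConvI2L too)
// cloned next to each use, so the matcher can fold the whole expression into a
// single [base + index*4 + offset] addressing mode instead of materializing the
// scaled index in a register.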
 2278 
 2279 // The pattern matcher below identifies sub-graphs in which a 'load' node
 2280 // is an input to two different nodes, so that the sub-graph can be matched
 2281 // with BMI instructions such as blsi, blsr, etc.
 2282 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2283 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2284 // refers to the same node.
 2285 //
 2286 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2287 // This is a temporary solution until we make DAGs expressible in ADL.
 2288 template<typename ConType>
 2289 class FusedPatternMatcher {
 2290   Node* _op1_node;
 2291   Node* _mop_node;
 2292   int _con_op;
 2293 
 2294   static int match_next(Node* n, int next_op, int next_op_idx) {
 2295     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2296       return -1;
 2297     }
 2298 
 2299     if (next_op_idx == -1) { // n is commutative, try rotations
 2300       if (n->in(1)->Opcode() == next_op) {
 2301         return 1;
 2302       } else if (n->in(2)->Opcode() == next_op) {
 2303         return 2;
 2304       }
 2305     } else {
 2306       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2307       if (n->in(next_op_idx)->Opcode() == next_op) {
 2308         return next_op_idx;
 2309       }
 2310     }
 2311     return -1;
 2312   }
 2313 
 2314  public:
 2315   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2316     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2317 
 2318   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2319              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2320              typename ConType::NativeType con_value) {
 2321     if (_op1_node->Opcode() != op1) {
 2322       return false;
 2323     }
 2324     if (_mop_node->outcnt() > 2) {
 2325       return false;
 2326     }
 2327     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2328     if (op1_op2_idx == -1) {
 2329       return false;
 2330     }
 2331     // Memory operation must be the other edge
 2332     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2333 
 2334     // Check that the mop node is really what we want
 2335     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2336       Node* op2_node = _op1_node->in(op1_op2_idx);
 2337       if (op2_node->outcnt() > 1) {
 2338         return false;
 2339       }
 2340       assert(op2_node->Opcode() == op2, "Should be");
 2341       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2342       if (op2_con_idx == -1) {
 2343         return false;
 2344       }
 2345       // Memory operation must be the other edge
 2346       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2347       // Check that the memory operation is the same node
 2348       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2349         // Now check the constant
 2350         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2351         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2352           return true;
 2353         }
 2354       }
 2355     }
 2356     return false;
 2357   }
 2358 };
 2359 
 2360 static bool is_bmi_pattern(Node* n, Node* m) {
 2361   assert(UseBMI1Instructions, "sanity");
 2362   if (n != nullptr && m != nullptr) {
 2363     if (m->Opcode() == Op_LoadI) {
 2364       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2365       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2366              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2367              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2368     } else if (m->Opcode() == Op_LoadL) {
 2369       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2370       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2371              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2372              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2373     }
 2374   }
 2375   return false;
 2376 }
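// In Java-level terms the accepted patterns correspond to (a sketch, assuming
// 'x' is loaded from memory and has no other uses):
//   x & -x      -> (AndI (SubI 0 load) load)   -> blsi   (isolate lowest set bit)
//   x & (x - 1) -> (AndI (AddI load -1) load)  -> blsr   (reset lowest set bit)
//   x ^ (x - 1) -> (XorI (AddI load -1) load)  -> blsmsk (mask up to lowest set bit)
// with the analogous long forms handled via TypeLong above.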
 2377 
 2378 // Should the matcher clone input 'm' of node 'n'?
 2379 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2380   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2381   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2382     mstack.push(m, Visit);
 2383     return true;
 2384   }
 2385   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2386     mstack.push(m, Visit);           // m = ShiftCntV
 2387     return true;
 2388   }
 2389   if (is_encode_and_store_pattern(n, m)) {
 2390     mstack.push(m, Visit);
 2391     return true;
 2392   }
 2393   return false;
 2394 }
 2395 
 2396 // Should the Matcher clone shifts on addressing modes, expecting them
 2397 // to be subsumed into complex addressing expressions or compute them
 2398 // into registers?
 2399 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2400   Node *off = m->in(AddPNode::Offset);
 2401   if (off->is_Con()) {
 2402     address_visited.test_set(m->_idx); // Flag as address_visited
 2403     Node *adr = m->in(AddPNode::Address);
 2404 
 2405     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2406     // AtomicAdd is not an addressing expression.
 2407     // Cheap to find it by looking for screwy base.
 2408     if (adr->is_AddP() &&
 2409         !adr->in(AddPNode::Base)->is_top() &&
 2410         !adr->in(AddPNode::Offset)->is_Con() &&
 2411         off->get_long() == (int) (off->get_long()) && // immL32
 2412         // Are there other uses besides address expressions?
 2413         !is_visited(adr)) {
 2414       address_visited.set(adr->_idx); // Flag as address_visited
 2415       Node *shift = adr->in(AddPNode::Offset);
 2416       if (!clone_shift(shift, this, mstack, address_visited)) {
 2417         mstack.push(shift, Pre_Visit);
 2418       }
 2419       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2420       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2421     } else {
 2422       mstack.push(adr, Pre_Visit);
 2423     }
 2424 
 2425     // Clone X+offset as it also folds into most addressing expressions
 2426     mstack.push(off, Visit);
 2427     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2428     return true;
 2429   } else if (clone_shift(off, this, mstack, address_visited)) {
 2430     address_visited.test_set(m->_idx); // Flag as address_visited
 2431     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2432     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2433     return true;
 2434   }
 2435   return false;
 2436 }
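// Taken together, this lets an address with the shape base + (index << scale) + constant
// match a single operand of the form [base + index*scale + disp] (for example
// movl reg, [rax + rbx*4 + 16]) rather than first computing the scaled index and
// the displacement into temporary registers.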
 2437 
 2438 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2439   switch (bt) {
 2440     case BoolTest::eq:
 2441       return Assembler::eq;
 2442     case BoolTest::ne:
 2443       return Assembler::neq;
 2444     case BoolTest::le:
 2445     case BoolTest::ule:
 2446       return Assembler::le;
 2447     case BoolTest::ge:
 2448     case BoolTest::uge:
 2449       return Assembler::nlt;
 2450     case BoolTest::lt:
 2451     case BoolTest::ult:
 2452       return Assembler::lt;
 2453     case BoolTest::gt:
 2454     case BoolTest::ugt:
 2455       return Assembler::nle;
 2456     default: ShouldNotReachHere(); return Assembler::_false;
 2457   }
 2458 }
 2459 
 2460 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2461   switch (bt) {
 2462   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2463   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2464   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2465   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2466   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2467   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2468   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2469   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2470   }
 2471 }
 2472 
 2473 // Helper methods for MachSpillCopyNode::implementation().
 2474 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2475                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2476   assert(ireg == Op_VecS || // 32bit vector
 2477          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2478           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2479          "no non-adjacent vector moves" );
 2480   if (masm) {
 2481     switch (ireg) {
 2482     case Op_VecS: // copy whole register
 2483     case Op_VecD:
 2484     case Op_VecX:
 2485       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2486         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2487       } else {
 2488         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2489       }
 2490       break;
 2491     case Op_VecY:
 2492       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2493         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2494       } else {
 2495         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2496       }
 2497       break;
 2498     case Op_VecZ:
 2499       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2500       break;
 2501     default:
 2502       ShouldNotReachHere();
 2503     }
 2504 #ifndef PRODUCT
 2505   } else {
 2506     switch (ireg) {
 2507     case Op_VecS:
 2508     case Op_VecD:
 2509     case Op_VecX:
 2510       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2511       break;
 2512     case Op_VecY:
 2513     case Op_VecZ:
 2514       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2515       break;
 2516     default:
 2517       ShouldNotReachHere();
 2518     }
 2519 #endif
 2520   }
 2521 }
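// Note on the fallback paths above: when UseAVX > 2 but AVX512VL is absent, the
// 128/256-bit EVEX encodings are unavailable, and the upper XMM bank (xmm16-xmm31)
// can only be reached via EVEX. Copying through vextractf32x4/vextractf64x4 at
// lane 0 therefore stands in for the plain (v)movdqu encodings in that case.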
 2522 
 2523 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2524                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2525   if (masm) {
 2526     if (is_load) {
 2527       switch (ireg) {
 2528       case Op_VecS:
 2529         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecD:
 2532         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2533         break;
 2534       case Op_VecX:
 2535         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2536           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2537         } else {
 2538           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2539           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2540         }
 2541         break;
 2542       case Op_VecY:
 2543         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2544           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2545         } else {
 2546           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2547           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2548         }
 2549         break;
 2550       case Op_VecZ:
 2551         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2552         break;
 2553       default:
 2554         ShouldNotReachHere();
 2555       }
 2556     } else { // store
 2557       switch (ireg) {
 2558       case Op_VecS:
 2559         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecD:
 2562         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2563         break;
 2564       case Op_VecX:
 2565         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2566           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2567         }
 2568         else {
 2569           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2570         }
 2571         break;
 2572       case Op_VecY:
 2573         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2574           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2575         }
 2576         else {
 2577           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2578         }
 2579         break;
 2580       case Op_VecZ:
 2581         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2582         break;
 2583       default:
 2584         ShouldNotReachHere();
 2585       }
 2586     }
 2587 #ifndef PRODUCT
 2588   } else {
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594       case Op_VecD:
 2595         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecX:
 2598         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2599         break;
 2600       case Op_VecY:
 2601       case Op_VecZ:
 2602         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2603         break;
 2604       default:
 2605         ShouldNotReachHere();
 2606       }
 2607     } else { // store
 2608       switch (ireg) {
 2609       case Op_VecS:
 2610         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612       case Op_VecD:
 2613         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecX:
 2616         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2617         break;
 2618       case Op_VecY:
 2619       case Op_VecZ:
 2620         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2621         break;
 2622       default:
 2623         ShouldNotReachHere();
 2624       }
 2625     }
 2626 #endif
 2627   }
 2628 }
 2629 
 2630 template <class T>
 2631 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2632   int size = type2aelembytes(bt) * len;
 2633   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2634   for (int i = 0; i < len; i++) {
 2635     int offset = i * type2aelembytes(bt);
 2636     switch (bt) {
 2637       case T_BYTE: val->at(i) = con; break;
 2638       case T_SHORT: {
 2639         jshort c = con;
 2640         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2641         break;
 2642       }
 2643       case T_INT: {
 2644         jint c = con;
 2645         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2646         break;
 2647       }
 2648       case T_LONG: {
 2649         jlong c = con;
 2650         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2651         break;
 2652       }
 2653       case T_FLOAT: {
 2654         jfloat c = con;
 2655         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2656         break;
 2657       }
 2658       case T_DOUBLE: {
 2659         jdouble c = con;
 2660         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2661         break;
 2662       }
 2663       default: assert(false, "%s", type2name(bt));
 2664     }
 2665   }
 2666   return val;
 2667 }
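// A small worked example: vreplicate_imm(T_SHORT, (jshort)0x1234, 4) returns an
// 8-byte array holding the little-endian lane pattern 34 12 34 12 34 12 34 12,
// i.e. the constant broadcast across four 16-bit lanes.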
 2668 
 2669 static inline jlong high_bit_set(BasicType bt) {
 2670   switch (bt) {
 2671     case T_BYTE:  return 0x8080808080808080;
 2672     case T_SHORT: return 0x8000800080008000;
 2673     case T_INT:   return 0x8000000080000000;
 2674     case T_LONG:  return 0x8000000000000000;
 2675     default:
 2676       ShouldNotReachHere();
 2677       return 0;
 2678   }
 2679 }
 2680 
 2681 #ifndef PRODUCT
 2682   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2683     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2684   }
 2685 #endif
 2686 
 2687   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2688     __ nop(_count);
 2689   }
 2690 
 2691   uint MachNopNode::size(PhaseRegAlloc*) const {
 2692     return _count;
 2693   }
 2694 
 2695 #ifndef PRODUCT
 2696   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2697     st->print("# breakpoint");
 2698   }
 2699 #endif
 2700 
 2701   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2702     __ int3();
 2703   }
 2704 
 2705   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2706     return MachNode::size(ra_);
 2707   }
 2708 
 2709 %}
 2710 
 2711 encode %{
 2712 
 2713   enc_class call_epilog %{
 2714     if (VerifyStackAtCalls) {
 2715       // Check that stack depth is unchanged: find majik cookie on stack
 2716       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2717       Label L;
 2718       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2719       __ jccb(Assembler::equal, L);
 2720       // Die if stack mismatch
 2721       __ int3();
 2722       __ bind(L);
 2723     }
 2724     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic() && _method->return_type()->is_loaded()) {
 2725       // The last return value is not set by the callee but used to pass the null marker to compiled code.
 2726       // Search for the corresponding projection, get the register and emit code that initializes it.
 2727       uint con = (tf()->range_cc()->cnt() - 1);
 2728       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2729         ProjNode* proj = fast_out(i)->as_Proj();
 2730         if (proj->_con == con) {
 2731           // Set null marker if rax is non-null (a non-null value is returned buffered or scalarized)
 2732           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2733           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2734           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2735           __ testq(rax, rax);
 2736           __ setb(Assembler::notZero, toReg);
 2737           __ movzbl(toReg, toReg);
 2738           if (reg->is_stack()) {
 2739             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2740             __ movq(Address(rsp, st_off), toReg);
 2741           }
 2742           break;
 2743         }
 2744       }
 2745       if (return_value_is_used()) {
 2746         // An inline type is returned as fields in multiple registers.
 2747         // Rax either contains an oop if the inline type is buffered or a pointer
 2748         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2749         // if the lowest bit is set to allow C2 to use the oop after null checking.
 2750         // rax &= (rax & 1) - 1
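        // That is: if the low bit is set (InlineKlass pointer), (rax & 1) - 1 == 0 and
        // the AND below clears rax; if the low bit is clear (buffered oop), the mask is
        // all ones and rax is left unchanged.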
 2751         __ movptr(rscratch1, rax);
 2752         __ andptr(rscratch1, 0x1);
 2753         __ subptr(rscratch1, 0x1);
 2754         __ andptr(rax, rscratch1);
 2755       }
 2756     }
 2757   %}
 2758 
 2759 %}
 2760 
 2761 // Operands for bound floating point register arguments
 2762 operand rxmm0() %{
 2763   constraint(ALLOC_IN_RC(xmm0_reg));
 2764   match(VecX);
 2765   format %{ %}
 2766   interface(REG_INTER);
 2767 %}
 2768 
 2769 //----------OPERANDS-----------------------------------------------------------
 2770 // Operand definitions must precede instruction definitions for correct parsing
 2771 // in the ADLC because operands constitute user defined types which are used in
 2772 // instruction definitions.
 2773 
 2774 // Vectors
 2775 
 2776 // Dummy generic vector class. Should be used for all vector operands.
 2777 // Replaced with vec[SDXYZ] during post-selection pass.
 2778 operand vec() %{
 2779   constraint(ALLOC_IN_RC(dynamic));
 2780   match(VecX);
 2781   match(VecY);
 2782   match(VecZ);
 2783   match(VecS);
 2784   match(VecD);
 2785 
 2786   format %{ %}
 2787   interface(REG_INTER);
 2788 %}
 2789 
 2790 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2791 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2792 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2793 // runtime code generation via reg_class_dynamic.
 2794 operand legVec() %{
 2795   constraint(ALLOC_IN_RC(dynamic));
 2796   match(VecX);
 2797   match(VecY);
 2798   match(VecZ);
 2799   match(VecS);
 2800   match(VecD);
 2801 
 2802   format %{ %}
 2803   interface(REG_INTER);
 2804 %}
 2805 
 2806 // Replaces vec during post-selection cleanup. See above.
 2807 operand vecS() %{
 2808   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2809   match(VecS);
 2810 
 2811   format %{ %}
 2812   interface(REG_INTER);
 2813 %}
 2814 
 2815 // Replaces legVec during post-selection cleanup. See above.
 2816 operand legVecS() %{
 2817   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2818   match(VecS);
 2819 
 2820   format %{ %}
 2821   interface(REG_INTER);
 2822 %}
 2823 
 2824 // Replaces vec during post-selection cleanup. See above.
 2825 operand vecD() %{
 2826   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2827   match(VecD);
 2828 
 2829   format %{ %}
 2830   interface(REG_INTER);
 2831 %}
 2832 
 2833 // Replaces legVec during post-selection cleanup. See above.
 2834 operand legVecD() %{
 2835   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2836   match(VecD);
 2837 
 2838   format %{ %}
 2839   interface(REG_INTER);
 2840 %}
 2841 
 2842 // Replaces vec during post-selection cleanup. See above.
 2843 operand vecX() %{
 2844   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2845   match(VecX);
 2846 
 2847   format %{ %}
 2848   interface(REG_INTER);
 2849 %}
 2850 
 2851 // Replaces legVec during post-selection cleanup. See above.
 2852 operand legVecX() %{
 2853   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2854   match(VecX);
 2855 
 2856   format %{ %}
 2857   interface(REG_INTER);
 2858 %}
 2859 
 2860 // Replaces vec during post-selection cleanup. See above.
 2861 operand vecY() %{
 2862   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2863   match(VecY);
 2864 
 2865   format %{ %}
 2866   interface(REG_INTER);
 2867 %}
 2868 
 2869 // Replaces legVec during post-selection cleanup. See above.
 2870 operand legVecY() %{
 2871   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2872   match(VecY);
 2873 
 2874   format %{ %}
 2875   interface(REG_INTER);
 2876 %}
 2877 
 2878 // Replaces vec during post-selection cleanup. See above.
 2879 operand vecZ() %{
 2880   constraint(ALLOC_IN_RC(vectorz_reg));
 2881   match(VecZ);
 2882 
 2883   format %{ %}
 2884   interface(REG_INTER);
 2885 %}
 2886 
 2887 // Replaces legVec during post-selection cleanup. See above.
 2888 operand legVecZ() %{
 2889   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2890   match(VecZ);
 2891 
 2892   format %{ %}
 2893   interface(REG_INTER);
 2894 %}
 2895 
 2896 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2897 
 2898 // ============================================================================
 2899 
 2900 instruct ShouldNotReachHere() %{
 2901   match(Halt);
 2902   format %{ "stop\t# ShouldNotReachHere" %}
 2903   ins_encode %{
 2904     if (is_reachable()) {
 2905       const char* str = __ code_string(_halt_reason);
 2906       __ stop(str);
 2907     }
 2908   %}
 2909   ins_pipe(pipe_slow);
 2910 %}
 2911 
 2912 // ============================================================================
 2913 
 2914 instruct addF_reg(regF dst, regF src) %{
 2915   predicate(UseAVX == 0);
 2916   match(Set dst (AddF dst src));
 2917 
 2918   format %{ "addss   $dst, $src" %}
 2919   ins_cost(150);
 2920   ins_encode %{
 2921     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2922   %}
 2923   ins_pipe(pipe_slow);
 2924 %}
 2925 
 2926 instruct addF_mem(regF dst, memory src) %{
 2927   predicate(UseAVX == 0);
 2928   match(Set dst (AddF dst (LoadF src)));
 2929 
 2930   format %{ "addss   $dst, $src" %}
 2931   ins_cost(150);
 2932   ins_encode %{
 2933     __ addss($dst$$XMMRegister, $src$$Address);
 2934   %}
 2935   ins_pipe(pipe_slow);
 2936 %}
 2937 
 2938 instruct addF_imm(regF dst, immF con) %{
 2939   predicate(UseAVX == 0);
 2940   match(Set dst (AddF dst con));
 2941   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2942   ins_cost(150);
 2943   ins_encode %{
 2944     __ addss($dst$$XMMRegister, $constantaddress($con));
 2945   %}
 2946   ins_pipe(pipe_slow);
 2947 %}
 2948 
 2949 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2950   predicate(UseAVX > 0);
 2951   match(Set dst (AddF src1 src2));
 2952 
 2953   format %{ "vaddss  $dst, $src1, $src2" %}
 2954   ins_cost(150);
 2955   ins_encode %{
 2956     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2957   %}
 2958   ins_pipe(pipe_slow);
 2959 %}
 2960 
 2961 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2962   predicate(UseAVX > 0);
 2963   match(Set dst (AddF src1 (LoadF src2)));
 2964 
 2965   format %{ "vaddss  $dst, $src1, $src2" %}
 2966   ins_cost(150);
 2967   ins_encode %{
 2968     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2969   %}
 2970   ins_pipe(pipe_slow);
 2971 %}
 2972 
 2973 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2974   predicate(UseAVX > 0);
 2975   match(Set dst (AddF src con));
 2976 
 2977   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2978   ins_cost(150);
 2979   ins_encode %{
 2980     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2981   %}
 2982   ins_pipe(pipe_slow);
 2983 %}
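// Note: the UseAVX == 0 rules above use the destructive two-operand SSE encodings
// (dst := dst op src), while the UseAVX > 0 rules use the non-destructive
// three-operand VEX encodings (dst := src1 op src2). The same split applies to the
// double, subtract, multiply and divide rules that follow.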
 2984 
 2985 instruct addD_reg(regD dst, regD src) %{
 2986   predicate(UseAVX == 0);
 2987   match(Set dst (AddD dst src));
 2988 
 2989   format %{ "addsd   $dst, $src" %}
 2990   ins_cost(150);
 2991   ins_encode %{
 2992     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2993   %}
 2994   ins_pipe(pipe_slow);
 2995 %}
 2996 
 2997 instruct addD_mem(regD dst, memory src) %{
 2998   predicate(UseAVX == 0);
 2999   match(Set dst (AddD dst (LoadD src)));
 3000 
 3001   format %{ "addsd   $dst, $src" %}
 3002   ins_cost(150);
 3003   ins_encode %{
 3004     __ addsd($dst$$XMMRegister, $src$$Address);
 3005   %}
 3006   ins_pipe(pipe_slow);
 3007 %}
 3008 
 3009 instruct addD_imm(regD dst, immD con) %{
 3010   predicate(UseAVX == 0);
 3011   match(Set dst (AddD dst con));
 3012   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3013   ins_cost(150);
 3014   ins_encode %{
 3015     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3016   %}
 3017   ins_pipe(pipe_slow);
 3018 %}
 3019 
 3020 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3021   predicate(UseAVX > 0);
 3022   match(Set dst (AddD src1 src2));
 3023 
 3024   format %{ "vaddsd  $dst, $src1, $src2" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3033   predicate(UseAVX > 0);
 3034   match(Set dst (AddD src1 (LoadD src2)));
 3035 
 3036   format %{ "vaddsd  $dst, $src1, $src2" %}
 3037   ins_cost(150);
 3038   ins_encode %{
 3039     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3040   %}
 3041   ins_pipe(pipe_slow);
 3042 %}
 3043 
 3044 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3045   predicate(UseAVX > 0);
 3046   match(Set dst (AddD src con));
 3047 
 3048   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3049   ins_cost(150);
 3050   ins_encode %{
 3051     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3052   %}
 3053   ins_pipe(pipe_slow);
 3054 %}
 3055 
 3056 instruct subF_reg(regF dst, regF src) %{
 3057   predicate(UseAVX == 0);
 3058   match(Set dst (SubF dst src));
 3059 
 3060   format %{ "subss   $dst, $src" %}
 3061   ins_cost(150);
 3062   ins_encode %{
 3063     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3064   %}
 3065   ins_pipe(pipe_slow);
 3066 %}
 3067 
 3068 instruct subF_mem(regF dst, memory src) %{
 3069   predicate(UseAVX == 0);
 3070   match(Set dst (SubF dst (LoadF src)));
 3071 
 3072   format %{ "subss   $dst, $src" %}
 3073   ins_cost(150);
 3074   ins_encode %{
 3075     __ subss($dst$$XMMRegister, $src$$Address);
 3076   %}
 3077   ins_pipe(pipe_slow);
 3078 %}
 3079 
 3080 instruct subF_imm(regF dst, immF con) %{
 3081   predicate(UseAVX == 0);
 3082   match(Set dst (SubF dst con));
 3083   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3084   ins_cost(150);
 3085   ins_encode %{
 3086     __ subss($dst$$XMMRegister, $constantaddress($con));
 3087   %}
 3088   ins_pipe(pipe_slow);
 3089 %}
 3090 
 3091 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3092   predicate(UseAVX > 0);
 3093   match(Set dst (SubF src1 src2));
 3094 
 3095   format %{ "vsubss  $dst, $src1, $src2" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3104   predicate(UseAVX > 0);
 3105   match(Set dst (SubF src1 (LoadF src2)));
 3106 
 3107   format %{ "vsubss  $dst, $src1, $src2" %}
 3108   ins_cost(150);
 3109   ins_encode %{
 3110     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3111   %}
 3112   ins_pipe(pipe_slow);
 3113 %}
 3114 
 3115 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3116   predicate(UseAVX > 0);
 3117   match(Set dst (SubF src con));
 3118 
 3119   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3120   ins_cost(150);
 3121   ins_encode %{
 3122     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3123   %}
 3124   ins_pipe(pipe_slow);
 3125 %}
 3126 
 3127 instruct subD_reg(regD dst, regD src) %{
 3128   predicate(UseAVX == 0);
 3129   match(Set dst (SubD dst src));
 3130 
 3131   format %{ "subsd   $dst, $src" %}
 3132   ins_cost(150);
 3133   ins_encode %{
 3134     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3135   %}
 3136   ins_pipe(pipe_slow);
 3137 %}
 3138 
 3139 instruct subD_mem(regD dst, memory src) %{
 3140   predicate(UseAVX == 0);
 3141   match(Set dst (SubD dst (LoadD src)));
 3142 
 3143   format %{ "subsd   $dst, $src" %}
 3144   ins_cost(150);
 3145   ins_encode %{
 3146     __ subsd($dst$$XMMRegister, $src$$Address);
 3147   %}
 3148   ins_pipe(pipe_slow);
 3149 %}
 3150 
 3151 instruct subD_imm(regD dst, immD con) %{
 3152   predicate(UseAVX == 0);
 3153   match(Set dst (SubD dst con));
 3154   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3155   ins_cost(150);
 3156   ins_encode %{
 3157     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3158   %}
 3159   ins_pipe(pipe_slow);
 3160 %}
 3161 
 3162 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3163   predicate(UseAVX > 0);
 3164   match(Set dst (SubD src1 src2));
 3165 
 3166   format %{ "vsubsd  $dst, $src1, $src2" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3175   predicate(UseAVX > 0);
 3176   match(Set dst (SubD src1 (LoadD src2)));
 3177 
 3178   format %{ "vsubsd  $dst, $src1, $src2" %}
 3179   ins_cost(150);
 3180   ins_encode %{
 3181     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3182   %}
 3183   ins_pipe(pipe_slow);
 3184 %}
 3185 
 3186 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3187   predicate(UseAVX > 0);
 3188   match(Set dst (SubD src con));
 3189 
 3190   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3191   ins_cost(150);
 3192   ins_encode %{
 3193     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3194   %}
 3195   ins_pipe(pipe_slow);
 3196 %}
 3197 
 3198 instruct mulF_reg(regF dst, regF src) %{
 3199   predicate(UseAVX == 0);
 3200   match(Set dst (MulF dst src));
 3201 
 3202   format %{ "mulss   $dst, $src" %}
 3203   ins_cost(150);
 3204   ins_encode %{
 3205     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3206   %}
 3207   ins_pipe(pipe_slow);
 3208 %}
 3209 
 3210 instruct mulF_mem(regF dst, memory src) %{
 3211   predicate(UseAVX == 0);
 3212   match(Set dst (MulF dst (LoadF src)));
 3213 
 3214   format %{ "mulss   $dst, $src" %}
 3215   ins_cost(150);
 3216   ins_encode %{
 3217     __ mulss($dst$$XMMRegister, $src$$Address);
 3218   %}
 3219   ins_pipe(pipe_slow);
 3220 %}
 3221 
 3222 instruct mulF_imm(regF dst, immF con) %{
 3223   predicate(UseAVX == 0);
 3224   match(Set dst (MulF dst con));
 3225   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3226   ins_cost(150);
 3227   ins_encode %{
 3228     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3229   %}
 3230   ins_pipe(pipe_slow);
 3231 %}
 3232 
 3233 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3234   predicate(UseAVX > 0);
 3235   match(Set dst (MulF src1 src2));
 3236 
 3237   format %{ "vmulss  $dst, $src1, $src2" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3246   predicate(UseAVX > 0);
 3247   match(Set dst (MulF src1 (LoadF src2)));
 3248 
 3249   format %{ "vmulss  $dst, $src1, $src2" %}
 3250   ins_cost(150);
 3251   ins_encode %{
 3252     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3253   %}
 3254   ins_pipe(pipe_slow);
 3255 %}
 3256 
 3257 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3258   predicate(UseAVX > 0);
 3259   match(Set dst (MulF src con));
 3260 
 3261   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3262   ins_cost(150);
 3263   ins_encode %{
 3264     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3265   %}
 3266   ins_pipe(pipe_slow);
 3267 %}
 3268 
 3269 instruct mulD_reg(regD dst, regD src) %{
 3270   predicate(UseAVX == 0);
 3271   match(Set dst (MulD dst src));
 3272 
 3273   format %{ "mulsd   $dst, $src" %}
 3274   ins_cost(150);
 3275   ins_encode %{
 3276     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3277   %}
 3278   ins_pipe(pipe_slow);
 3279 %}
 3280 
 3281 instruct mulD_mem(regD dst, memory src) %{
 3282   predicate(UseAVX == 0);
 3283   match(Set dst (MulD dst (LoadD src)));
 3284 
 3285   format %{ "mulsd   $dst, $src" %}
 3286   ins_cost(150);
 3287   ins_encode %{
 3288     __ mulsd($dst$$XMMRegister, $src$$Address);
 3289   %}
 3290   ins_pipe(pipe_slow);
 3291 %}
 3292 
 3293 instruct mulD_imm(regD dst, immD con) %{
 3294   predicate(UseAVX == 0);
 3295   match(Set dst (MulD dst con));
 3296   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3297   ins_cost(150);
 3298   ins_encode %{
 3299     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3300   %}
 3301   ins_pipe(pipe_slow);
 3302 %}
 3303 
 3304 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3305   predicate(UseAVX > 0);
 3306   match(Set dst (MulD src1 src2));
 3307 
 3308   format %{ "vmulsd  $dst, $src1, $src2" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3317   predicate(UseAVX > 0);
 3318   match(Set dst (MulD src1 (LoadD src2)));
 3319 
 3320   format %{ "vmulsd  $dst, $src1, $src2" %}
 3321   ins_cost(150);
 3322   ins_encode %{
 3323     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3324   %}
 3325   ins_pipe(pipe_slow);
 3326 %}
 3327 
 3328 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3329   predicate(UseAVX > 0);
 3330   match(Set dst (MulD src con));
 3331 
 3332   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3333   ins_cost(150);
 3334   ins_encode %{
 3335     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3336   %}
 3337   ins_pipe(pipe_slow);
 3338 %}
 3339 
 3340 instruct divF_reg(regF dst, regF src) %{
 3341   predicate(UseAVX == 0);
 3342   match(Set dst (DivF dst src));
 3343 
 3344   format %{ "divss   $dst, $src" %}
 3345   ins_cost(150);
 3346   ins_encode %{
 3347     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3348   %}
 3349   ins_pipe(pipe_slow);
 3350 %}
 3351 
 3352 instruct divF_mem(regF dst, memory src) %{
 3353   predicate(UseAVX == 0);
 3354   match(Set dst (DivF dst (LoadF src)));
 3355 
 3356   format %{ "divss   $dst, $src" %}
 3357   ins_cost(150);
 3358   ins_encode %{
 3359     __ divss($dst$$XMMRegister, $src$$Address);
 3360   %}
 3361   ins_pipe(pipe_slow);
 3362 %}
 3363 
 3364 instruct divF_imm(regF dst, immF con) %{
 3365   predicate(UseAVX == 0);
 3366   match(Set dst (DivF dst con));
 3367   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3368   ins_cost(150);
 3369   ins_encode %{
 3370     __ divss($dst$$XMMRegister, $constantaddress($con));
 3371   %}
 3372   ins_pipe(pipe_slow);
 3373 %}
 3374 
 3375 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3376   predicate(UseAVX > 0);
 3377   match(Set dst (DivF src1 src2));
 3378 
 3379   format %{ "vdivss  $dst, $src1, $src2" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3388   predicate(UseAVX > 0);
 3389   match(Set dst (DivF src1 (LoadF src2)));
 3390 
 3391   format %{ "vdivss  $dst, $src1, $src2" %}
 3392   ins_cost(150);
 3393   ins_encode %{
 3394     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3395   %}
 3396   ins_pipe(pipe_slow);
 3397 %}
 3398 
 3399 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3400   predicate(UseAVX > 0);
 3401   match(Set dst (DivF src con));
 3402 
 3403   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3404   ins_cost(150);
 3405   ins_encode %{
 3406     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3407   %}
 3408   ins_pipe(pipe_slow);
 3409 %}
 3410 
 3411 instruct divD_reg(regD dst, regD src) %{
 3412   predicate(UseAVX == 0);
 3413   match(Set dst (DivD dst src));
 3414 
 3415   format %{ "divsd   $dst, $src" %}
 3416   ins_cost(150);
 3417   ins_encode %{
 3418     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3419   %}
 3420   ins_pipe(pipe_slow);
 3421 %}
 3422 
 3423 instruct divD_mem(regD dst, memory src) %{
 3424   predicate(UseAVX == 0);
 3425   match(Set dst (DivD dst (LoadD src)));
 3426 
 3427   format %{ "divsd   $dst, $src" %}
 3428   ins_cost(150);
 3429   ins_encode %{
 3430     __ divsd($dst$$XMMRegister, $src$$Address);
 3431   %}
 3432   ins_pipe(pipe_slow);
 3433 %}
 3434 
 3435 instruct divD_imm(regD dst, immD con) %{
 3436   predicate(UseAVX == 0);
 3437   match(Set dst (DivD dst con));
 3438   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3439   ins_cost(150);
 3440   ins_encode %{
 3441     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3442   %}
 3443   ins_pipe(pipe_slow);
 3444 %}
 3445 
 3446 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3447   predicate(UseAVX > 0);
 3448   match(Set dst (DivD src1 src2));
 3449 
 3450   format %{ "vdivsd  $dst, $src1, $src2" %}
 3451   ins_cost(150);
 3452   ins_encode %{
 3453     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3454   %}
 3455   ins_pipe(pipe_slow);
 3456 %}
 3457 
 3458 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3459   predicate(UseAVX > 0);
 3460   match(Set dst (DivD src1 (LoadD src2)));
 3461 
 3462   format %{ "vdivsd  $dst, $src1, $src2" %}
 3463   ins_cost(150);
 3464   ins_encode %{
 3465     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3466   %}
 3467   ins_pipe(pipe_slow);
 3468 %}
 3469 
 3470 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3471   predicate(UseAVX > 0);
 3472   match(Set dst (DivD src con));
 3473 
 3474   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3475   ins_cost(150);
 3476   ins_encode %{
 3477     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3478   %}
 3479   ins_pipe(pipe_slow);
 3480 %}
 3481 
 3482 instruct absF_reg(regF dst) %{
 3483   predicate(UseAVX == 0);
 3484   match(Set dst (AbsF dst));
 3485   ins_cost(150);
 3486   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3487   ins_encode %{
 3488     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3489   %}
 3490   ins_pipe(pipe_slow);
 3491 %}
 3492 
 3493 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3494   predicate(UseAVX > 0);
 3495   match(Set dst (AbsF src));
 3496   ins_cost(150);
 3497   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3498   ins_encode %{
 3499     int vlen_enc = Assembler::AVX_128bit;
 3500     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3501               ExternalAddress(float_signmask()), vlen_enc);
 3502   %}
 3503   ins_pipe(pipe_slow);
 3504 %}
 3505 
 3506 instruct absD_reg(regD dst) %{
 3507   predicate(UseAVX == 0);
 3508   match(Set dst (AbsD dst));
 3509   ins_cost(150);
 3510   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3511             "# abs double by sign masking" %}
 3512   ins_encode %{
 3513     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3514   %}
 3515   ins_pipe(pipe_slow);
 3516 %}
 3517 
 3518 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3519   predicate(UseAVX > 0);
 3520   match(Set dst (AbsD src));
 3521   ins_cost(150);
 3522   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3523             "# abs double by sign masking" %}
 3524   ins_encode %{
 3525     int vlen_enc = Assembler::AVX_128bit;
 3526     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3527               ExternalAddress(double_signmask()), vlen_enc);
 3528   %}
 3529   ins_pipe(pipe_slow);
 3530 %}
 3531 
 3532 instruct negF_reg(regF dst) %{
 3533   predicate(UseAVX == 0);
 3534   match(Set dst (NegF dst));
 3535   ins_cost(150);
 3536   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3537   ins_encode %{
 3538     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3539   %}
 3540   ins_pipe(pipe_slow);
 3541 %}
 3542 
 3543 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3544   predicate(UseAVX > 0);
 3545   match(Set dst (NegF src));
 3546   ins_cost(150);
 3547   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3548   ins_encode %{
 3549     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3550                  ExternalAddress(float_signflip()));
 3551   %}
 3552   ins_pipe(pipe_slow);
 3553 %}
 3554 
 3555 instruct negD_reg(regD dst) %{
 3556   predicate(UseAVX == 0);
 3557   match(Set dst (NegD dst));
 3558   ins_cost(150);
 3559   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3560             "# neg double by sign flipping" %}
 3561   ins_encode %{
 3562     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3563   %}
 3564   ins_pipe(pipe_slow);
 3565 %}
 3566 
 3567 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3568   predicate(UseAVX > 0);
 3569   match(Set dst (NegD src));
 3570   ins_cost(150);
 3571   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3572             "# neg double by sign flipping" %}
 3573   ins_encode %{
 3574     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3575                  ExternalAddress(double_signflip()));
 3576   %}
 3577   ins_pipe(pipe_slow);
 3578 %}
 3579 
 3580 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3581 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3582 instruct sqrtF_reg(regF dst) %{
 3583   match(Set dst (SqrtF dst));
 3584   format %{ "sqrtss  $dst, $dst" %}
 3585   ins_encode %{
 3586     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3587   %}
 3588   ins_pipe(pipe_slow);
 3589 %}
 3590 
 3591 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3592 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3593 instruct sqrtD_reg(regD dst) %{
 3594   match(Set dst (SqrtD dst));
 3595   format %{ "sqrtsd  $dst, $dst" %}
 3596   ins_encode %{
 3597     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3598   %}
 3599   ins_pipe(pipe_slow);
 3600 %}
 3601 
 3602 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3603   effect(TEMP tmp);
 3604   match(Set dst (ConvF2HF src));
 3605   ins_cost(125);
 3606   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3607   ins_encode %{
 3608     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3609   %}
 3610   ins_pipe( pipe_slow );
 3611 %}
 3612 
 3613 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3614   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3615   effect(TEMP ktmp, TEMP rtmp);
 3616   match(Set mem (StoreC mem (ConvF2HF src)));
 3617   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3618   ins_encode %{
 3619     __ movl($rtmp$$Register, 0x1);
 3620     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3621     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3622   %}
 3623   ins_pipe( pipe_slow );
 3624 %}
 3625 
 3626 instruct vconvF2HF(vec dst, vec src) %{
 3627   match(Set dst (VectorCastF2HF src));
 3628   format %{ "vector_conv_F2HF $dst $src" %}
 3629   ins_encode %{
 3630     int vlen_enc = vector_length_encoding(this, $src);
 3631     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3632   %}
 3633   ins_pipe( pipe_slow );
 3634 %}
 3635 
 3636 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3637   predicate(n->as_StoreVector()->memory_size() >= 16);
 3638   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3639   format %{ "vcvtps2ph $mem,$src" %}
 3640   ins_encode %{
 3641     int vlen_enc = vector_length_encoding(this, $src);
 3642     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3643   %}
 3644   ins_pipe( pipe_slow );
 3645 %}
 3646 
 3647 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3648   match(Set dst (ConvHF2F src));
 3649   format %{ "vcvtph2ps $dst,$src" %}
 3650   ins_encode %{
 3651     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3652   %}
 3653   ins_pipe( pipe_slow );
 3654 %}
 3655 
 3656 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3657   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3658   format %{ "vcvtph2ps $dst,$mem" %}
 3659   ins_encode %{
 3660     int vlen_enc = vector_length_encoding(this);
 3661     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3662   %}
 3663   ins_pipe( pipe_slow );
 3664 %}
 3665 
 3666 instruct vconvHF2F(vec dst, vec src) %{
 3667   match(Set dst (VectorCastHF2F src));
 3668   ins_cost(125);
 3669   format %{ "vector_conv_HF2F $dst,$src" %}
 3670   ins_encode %{
 3671     int vlen_enc = vector_length_encoding(this);
 3672     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3673   %}
 3674   ins_pipe( pipe_slow );
 3675 %}
 3676 
 3677 // ---------------------------------------- VectorReinterpret ------------------------------------
 3678 instruct reinterpret_mask(kReg dst) %{
 3679   predicate(n->bottom_type()->isa_vectmask() &&
 3680             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3681   match(Set dst (VectorReinterpret dst));
 3682   ins_cost(125);
 3683   format %{ "vector_reinterpret $dst\t!" %}
 3684   ins_encode %{
 3685     // empty
 3686   %}
 3687   ins_pipe( pipe_slow );
 3688 %}
 3689 
 3690 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3691   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3692             n->bottom_type()->isa_vectmask() &&
 3693             n->in(1)->bottom_type()->isa_vectmask() &&
 3694             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3695             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src masks cover the same size in bytes
 3696   match(Set dst (VectorReinterpret src));
 3697   effect(TEMP xtmp);
 3698   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3699   ins_encode %{
 3700      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3701      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3702      assert(src_sz == dst_sz , "src and dst size mismatch");
 3703      int vlen_enc = vector_length_encoding(src_sz);
 3704      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3705      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3706   %}
 3707   ins_pipe( pipe_slow );
 3708 %}
 3709 
 3710 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3711   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3712             n->bottom_type()->isa_vectmask() &&
 3713             n->in(1)->bottom_type()->isa_vectmask() &&
 3714             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3715              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3716             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src masks cover the same size in bytes
 3717   match(Set dst (VectorReinterpret src));
 3718   effect(TEMP xtmp);
 3719   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3720   ins_encode %{
 3721      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3722      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3723      assert(src_sz == dst_sz , "src and dst size mismatch");
 3724      int vlen_enc = vector_length_encoding(src_sz);
 3725      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3726      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3732   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3733             n->bottom_type()->isa_vectmask() &&
 3734             n->in(1)->bottom_type()->isa_vectmask() &&
 3735             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3736              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3737             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src masks cover the same size in bytes
 3738   match(Set dst (VectorReinterpret src));
 3739   effect(TEMP xtmp);
 3740   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3741   ins_encode %{
 3742      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3743      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3744      assert(src_sz == dst_sz , "src and dst size mismatch");
 3745      int vlen_enc = vector_length_encoding(src_sz);
 3746      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3747      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3748   %}
 3749   ins_pipe( pipe_slow );
 3750 %}
 3751 
 3752 instruct reinterpret(vec dst) %{
 3753   predicate(!n->bottom_type()->isa_vectmask() &&
 3754             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3755   match(Set dst (VectorReinterpret dst));
 3756   ins_cost(125);
 3757   format %{ "vector_reinterpret $dst\t!" %}
 3758   ins_encode %{
 3759     // empty
 3760   %}
 3761   ins_pipe( pipe_slow );
 3762 %}
 3763 
 3764 instruct reinterpret_expand(vec dst, vec src) %{
 3765   predicate(UseAVX == 0 &&
 3766             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3767   match(Set dst (VectorReinterpret src));
 3768   ins_cost(125);
 3769   effect(TEMP dst);
 3770   format %{ "vector_reinterpret_expand $dst,$src" %}
 3771   ins_encode %{
 3772     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3773     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3774 
 3775     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3776     if (src_vlen_in_bytes == 4) {
 3777       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3778     } else {
 3779       assert(src_vlen_in_bytes == 8, "");
 3780       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3781     }
 3782     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3783   %}
 3784   ins_pipe( pipe_slow );
 3785 %}
 3786 
 3787 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3788   predicate(UseAVX > 0 &&
 3789             !n->bottom_type()->isa_vectmask() &&
 3790             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3791             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3792   match(Set dst (VectorReinterpret src));
 3793   ins_cost(125);
 3794   format %{ "vector_reinterpret_expand $dst,$src" %}
 3795   ins_encode %{
 3796     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3797   %}
 3798   ins_pipe( pipe_slow );
 3799 %}
 3800 
 3801 
 3802 instruct vreinterpret_expand(legVec dst, vec src) %{
 3803   predicate(UseAVX > 0 &&
 3804             !n->bottom_type()->isa_vectmask() &&
 3805             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3806             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3807   match(Set dst (VectorReinterpret src));
 3808   ins_cost(125);
 3809   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3810   ins_encode %{
 3811     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3812       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3813       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3814       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3815       default: ShouldNotReachHere();
 3816     }
 3817   %}
 3818   ins_pipe( pipe_slow );
 3819 %}
 3820 
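      // Shrinking reinterpret: copy only the low destination-sized part of the source.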
 3821 instruct reinterpret_shrink(vec dst, legVec src) %{
 3822   predicate(!n->bottom_type()->isa_vectmask() &&
 3823             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3824   match(Set dst (VectorReinterpret src));
 3825   ins_cost(125);
 3826   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3827   ins_encode %{
 3828     switch (Matcher::vector_length_in_bytes(this)) {
 3829       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3830       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3831       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3832       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3833       default: ShouldNotReachHere();
 3834     }
 3835   %}
 3836   ins_pipe( pipe_slow );
 3837 %}
 3838 
 3839 // ----------------------------------------------------------------------------------------------------
 3840 
 3841 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3842   match(Set dst (RoundDoubleMode src rmode));
 3843   format %{ "roundsd $dst,$src" %}
 3844   ins_cost(150);
 3845   ins_encode %{
 3846     assert(UseSSE >= 4, "required");
 3847     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3848       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3849     }
 3850     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3851   %}
 3852   ins_pipe(pipe_slow);
 3853 %}
 3854 
 3855 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3856   match(Set dst (RoundDoubleMode con rmode));
 3857   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3858   ins_cost(150);
 3859   ins_encode %{
 3860     assert(UseSSE >= 4, "required");
 3861     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3862   %}
 3863   ins_pipe(pipe_slow);
 3864 %}
 3865 
 3866 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3867   predicate(Matcher::vector_length(n) < 8);
 3868   match(Set dst (RoundDoubleModeV src rmode));
 3869   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3870   ins_encode %{
 3871     assert(UseAVX > 0, "required");
 3872     int vlen_enc = vector_length_encoding(this);
 3873     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3874   %}
 3875   ins_pipe( pipe_slow );
 3876 %}
 3877 
 3878 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3879   predicate(Matcher::vector_length(n) == 8);
 3880   match(Set dst (RoundDoubleModeV src rmode));
 3881   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3882   ins_encode %{
 3883     assert(UseAVX > 2, "required");
 3884     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3885   %}
 3886   ins_pipe( pipe_slow );
 3887 %}
 3888 
 3889 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3890   predicate(Matcher::vector_length(n) < 8);
 3891   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3892   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3893   ins_encode %{
 3894     assert(UseAVX > 0, "required");
 3895     int vlen_enc = vector_length_encoding(this);
 3896     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3897   %}
 3898   ins_pipe( pipe_slow );
 3899 %}
 3900 
 3901 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3902   predicate(Matcher::vector_length(n) == 8);
 3903   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3904   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3905   ins_encode %{
 3906     assert(UseAVX > 2, "required");
 3907     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3908   %}
 3909   ins_pipe( pipe_slow );
 3910 %}
 3911 
 3912 instruct onspinwait() %{
 3913   match(OnSpinWait);
 3914   ins_cost(200);
 3915 
 3916   format %{
 3917     $$template
 3918     $$emit$$"pause\t! membar_onspinwait"
 3919   %}
 3920   ins_encode %{
 3921     __ pause();
 3922   %}
 3923   ins_pipe(pipe_slow);
 3924 %}
 3925 
 3926 // a * b + c
 3927 instruct fmaD_reg(regD a, regD b, regD c) %{
 3928   match(Set c (FmaD  c (Binary a b)));
 3929   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3930   ins_cost(150);
 3931   ins_encode %{
 3932     assert(UseFMA, "needs FMA instruction support");
 3933     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3934   %}
 3935   ins_pipe( pipe_slow );
 3936 %}
 3937 
 3938 // a * b + c
 3939 instruct fmaF_reg(regF a, regF b, regF c) %{
 3940   match(Set c (FmaF  c (Binary a b)));
 3941   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3942   ins_cost(150);
 3943   ins_encode %{
 3944     assert(UseFMA, "needs FMA instruction support");
 3945     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3946   %}
 3947   ins_pipe( pipe_slow );
 3948 %}
 3949 
 3950 // ====================VECTOR INSTRUCTIONS=====================================
 3951 
 3952 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3953 instruct MoveVec2Leg(legVec dst, vec src) %{
 3954   match(Set dst src);
 3955   format %{ "" %}
 3956   ins_encode %{
 3957     ShouldNotReachHere();
 3958   %}
 3959   ins_pipe( fpu_reg_reg );
 3960 %}
 3961 
 3962 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3963   match(Set dst src);
 3964   format %{ "" %}
 3965   ins_encode %{
 3966     ShouldNotReachHere();
 3967   %}
 3968   ins_pipe( fpu_reg_reg );
 3969 %}
 3970 
 3971 // ============================================================================
 3972 
 3973 // Load vectors generic operand pattern
 3974 instruct loadV(vec dst, memory mem) %{
 3975   match(Set dst (LoadVector mem));
 3976   ins_cost(125);
 3977   format %{ "load_vector $dst,$mem" %}
 3978   ins_encode %{
 3979     BasicType bt = Matcher::vector_element_basic_type(this);
 3980     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3981   %}
 3982   ins_pipe( pipe_slow );
 3983 %}
 3984 
 3985 // Store vectors generic operand pattern.
 3986 instruct storeV(memory mem, vec src) %{
 3987   match(Set mem (StoreVector mem src));
 3988   ins_cost(145);
 3989   format %{ "store_vector $mem,$src" %}
 3990   ins_encode %{
 3991     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3992       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3993       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3994       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3995       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3996       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3997       default: ShouldNotReachHere();
 3998     }
 3999   %}
 4000   ins_pipe( pipe_slow );
 4001 %}
 4002 
 4003 // ---------------------------------------- Gather ------------------------------------
 4004 
 4005 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4006 
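      // Gather of 32/64-bit elements without AVX512VL: the gather instruction needs an explicit lane
      // mask, so an all-ones vector mask is materialized first to gather every lane.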
 4007 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4008   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4009             Matcher::vector_length_in_bytes(n) <= 32);
 4010   match(Set dst (LoadVectorGather mem idx));
 4011   effect(TEMP dst, TEMP tmp, TEMP mask);
 4012   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4013   ins_encode %{
 4014     int vlen_enc = vector_length_encoding(this);
 4015     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4016     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4017     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4018     __ lea($tmp$$Register, $mem$$Address);
 4019     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4020   %}
 4021   ins_pipe( pipe_slow );
 4022 %}
 4023 
 4024 
 4025 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4026   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4027             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4028   match(Set dst (LoadVectorGather mem idx));
 4029   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4030   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4031   ins_encode %{
 4032     int vlen_enc = vector_length_encoding(this);
 4033     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4034     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4035     __ lea($tmp$$Register, $mem$$Address);
 4036     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4037   %}
 4038   ins_pipe( pipe_slow );
 4039 %}
 4040 
 4041 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4042   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4043             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4044   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4045   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4046   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4047   ins_encode %{
 4048     assert(UseAVX > 2, "sanity");
 4049     int vlen_enc = vector_length_encoding(this);
 4050     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4051     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4052     // Note: Since the gather instruction partially updates the opmask register used
 4053     // for predication, the mask operand is first copied to a temporary.
 4054     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4055     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4056     __ lea($tmp$$Register, $mem$$Address);
 4057     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4058   %}
 4059   ins_pipe( pipe_slow );
 4060 %}
 4061 
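      // Subword (byte/short) gathers have no hardware gather instruction, so elements are fetched one
      // at a time through the index array and packed into the destination.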
 4062 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{
 4063   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4064   match(Set dst (LoadVectorGather mem idx_base));
 4065   effect(TEMP tmp, TEMP rtmp);
 4066   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4067   ins_encode %{
 4068     int vlen_enc = vector_length_encoding(this);
 4069     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc);
 4072   %}
 4073   ins_pipe( pipe_slow );
 4074 %}
 4075 
 4076 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp,
 4077                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4078   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4079   match(Set dst (LoadVectorGather mem idx_base));
 4080   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4081   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4082   ins_encode %{
 4083     int vlen_enc = vector_length_encoding(this);
 4084     int vector_len = Matcher::vector_length(this);
 4085     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4086     __ lea($tmp$$Register, $mem$$Address);
 4087     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4088     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister,
 4089                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4090   %}
 4091   ins_pipe( pipe_slow );
 4092 %}
 4093 
 4094 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4095   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4096   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4097   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4098   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4099   ins_encode %{
 4100     int vlen_enc = vector_length_encoding(this);
 4101     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4102     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4103     __ lea($tmp$$Register, $mem$$Address);
 4104     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4105     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4106   %}
 4107   ins_pipe( pipe_slow );
 4108 %}
 4109 
 4110 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4111                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4112   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4113   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4114   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4115   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4116   ins_encode %{
 4117     int vlen_enc = vector_length_encoding(this);
 4118     int vector_len = Matcher::vector_length(this);
 4119     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4120     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4121     __ lea($tmp$$Register, $mem$$Address);
 4122     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4123     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4124     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4125                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4126   %}
 4127   ins_pipe( pipe_slow );
 4128 %}
 4129 
 4130 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4131   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4132   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4133   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4134   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4135   ins_encode %{
 4136     int vlen_enc = vector_length_encoding(this);
 4137     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4138     __ lea($tmp$$Register, $mem$$Address);
 4139     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4140     if (elem_bt == T_SHORT) {
 4141       __ movl($mask_idx$$Register, 0x55555555);
 4142       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4143     }
 4144     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4145     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4146   %}
 4147   ins_pipe( pipe_slow );
 4148 %}
 4149 
 4150 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp,
 4151                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4152   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4153   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4154   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4155   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4156   ins_encode %{
 4157     int vlen_enc = vector_length_encoding(this);
 4158     int vector_len = Matcher::vector_length(this);
 4159     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4160     __ lea($tmp$$Register, $mem$$Address);
 4161     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4162     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4163     if (elem_bt == T_SHORT) {
 4164       __ movl($mask_idx$$Register, 0x55555555);
 4165       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4166     }
 4167     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4168     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4169                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4170   %}
 4171   ins_pipe( pipe_slow );
 4172 %}
 4173 
 4174 // ====================Scatter=======================================
 4175 
 4176 // Scatter INT, LONG, FLOAT, DOUBLE
 4177 
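      // Scatter uses the AVX-512 scatter instructions; an all-ones opmask is loaded so every lane of
      // the source is stored.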
 4178 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4179   predicate(UseAVX > 2);
 4180   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4181   effect(TEMP tmp, TEMP ktmp);
 4182   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4183   ins_encode %{
 4184     int vlen_enc = vector_length_encoding(this, $src);
 4185     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4186 
 4187     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4188     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4189 
 4190     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4191     __ lea($tmp$$Register, $mem$$Address);
 4192     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4193   %}
 4194   ins_pipe( pipe_slow );
 4195 %}
 4196 
 4197 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4198   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4199   effect(TEMP tmp, TEMP ktmp);
 4200   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4201   ins_encode %{
 4202     int vlen_enc = vector_length_encoding(this, $src);
 4203     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4204     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4205     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4206     // Note: Since the scatter instruction partially updates the opmask register used
 4207     // for predication, the mask operand is first copied to a temporary.
 4208     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4209     __ lea($tmp$$Register, $mem$$Address);
 4210     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4211   %}
 4212   ins_pipe( pipe_slow );
 4213 %}
 4214 
 4215 // ====================REPLICATE=======================================
 4216 
 4217 // Replicate byte scalar to be vector
 4218 instruct vReplB_reg(vec dst, rRegI src) %{
 4219   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4220   match(Set dst (Replicate src));
 4221   format %{ "replicateB $dst,$src" %}
 4222   ins_encode %{
 4223     uint vlen = Matcher::vector_length(this);
 4224     if (UseAVX >= 2) {
 4225       int vlen_enc = vector_length_encoding(this);
 4226       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4227         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4228         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4229       } else {
 4230         __ movdl($dst$$XMMRegister, $src$$Register);
 4231         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4232       }
 4233     } else {
 4234       assert(UseAVX < 2, "");
 4235       __ movdl($dst$$XMMRegister, $src$$Register);
 4236       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4237       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4238       if (vlen >= 16) {
 4239         assert(vlen == 16, "");
 4240         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4241       }
 4242     }
 4243   %}
 4244   ins_pipe( pipe_slow );
 4245 %}
 4246 
 4247 instruct ReplB_mem(vec dst, memory mem) %{
 4248   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4249   match(Set dst (Replicate (LoadB mem)));
 4250   format %{ "replicateB $dst,$mem" %}
 4251   ins_encode %{
 4252     int vlen_enc = vector_length_encoding(this);
 4253     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4254   %}
 4255   ins_pipe( pipe_slow );
 4256 %}
 4257 
 4258 // ====================ReplicateS=======================================
 4259 
 4260 instruct vReplS_reg(vec dst, rRegI src) %{
 4261   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4262   match(Set dst (Replicate src));
 4263   format %{ "replicateS $dst,$src" %}
 4264   ins_encode %{
 4265     uint vlen = Matcher::vector_length(this);
 4266     int vlen_enc = vector_length_encoding(this);
 4267     if (UseAVX >= 2) {
 4268       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4269         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4270         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4271       } else {
 4272         __ movdl($dst$$XMMRegister, $src$$Register);
 4273         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4274       }
 4275     } else {
 4276       assert(UseAVX < 2, "");
 4277       __ movdl($dst$$XMMRegister, $src$$Register);
 4278       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4279       if (vlen >= 8) {
 4280         assert(vlen == 8, "");
 4281         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4282       }
 4283     }
 4284   %}
 4285   ins_pipe( pipe_slow );
 4286 %}
 4287 
 4288 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4289   match(Set dst (Replicate con));
 4290   effect(TEMP rtmp);
 4291   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4292   ins_encode %{
 4293     int vlen_enc = vector_length_encoding(this);
 4294     BasicType bt = Matcher::vector_element_basic_type(this);
 4295     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4296     __ movl($rtmp$$Register, $con$$constant);
 4297     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4298   %}
 4299   ins_pipe( pipe_slow );
 4300 %}
 4301 
 4302 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4303   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4304   match(Set dst (Replicate src));
 4305   effect(TEMP rtmp);
 4306   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4307   ins_encode %{
 4308     int vlen_enc = vector_length_encoding(this);
 4309     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4310     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct ReplS_mem(vec dst, memory mem) %{
 4316   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4317   match(Set dst (Replicate (LoadS mem)));
 4318   format %{ "replicateS $dst,$mem" %}
 4319   ins_encode %{
 4320     int vlen_enc = vector_length_encoding(this);
 4321     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4322   %}
 4323   ins_pipe( pipe_slow );
 4324 %}
 4325 
 4326 // ====================ReplicateI=======================================
 4327 
 4328 instruct ReplI_reg(vec dst, rRegI src) %{
 4329   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4330   match(Set dst (Replicate src));
 4331   format %{ "replicateI $dst,$src" %}
 4332   ins_encode %{
 4333     uint vlen = Matcher::vector_length(this);
 4334     int vlen_enc = vector_length_encoding(this);
 4335     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4336       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4337     } else if (VM_Version::supports_avx2()) {
 4338       __ movdl($dst$$XMMRegister, $src$$Register);
 4339       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4340     } else {
 4341       __ movdl($dst$$XMMRegister, $src$$Register);
 4342       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4343     }
 4344   %}
 4345   ins_pipe( pipe_slow );
 4346 %}
 4347 
 4348 instruct ReplI_mem(vec dst, memory mem) %{
 4349   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4350   match(Set dst (Replicate (LoadI mem)));
 4351   format %{ "replicateI $dst,$mem" %}
 4352   ins_encode %{
 4353     int vlen_enc = vector_length_encoding(this);
 4354     if (VM_Version::supports_avx2()) {
 4355       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4356     } else if (VM_Version::supports_avx()) {
 4357       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4358     } else {
 4359       __ movdl($dst$$XMMRegister, $mem$$Address);
 4360       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4361     }
 4362   %}
 4363   ins_pipe( pipe_slow );
 4364 %}
 4365 
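      // Replicate byte/short/int scalar immediate to be vector by loading from const table.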
 4366 instruct ReplI_imm(vec dst, immI con) %{
 4367   predicate(Matcher::is_non_long_integral_vector(n));
 4368   match(Set dst (Replicate con));
 4369   format %{ "replicateI $dst,$con" %}
 4370   ins_encode %{
 4371     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4372                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4373                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4374     BasicType bt = Matcher::vector_element_basic_type(this);
 4375     int vlen = Matcher::vector_length_in_bytes(this);
 4376     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4377   %}
 4378   ins_pipe( pipe_slow );
 4379 %}
 4380 
 4381 // Replicate scalar zero to be vector
 4382 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4383   predicate(Matcher::is_non_long_integral_vector(n));
 4384   match(Set dst (Replicate zero));
 4385   format %{ "replicateI $dst,$zero" %}
 4386   ins_encode %{
 4387     int vlen_enc = vector_length_encoding(this);
 4388     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4389       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4390     } else {
 4391       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4392     }
 4393   %}
 4394   ins_pipe( fpu_reg_reg );
 4395 %}
 4396 
 4397 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4398   predicate(Matcher::is_non_long_integral_vector(n));
 4399   match(Set dst (Replicate con));
 4400   format %{ "vallones $dst" %}
 4401   ins_encode %{
 4402     int vector_len = vector_length_encoding(this);
 4403     __ vallones($dst$$XMMRegister, vector_len);
 4404   %}
 4405   ins_pipe( pipe_slow );
 4406 %}
 4407 
 4408 // ====================ReplicateL=======================================
 4409 
 4410 // Replicate long (8 byte) scalar to be vector
 4411 instruct ReplL_reg(vec dst, rRegL src) %{
 4412   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4413   match(Set dst (Replicate src));
 4414   format %{ "replicateL $dst,$src" %}
 4415   ins_encode %{
 4416     int vlen = Matcher::vector_length(this);
 4417     int vlen_enc = vector_length_encoding(this);
 4418     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4419       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4420     } else if (VM_Version::supports_avx2()) {
 4421       __ movdq($dst$$XMMRegister, $src$$Register);
 4422       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4423     } else {
 4424       __ movdq($dst$$XMMRegister, $src$$Register);
 4425       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4426     }
 4427   %}
 4428   ins_pipe( pipe_slow );
 4429 %}
 4430 
 4431 instruct ReplL_mem(vec dst, memory mem) %{
 4432   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4433   match(Set dst (Replicate (LoadL mem)));
 4434   format %{ "replicateL $dst,$mem" %}
 4435   ins_encode %{
 4436     int vlen_enc = vector_length_encoding(this);
 4437     if (VM_Version::supports_avx2()) {
 4438       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4439     } else if (VM_Version::supports_sse3()) {
 4440       __ movddup($dst$$XMMRegister, $mem$$Address);
 4441     } else {
 4442       __ movq($dst$$XMMRegister, $mem$$Address);
 4443       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4444     }
 4445   %}
 4446   ins_pipe( pipe_slow );
 4447 %}
 4448 
 4449 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4450 instruct ReplL_imm(vec dst, immL con) %{
 4451   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4452   match(Set dst (Replicate con));
 4453   format %{ "replicateL $dst,$con" %}
 4454   ins_encode %{
 4455     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4456     int vlen = Matcher::vector_length_in_bytes(this);
 4457     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 instruct ReplL_zero(vec dst, immL0 zero) %{
 4463   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4464   match(Set dst (Replicate zero));
 4465   format %{ "replicateL $dst,$zero" %}
 4466   ins_encode %{
 4467     int vlen_enc = vector_length_encoding(this);
 4468     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4469       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4470     } else {
 4471       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4472     }
 4473   %}
 4474   ins_pipe( fpu_reg_reg );
 4475 %}
 4476 
 4477 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4478   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4479   match(Set dst (Replicate con));
 4480   format %{ "vallones $dst" %}
 4481   ins_encode %{
 4482     int vector_len = vector_length_encoding(this);
 4483     __ vallones($dst$$XMMRegister, vector_len);
 4484   %}
 4485   ins_pipe( pipe_slow );
 4486 %}
 4487 
 4488 // ====================ReplicateF=======================================
 4489 
 4490 instruct vReplF_reg(vec dst, vlRegF src) %{
 4491   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4492   match(Set dst (Replicate src));
 4493   format %{ "replicateF $dst,$src" %}
 4494   ins_encode %{
 4495     uint vlen = Matcher::vector_length(this);
 4496     int vlen_enc = vector_length_encoding(this);
 4497     if (vlen <= 4) {
 4498       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4499     } else if (VM_Version::supports_avx2()) {
 4500       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4501     } else {
 4502       assert(vlen == 8, "sanity");
 4503       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4504       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4505     }
 4506   %}
 4507   ins_pipe( pipe_slow );
 4508 %}
 4509 
 4510 instruct ReplF_reg(vec dst, vlRegF src) %{
 4511   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4512   match(Set dst (Replicate src));
 4513   format %{ "replicateF $dst,$src" %}
 4514   ins_encode %{
 4515     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4516   %}
 4517   ins_pipe( pipe_slow );
 4518 %}
 4519 
 4520 instruct ReplF_mem(vec dst, memory mem) %{
 4521   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4522   match(Set dst (Replicate (LoadF mem)));
 4523   format %{ "replicateF $dst,$mem" %}
 4524   ins_encode %{
 4525     int vlen_enc = vector_length_encoding(this);
 4526     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4527   %}
 4528   ins_pipe( pipe_slow );
 4529 %}
 4530 
 4531 // Replicate float scalar immediate to be vector by loading from const table.
 4532 instruct ReplF_imm(vec dst, immF con) %{
 4533   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4534   match(Set dst (Replicate con));
 4535   format %{ "replicateF $dst,$con" %}
 4536   ins_encode %{
 4537     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4538                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4539     int vlen = Matcher::vector_length_in_bytes(this);
 4540     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4541   %}
 4542   ins_pipe( pipe_slow );
 4543 %}
 4544 
 4545 instruct ReplF_zero(vec dst, immF0 zero) %{
 4546   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4547   match(Set dst (Replicate zero));
 4548   format %{ "replicateF $dst,$zero" %}
 4549   ins_encode %{
 4550     int vlen_enc = vector_length_encoding(this);
 4551     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4552       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4553     } else {
 4554       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4555     }
 4556   %}
 4557   ins_pipe( fpu_reg_reg );
 4558 %}
 4559 
 4560 // ====================ReplicateD=======================================
 4561 
 4562 // Replicate double (8 bytes) scalar to be vector
 4563 instruct vReplD_reg(vec dst, vlRegD src) %{
 4564   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4565   match(Set dst (Replicate src));
 4566   format %{ "replicateD $dst,$src" %}
 4567   ins_encode %{
 4568     uint vlen = Matcher::vector_length(this);
 4569     int vlen_enc = vector_length_encoding(this);
 4570     if (vlen <= 2) {
 4571       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4572     } else if (VM_Version::supports_avx2()) {
 4573       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4574     } else {
 4575       assert(vlen == 4, "sanity");
 4576       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4577       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4578     }
 4579   %}
 4580   ins_pipe( pipe_slow );
 4581 %}
 4582 
 4583 instruct ReplD_reg(vec dst, vlRegD src) %{
 4584   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4585   match(Set dst (Replicate src));
 4586   format %{ "replicateD $dst,$src" %}
 4587   ins_encode %{
 4588     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4589   %}
 4590   ins_pipe( pipe_slow );
 4591 %}
 4592 
 4593 instruct ReplD_mem(vec dst, memory mem) %{
 4594   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4595   match(Set dst (Replicate (LoadD mem)));
 4596   format %{ "replicateD $dst,$mem" %}
 4597   ins_encode %{
 4598     if (Matcher::vector_length(this) >= 4) {
 4599       int vlen_enc = vector_length_encoding(this);
 4600       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4601     } else {
 4602       __ movddup($dst$$XMMRegister, $mem$$Address);
 4603     }
 4604   %}
 4605   ins_pipe( pipe_slow );
 4606 %}
 4607 
 4608 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4609 instruct ReplD_imm(vec dst, immD con) %{
 4610   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4611   match(Set dst (Replicate con));
 4612   format %{ "replicateD $dst,$con" %}
 4613   ins_encode %{
 4614     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4615     int vlen = Matcher::vector_length_in_bytes(this);
 4616     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4617   %}
 4618   ins_pipe( pipe_slow );
 4619 %}
 4620 
 4621 instruct ReplD_zero(vec dst, immD0 zero) %{
 4622   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4623   match(Set dst (Replicate zero));
 4624   format %{ "replicateD $dst,$zero" %}
 4625   ins_encode %{
 4626     int vlen_enc = vector_length_encoding(this);
 4627     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4628       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4629     } else {
 4630       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4631     }
 4632   %}
 4633   ins_pipe( fpu_reg_reg );
 4634 %}
 4635 
 4636 // ====================VECTOR INSERT=======================================
 4637 
 4638 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4639   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4640   match(Set dst (VectorInsert (Binary dst val) idx));
 4641   format %{ "vector_insert $dst,$val,$idx" %}
 4642   ins_encode %{
 4643     assert(UseSSE >= 4, "required");
 4644     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4645 
 4646     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4647 
 4648     assert(is_integral_type(elem_bt), "");
 4649     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4650 
 4651     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4652   %}
 4653   ins_pipe( pipe_slow );
 4654 %}
 4655 
 4656 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4657   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4658   match(Set dst (VectorInsert (Binary src val) idx));
 4659   effect(TEMP vtmp);
 4660   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4661   ins_encode %{
 4662     int vlen_enc = Assembler::AVX_256bit;
 4663     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4664     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4665     int log2epr = log2(elem_per_lane);
 4666 
 4667     assert(is_integral_type(elem_bt), "sanity");
 4668     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4669 
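          // Split the element index into a position within a 128-bit lane (x_idx) and the lane number (y_idx).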
 4670     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4671     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4672     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4673     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4674     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4675   %}
 4676   ins_pipe( pipe_slow );
 4677 %}
 4678 
 4679 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4680   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4681   match(Set dst (VectorInsert (Binary src val) idx));
 4682   effect(TEMP vtmp);
 4683   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4684   ins_encode %{
 4685     assert(UseAVX > 2, "sanity");
 4686 
 4687     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4688     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4689     int log2epr = log2(elem_per_lane);
 4690 
 4691     assert(is_integral_type(elem_bt), "");
 4692     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4693 
 4694     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4695     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4696     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4697     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4698     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4699   %}
 4700   ins_pipe( pipe_slow );
 4701 %}
 4702 
 4703 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4704   predicate(Matcher::vector_length(n) == 2);
 4705   match(Set dst (VectorInsert (Binary dst val) idx));
 4706   format %{ "vector_insert $dst,$val,$idx" %}
 4707   ins_encode %{
 4708     assert(UseSSE >= 4, "required");
 4709     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4710     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4711 
 4712     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4713   %}
 4714   ins_pipe( pipe_slow );
 4715 %}
 4716 
 4717 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4718   predicate(Matcher::vector_length(n) == 4);
 4719   match(Set dst (VectorInsert (Binary src val) idx));
 4720   effect(TEMP vtmp);
 4721   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4722   ins_encode %{
 4723     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4724     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4725 
 4726     uint x_idx = $idx$$constant & right_n_bits(1);
 4727     uint y_idx = ($idx$$constant >> 1) & 1;
 4728     int vlen_enc = Assembler::AVX_256bit;
 4729     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4730     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4731     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4732   %}
 4733   ins_pipe( pipe_slow );
 4734 %}
 4735 
 4736 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4737   predicate(Matcher::vector_length(n) == 8);
 4738   match(Set dst (VectorInsert (Binary src val) idx));
 4739   effect(TEMP vtmp);
 4740   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4741   ins_encode %{
 4742     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4743     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4744 
 4745     uint x_idx = $idx$$constant & right_n_bits(1);
 4746     uint y_idx = ($idx$$constant >> 1) & 3;
 4747     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4748     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4749     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4750   %}
 4751   ins_pipe( pipe_slow );
 4752 %}
 4753 
 4754 instruct insertF(vec dst, regF val, immU8 idx) %{
 4755   predicate(Matcher::vector_length(n) < 8);
 4756   match(Set dst (VectorInsert (Binary dst val) idx));
 4757   format %{ "vector_insert $dst,$val,$idx" %}
 4758   ins_encode %{
 4759     assert(UseSSE >= 4, "sanity");
 4760 
 4761     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4762     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4763 
 4764     uint x_idx = $idx$$constant & right_n_bits(2);
 4765     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4766   %}
 4767   ins_pipe( pipe_slow );
 4768 %}
 4769 
 4770 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4771   predicate(Matcher::vector_length(n) >= 8);
 4772   match(Set dst (VectorInsert (Binary src val) idx));
 4773   effect(TEMP vtmp);
 4774   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4775   ins_encode %{
 4776     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4777     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4778 
 4779     int vlen = Matcher::vector_length(this);
 4780     uint x_idx = $idx$$constant & right_n_bits(2);
 4781     if (vlen == 8) {
 4782       uint y_idx = ($idx$$constant >> 2) & 1;
 4783       int vlen_enc = Assembler::AVX_256bit;
 4784       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4785       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4786       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4787     } else {
 4788       assert(vlen == 16, "sanity");
 4789       uint y_idx = ($idx$$constant >> 2) & 3;
 4790       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4791       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4792       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4793     }
 4794   %}
 4795   ins_pipe( pipe_slow );
 4796 %}
 4797 
 4798 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4799   predicate(Matcher::vector_length(n) == 2);
 4800   match(Set dst (VectorInsert (Binary dst val) idx));
 4801   effect(TEMP tmp);
 4802   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4803   ins_encode %{
 4804     assert(UseSSE >= 4, "sanity");
 4805     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4806     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4807 
 4808     __ movq($tmp$$Register, $val$$XMMRegister);
 4809     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4810   %}
 4811   ins_pipe( pipe_slow );
 4812 %}
 4813 
 4814 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4815   predicate(Matcher::vector_length(n) == 4);
 4816   match(Set dst (VectorInsert (Binary src val) idx));
 4817   effect(TEMP vtmp, TEMP tmp);
 4818   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4819   ins_encode %{
 4820     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4821     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4822 
 4823     uint x_idx = $idx$$constant & right_n_bits(1);
 4824     uint y_idx = ($idx$$constant >> 1) & 1;
 4825     int vlen_enc = Assembler::AVX_256bit;
 4826     __ movq($tmp$$Register, $val$$XMMRegister);
 4827     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4828     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4829     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4830   %}
 4831   ins_pipe( pipe_slow );
 4832 %}
 4833 
 4834 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4835   predicate(Matcher::vector_length(n) == 8);
 4836   match(Set dst (VectorInsert (Binary src val) idx));
 4837   effect(TEMP tmp, TEMP vtmp);
 4838   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4839   ins_encode %{
 4840     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4841     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4842 
 4843     uint x_idx = $idx$$constant & right_n_bits(1);
 4844     uint y_idx = ($idx$$constant >> 1) & 3;
 4845     __ movq($tmp$$Register, $val$$XMMRegister);
 4846     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4847     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4848     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4849   %}
 4850   ins_pipe( pipe_slow );
 4851 %}
 4852 
 4853 // ====================REDUCTION ARITHMETIC=======================================
 4854 
 4855 // =======================Int Reduction==========================================
 4856 
 4857 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4858   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4859   match(Set dst (AddReductionVI src1 src2));
 4860   match(Set dst (MulReductionVI src1 src2));
 4861   match(Set dst (AndReductionV  src1 src2));
 4862   match(Set dst ( OrReductionV  src1 src2));
 4863   match(Set dst (XorReductionV  src1 src2));
 4864   match(Set dst (MinReductionV  src1 src2));
 4865   match(Set dst (MaxReductionV  src1 src2));
 4866   effect(TEMP vtmp1, TEMP vtmp2);
 4867   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4868   ins_encode %{
 4869     int opcode = this->ideal_Opcode();
 4870     int vlen = Matcher::vector_length(this, $src2);
 4871     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4872   %}
 4873   ins_pipe( pipe_slow );
 4874 %}
 4875 
 4876 // =======================Long Reduction==========================================
 4877 
 4878 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4879   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4880   match(Set dst (AddReductionVL src1 src2));
 4881   match(Set dst (MulReductionVL src1 src2));
 4882   match(Set dst (AndReductionV  src1 src2));
 4883   match(Set dst ( OrReductionV  src1 src2));
 4884   match(Set dst (XorReductionV  src1 src2));
 4885   match(Set dst (MinReductionV  src1 src2));
 4886   match(Set dst (MaxReductionV  src1 src2));
 4887   effect(TEMP vtmp1, TEMP vtmp2);
 4888   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4889   ins_encode %{
 4890     int opcode = this->ideal_Opcode();
 4891     int vlen = Matcher::vector_length(this, $src2);
 4892     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4893   %}
 4894   ins_pipe( pipe_slow );
 4895 %}
 4896 
 4897 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4898   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4899   match(Set dst (AddReductionVL src1 src2));
 4900   match(Set dst (MulReductionVL src1 src2));
 4901   match(Set dst (AndReductionV  src1 src2));
 4902   match(Set dst ( OrReductionV  src1 src2));
 4903   match(Set dst (XorReductionV  src1 src2));
 4904   match(Set dst (MinReductionV  src1 src2));
 4905   match(Set dst (MaxReductionV  src1 src2));
 4906   effect(TEMP vtmp1, TEMP vtmp2);
 4907   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4908   ins_encode %{
 4909     int opcode = this->ideal_Opcode();
 4910     int vlen = Matcher::vector_length(this, $src2);
 4911     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4912   %}
 4913   ins_pipe( pipe_slow );
 4914 %}
 4915 
 4916 // =======================Float Reduction==========================================
 4917 
 4918 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4919   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 4920   match(Set dst (AddReductionVF dst src));
 4921   match(Set dst (MulReductionVF dst src));
 4922   effect(TEMP dst, TEMP vtmp);
 4923   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4924   ins_encode %{
 4925     int opcode = this->ideal_Opcode();
 4926     int vlen = Matcher::vector_length(this, $src);
 4927     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4928   %}
 4929   ins_pipe( pipe_slow );
 4930 %}
 4931 
 4932 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4933   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 4934   match(Set dst (AddReductionVF dst src));
 4935   match(Set dst (MulReductionVF dst src));
 4936   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4937   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4938   ins_encode %{
 4939     int opcode = this->ideal_Opcode();
 4940     int vlen = Matcher::vector_length(this, $src);
 4941     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4942   %}
 4943   ins_pipe( pipe_slow );
 4944 %}
 4945 
 4946 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4947   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 4948   match(Set dst (AddReductionVF dst src));
 4949   match(Set dst (MulReductionVF dst src));
 4950   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4951   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4952   ins_encode %{
 4953     int opcode = this->ideal_Opcode();
 4954     int vlen = Matcher::vector_length(this, $src);
 4955     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4956   %}
 4957   ins_pipe( pipe_slow );
 4958 %}
 4959 
 4960 
 4961 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 4962   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4963   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4964   // src1 contains reduction identity
 4965   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 4966   match(Set dst (AddReductionVF src1 src2));
 4967   match(Set dst (MulReductionVF src1 src2));
 4968   effect(TEMP dst);
 4969   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 4970   ins_encode %{
 4971     int opcode = this->ideal_Opcode();
 4972     int vlen = Matcher::vector_length(this, $src2);
 4973     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 4974   %}
 4975   ins_pipe( pipe_slow );
 4976 %}
 4977 
 4978 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 4979   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4980   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4981   // src1 contains reduction identity
 4982   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 4983   match(Set dst (AddReductionVF src1 src2));
 4984   match(Set dst (MulReductionVF src1 src2));
 4985   effect(TEMP dst, TEMP vtmp);
 4986   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 4987   ins_encode %{
 4988     int opcode = this->ideal_Opcode();
 4989     int vlen = Matcher::vector_length(this, $src2);
 4990     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 4991   %}
 4992   ins_pipe( pipe_slow );
 4993 %}
 4994 
 4995 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 4996   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4997   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4998   // src1 contains reduction identity
 4999   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5000   match(Set dst (AddReductionVF src1 src2));
 5001   match(Set dst (MulReductionVF src1 src2));
 5002   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5003   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5004   ins_encode %{
 5005     int opcode = this->ideal_Opcode();
 5006     int vlen = Matcher::vector_length(this, $src2);
 5007     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5008   %}
 5009   ins_pipe( pipe_slow );
 5010 %}
 5011 
 5012 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5013   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5014   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5015   // src1 contains reduction identity
 5016   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5017   match(Set dst (AddReductionVF src1 src2));
 5018   match(Set dst (MulReductionVF src1 src2));
 5019   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5020   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5021   ins_encode %{
 5022     int opcode = this->ideal_Opcode();
 5023     int vlen = Matcher::vector_length(this, $src2);
 5024     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5025   %}
 5026   ins_pipe( pipe_slow );
 5027 %}
 5028 
 5029 // =======================Double Reduction==========================================
 5030 
 5031 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5032   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5033   match(Set dst (AddReductionVD dst src));
 5034   match(Set dst (MulReductionVD dst src));
 5035   effect(TEMP dst, TEMP vtmp);
 5036   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5037   ins_encode %{
 5038     int opcode = this->ideal_Opcode();
 5039     int vlen = Matcher::vector_length(this, $src);
 5040     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5041   %}
 5042   ins_pipe( pipe_slow );
 5043 %}
 5044 
 5045 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5046   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5047   match(Set dst (AddReductionVD dst src));
 5048   match(Set dst (MulReductionVD dst src));
 5049   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5050   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5051   ins_encode %{
 5052     int opcode = this->ideal_Opcode();
 5053     int vlen = Matcher::vector_length(this, $src);
 5054     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5055   %}
 5056   ins_pipe( pipe_slow );
 5057 %}
 5058 
 5059 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5060   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5061   match(Set dst (AddReductionVD dst src));
 5062   match(Set dst (MulReductionVD dst src));
 5063   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5064   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5065   ins_encode %{
 5066     int opcode = this->ideal_Opcode();
 5067     int vlen = Matcher::vector_length(this, $src);
 5068     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5069   %}
 5070   ins_pipe( pipe_slow );
 5071 %}
 5072 
 5073 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5074   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5075   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5076   // src1 contains reduction identity
 5077   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5078   match(Set dst (AddReductionVD src1 src2));
 5079   match(Set dst (MulReductionVD src1 src2));
 5080   effect(TEMP dst);
 5081   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5082   ins_encode %{
 5083     int opcode = this->ideal_Opcode();
 5084     int vlen = Matcher::vector_length(this, $src2);
 5085     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5086   %}
 5087   ins_pipe( pipe_slow );
 5088 %}
 5089 
 5090 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5091   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5092   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5093   // src1 contains reduction identity
 5094   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5095   match(Set dst (AddReductionVD src1 src2));
 5096   match(Set dst (MulReductionVD src1 src2));
 5097   effect(TEMP dst, TEMP vtmp);
 5098   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5099   ins_encode %{
 5100     int opcode = this->ideal_Opcode();
 5101     int vlen = Matcher::vector_length(this, $src2);
 5102     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5103   %}
 5104   ins_pipe( pipe_slow );
 5105 %}
 5106 
 5107 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5108   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5109   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5110   // src1 contains reduction identity
 5111   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5112   match(Set dst (AddReductionVD src1 src2));
 5113   match(Set dst (MulReductionVD src1 src2));
 5114   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5115   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5116   ins_encode %{
 5117     int opcode = this->ideal_Opcode();
 5118     int vlen = Matcher::vector_length(this, $src2);
 5119     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5120   %}
 5121   ins_pipe( pipe_slow );
 5122 %}
 5123 
 5124 // =======================Byte Reduction==========================================
 5125 
 5126 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5127   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5128   match(Set dst (AddReductionVI src1 src2));
 5129   match(Set dst (AndReductionV  src1 src2));
 5130   match(Set dst ( OrReductionV  src1 src2));
 5131   match(Set dst (XorReductionV  src1 src2));
 5132   match(Set dst (MinReductionV  src1 src2));
 5133   match(Set dst (MaxReductionV  src1 src2));
 5134   effect(TEMP vtmp1, TEMP vtmp2);
 5135   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5136   ins_encode %{
 5137     int opcode = this->ideal_Opcode();
 5138     int vlen = Matcher::vector_length(this, $src2);
 5139     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5140   %}
 5141   ins_pipe( pipe_slow );
 5142 %}
 5143 
 5144 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5145   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5146   match(Set dst (AddReductionVI src1 src2));
 5147   match(Set dst (AndReductionV  src1 src2));
 5148   match(Set dst ( OrReductionV  src1 src2));
 5149   match(Set dst (XorReductionV  src1 src2));
 5150   match(Set dst (MinReductionV  src1 src2));
 5151   match(Set dst (MaxReductionV  src1 src2));
 5152   effect(TEMP vtmp1, TEMP vtmp2);
 5153   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5154   ins_encode %{
 5155     int opcode = this->ideal_Opcode();
 5156     int vlen = Matcher::vector_length(this, $src2);
 5157     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5158   %}
 5159   ins_pipe( pipe_slow );
 5160 %}
 5161 
 5162 // =======================Short Reduction==========================================
 5163 
 5164 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5165   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5166   match(Set dst (AddReductionVI src1 src2));
 5167   match(Set dst (MulReductionVI src1 src2));
 5168   match(Set dst (AndReductionV  src1 src2));
 5169   match(Set dst ( OrReductionV  src1 src2));
 5170   match(Set dst (XorReductionV  src1 src2));
 5171   match(Set dst (MinReductionV  src1 src2));
 5172   match(Set dst (MaxReductionV  src1 src2));
 5173   effect(TEMP vtmp1, TEMP vtmp2);
 5174   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5175   ins_encode %{
 5176     int opcode = this->ideal_Opcode();
 5177     int vlen = Matcher::vector_length(this, $src2);
 5178     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5179   %}
 5180   ins_pipe( pipe_slow );
 5181 %}
 5182 
 5183 // =======================Mul Reduction==========================================
 5184 
 5185 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5186   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5187             Matcher::vector_length(n->in(2)) <= 32); // src2
 5188   match(Set dst (MulReductionVI src1 src2));
 5189   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5190   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5191   ins_encode %{
 5192     int opcode = this->ideal_Opcode();
 5193     int vlen = Matcher::vector_length(this, $src2);
 5194     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5195   %}
 5196   ins_pipe( pipe_slow );
 5197 %}
 5198 
 5199 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5200   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5201             Matcher::vector_length(n->in(2)) == 64); // src2
 5202   match(Set dst (MulReductionVI src1 src2));
 5203   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5204   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5205   ins_encode %{
 5206     int opcode = this->ideal_Opcode();
 5207     int vlen = Matcher::vector_length(this, $src2);
 5208     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5209   %}
 5210   ins_pipe( pipe_slow );
 5211 %}
 5212 
 5213 //--------------------Min/Max Float Reduction --------------------
 5214 // Float Min/Max Reduction
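      // The rules taking an immF/immD src1 require it to be the reduction
      // identity (+Inf for Min, -Inf for Max, see the predicates), so only src2
      // has to be reduced. The *_av variants instead accumulate into $dst.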
 5215 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5216                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5217   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5218             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5219              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5220             Matcher::vector_length(n->in(2)) == 2);
 5221   match(Set dst (MinReductionV src1 src2));
 5222   match(Set dst (MaxReductionV src1 src2));
 5223   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5224   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5225   ins_encode %{
 5226     assert(UseAVX > 0, "sanity");
 5227 
 5228     int opcode = this->ideal_Opcode();
 5229     int vlen = Matcher::vector_length(this, $src2);
 5230     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5231                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5237                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5238   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5239             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5240              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5241             Matcher::vector_length(n->in(2)) >= 4);
 5242   match(Set dst (MinReductionV src1 src2));
 5243   match(Set dst (MaxReductionV src1 src2));
 5244   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5245   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5246   ins_encode %{
 5247     assert(UseAVX > 0, "sanity");
 5248 
 5249     int opcode = this->ideal_Opcode();
 5250     int vlen = Matcher::vector_length(this, $src2);
 5251     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5252                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5253   %}
 5254   ins_pipe( pipe_slow );
 5255 %}
 5256 
 5257 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5258                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5259   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5260             Matcher::vector_length(n->in(2)) == 2);
 5261   match(Set dst (MinReductionV dst src));
 5262   match(Set dst (MaxReductionV dst src));
 5263   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5264   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5265   ins_encode %{
 5266     assert(UseAVX > 0, "sanity");
 5267 
 5268     int opcode = this->ideal_Opcode();
 5269     int vlen = Matcher::vector_length(this, $src);
 5270     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5271                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5272   %}
 5273   ins_pipe( pipe_slow );
 5274 %}
 5275 
 5276 
 5277 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5278                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5279   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5280             Matcher::vector_length(n->in(2)) >= 4);
 5281   match(Set dst (MinReductionV dst src));
 5282   match(Set dst (MaxReductionV dst src));
 5283   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5284   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5285   ins_encode %{
 5286     assert(UseAVX > 0, "sanity");
 5287 
 5288     int opcode = this->ideal_Opcode();
 5289     int vlen = Matcher::vector_length(this, $src);
 5290     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5291                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5292   %}
 5293   ins_pipe( pipe_slow );
 5294 %}
 5295 
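      // The AVX10.2 variants below need fewer temporaries; xnoreg is passed to
      // reduceFloatMinMax/reduceDoubleMinMax for the unused ones.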
 5296 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5297   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5298             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5299              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5300             Matcher::vector_length(n->in(2)) == 2);
 5301   match(Set dst (MinReductionV src1 src2));
 5302   match(Set dst (MaxReductionV src1 src2));
 5303   effect(TEMP dst, TEMP xtmp1);
 5304   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5305   ins_encode %{
 5306     int opcode = this->ideal_Opcode();
 5307     int vlen = Matcher::vector_length(this, $src2);
 5308     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5309                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5310   %}
 5311   ins_pipe( pipe_slow );
 5312 %}
 5313 
 5314 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5315   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5316             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5317              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5318             Matcher::vector_length(n->in(2)) >= 4);
 5319   match(Set dst (MinReductionV src1 src2));
 5320   match(Set dst (MaxReductionV src1 src2));
 5321   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5322   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5323   ins_encode %{
 5324     int opcode = this->ideal_Opcode();
 5325     int vlen = Matcher::vector_length(this, $src2);
 5326     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5327                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5328   %}
 5329   ins_pipe( pipe_slow );
 5330 %}
 5331 
 5332 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5333   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5334             Matcher::vector_length(n->in(2)) == 2);
 5335   match(Set dst (MinReductionV dst src));
 5336   match(Set dst (MaxReductionV dst src));
 5337   effect(TEMP dst, TEMP xtmp1);
 5338   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5339   ins_encode %{
 5340     int opcode = this->ideal_Opcode();
 5341     int vlen = Matcher::vector_length(this, $src);
 5342     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5343                          $xtmp1$$XMMRegister);
 5344   %}
 5345   ins_pipe( pipe_slow );
 5346 %}
 5347 
 5348 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5349   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5350             Matcher::vector_length(n->in(2)) >= 4);
 5351   match(Set dst (MinReductionV dst src));
 5352   match(Set dst (MaxReductionV dst src));
 5353   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5354   format %{ "vector_minmaxF_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5355   ins_encode %{
 5356     int opcode = this->ideal_Opcode();
 5357     int vlen = Matcher::vector_length(this, $src);
 5358     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5359                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5360   %}
 5361   ins_pipe( pipe_slow );
 5362 %}
 5363 
 5364 //--------------------Min/Max Double Reduction --------------------
 5365 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5366                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5367   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5368             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5369              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5370             Matcher::vector_length(n->in(2)) == 2);
 5371   match(Set dst (MinReductionV src1 src2));
 5372   match(Set dst (MaxReductionV src1 src2));
 5373   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5374   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5375   ins_encode %{
 5376     assert(UseAVX > 0, "sanity");
 5377 
 5378     int opcode = this->ideal_Opcode();
 5379     int vlen = Matcher::vector_length(this, $src2);
 5380     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5381                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5382   %}
 5383   ins_pipe( pipe_slow );
 5384 %}
 5385 
 5386 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5387                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5388   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5389             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5390              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5391             Matcher::vector_length(n->in(2)) >= 4);
 5392   match(Set dst (MinReductionV src1 src2));
 5393   match(Set dst (MaxReductionV src1 src2));
 5394   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5395   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5396   ins_encode %{
 5397     assert(UseAVX > 0, "sanity");
 5398 
 5399     int opcode = this->ideal_Opcode();
 5400     int vlen = Matcher::vector_length(this, $src2);
 5401     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5402                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5403   %}
 5404   ins_pipe( pipe_slow );
 5405 %}
 5406 
 5407 
 5408 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5409                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5410   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5411             Matcher::vector_length(n->in(2)) == 2);
 5412   match(Set dst (MinReductionV dst src));
 5413   match(Set dst (MaxReductionV dst src));
 5414   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5415   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5416   ins_encode %{
 5417     assert(UseAVX > 0, "sanity");
 5418 
 5419     int opcode = this->ideal_Opcode();
 5420     int vlen = Matcher::vector_length(this, $src);
 5421     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5422                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5423   %}
 5424   ins_pipe( pipe_slow );
 5425 %}
 5426 
 5427 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5428                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5429   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5430             Matcher::vector_length(n->in(2)) >= 4);
 5431   match(Set dst (MinReductionV dst src));
 5432   match(Set dst (MaxReductionV dst src));
 5433   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5434   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5435   ins_encode %{
 5436     assert(UseAVX > 0, "sanity");
 5437 
 5438     int opcode = this->ideal_Opcode();
 5439     int vlen = Matcher::vector_length(this, $src);
 5440     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5441                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5442   %}
 5443   ins_pipe( pipe_slow );
 5444 %}
 5445 
 5446 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5447   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5448             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5449              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5450             Matcher::vector_length(n->in(2)) == 2);
 5451   match(Set dst (MinReductionV src1 src2));
 5452   match(Set dst (MaxReductionV src1 src2));
 5453   effect(TEMP dst, TEMP xtmp1);
 5454   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5455   ins_encode %{
 5456     int opcode = this->ideal_Opcode();
 5457     int vlen = Matcher::vector_length(this, $src2);
 5458     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5459                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5460   %}
 5461   ins_pipe( pipe_slow );
 5462 %}
 5463 
 5464 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5465   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5466             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5467              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5468             Matcher::vector_length(n->in(2)) >= 4);
 5469   match(Set dst (MinReductionV src1 src2));
 5470   match(Set dst (MaxReductionV src1 src2));
 5471   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5472   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5473   ins_encode %{
 5474     int opcode = this->ideal_Opcode();
 5475     int vlen = Matcher::vector_length(this, $src2);
 5476     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5477                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5478   %}
 5479   ins_pipe( pipe_slow );
 5480 %}
 5481 
 5482 
 5483 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5484   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5485             Matcher::vector_length(n->in(2)) == 2);
 5486   match(Set dst (MinReductionV dst src));
 5487   match(Set dst (MaxReductionV dst src));
 5488   effect(TEMP dst, TEMP xtmp1);
 5489   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5490   ins_encode %{
 5491     int opcode = this->ideal_Opcode();
 5492     int vlen = Matcher::vector_length(this, $src);
 5493     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5494                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5495   %}
 5496   ins_pipe( pipe_slow );
 5497 %}
 5498 
 5499 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5500   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5501             Matcher::vector_length(n->in(2)) >= 4);
 5502   match(Set dst (MinReductionV dst src));
 5503   match(Set dst (MaxReductionV dst src));
 5504   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5505   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5506   ins_encode %{
 5507     int opcode = this->ideal_Opcode();
 5508     int vlen = Matcher::vector_length(this, $src);
 5509     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5510                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5511   %}
 5512   ins_pipe( pipe_slow );
 5513 %}
 5514 
 5515 // ====================VECTOR ARITHMETIC=======================================
 5516 
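      // Most operations below come in three forms: a two-operand SSE rule that
      // accumulates into $dst (UseAVX == 0), a three-operand AVX register rule,
      // and an AVX rule that folds a vector load (LoadVector) into the operation.
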
 5517 // --------------------------------- ADD --------------------------------------
 5518 
 5519 // Bytes vector add
 5520 instruct vaddB(vec dst, vec src) %{
 5521   predicate(UseAVX == 0);
 5522   match(Set dst (AddVB dst src));
 5523   format %{ "paddb   $dst,$src\t! add packedB" %}
 5524   ins_encode %{
 5525     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5526   %}
 5527   ins_pipe( pipe_slow );
 5528 %}
 5529 
 5530 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5531   predicate(UseAVX > 0);
 5532   match(Set dst (AddVB src1 src2));
 5533   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5534   ins_encode %{
 5535     int vlen_enc = vector_length_encoding(this);
 5536     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5537   %}
 5538   ins_pipe( pipe_slow );
 5539 %}
 5540 
 5541 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5542   predicate((UseAVX > 0) &&
 5543             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5544   match(Set dst (AddVB src (LoadVector mem)));
 5545   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5546   ins_encode %{
 5547     int vlen_enc = vector_length_encoding(this);
 5548     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5549   %}
 5550   ins_pipe( pipe_slow );
 5551 %}
 5552 
 5553 // Shorts/Chars vector add
 5554 instruct vaddS(vec dst, vec src) %{
 5555   predicate(UseAVX == 0);
 5556   match(Set dst (AddVS dst src));
 5557   format %{ "paddw   $dst,$src\t! add packedS" %}
 5558   ins_encode %{
 5559     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5560   %}
 5561   ins_pipe( pipe_slow );
 5562 %}
 5563 
 5564 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5565   predicate(UseAVX > 0);
 5566   match(Set dst (AddVS src1 src2));
 5567   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5568   ins_encode %{
 5569     int vlen_enc = vector_length_encoding(this);
 5570     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5571   %}
 5572   ins_pipe( pipe_slow );
 5573 %}
 5574 
 5575 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5576   predicate((UseAVX > 0) &&
 5577             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5578   match(Set dst (AddVS src (LoadVector mem)));
 5579   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5580   ins_encode %{
 5581     int vlen_enc = vector_length_encoding(this);
 5582     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5583   %}
 5584   ins_pipe( pipe_slow );
 5585 %}
 5586 
 5587 // Integers vector add
 5588 instruct vaddI(vec dst, vec src) %{
 5589   predicate(UseAVX == 0);
 5590   match(Set dst (AddVI dst src));
 5591   format %{ "paddd   $dst,$src\t! add packedI" %}
 5592   ins_encode %{
 5593     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5594   %}
 5595   ins_pipe( pipe_slow );
 5596 %}
 5597 
 5598 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5599   predicate(UseAVX > 0);
 5600   match(Set dst (AddVI src1 src2));
 5601   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5602   ins_encode %{
 5603     int vlen_enc = vector_length_encoding(this);
 5604     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5605   %}
 5606   ins_pipe( pipe_slow );
 5607 %}
 5608 
 5609 
 5610 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5611   predicate((UseAVX > 0) &&
 5612             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5613   match(Set dst (AddVI src (LoadVector mem)));
 5614   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5615   ins_encode %{
 5616     int vlen_enc = vector_length_encoding(this);
 5617     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5618   %}
 5619   ins_pipe( pipe_slow );
 5620 %}
 5621 
 5622 // Longs vector add
 5623 instruct vaddL(vec dst, vec src) %{
 5624   predicate(UseAVX == 0);
 5625   match(Set dst (AddVL dst src));
 5626   format %{ "paddq   $dst,$src\t! add packedL" %}
 5627   ins_encode %{
 5628     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5629   %}
 5630   ins_pipe( pipe_slow );
 5631 %}
 5632 
 5633 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5634   predicate(UseAVX > 0);
 5635   match(Set dst (AddVL src1 src2));
 5636   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5637   ins_encode %{
 5638     int vlen_enc = vector_length_encoding(this);
 5639     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5640   %}
 5641   ins_pipe( pipe_slow );
 5642 %}
 5643 
 5644 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5645   predicate((UseAVX > 0) &&
 5646             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5647   match(Set dst (AddVL src (LoadVector mem)));
 5648   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5649   ins_encode %{
 5650     int vlen_enc = vector_length_encoding(this);
 5651     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5652   %}
 5653   ins_pipe( pipe_slow );
 5654 %}
 5655 
 5656 // Floats vector add
 5657 instruct vaddF(vec dst, vec src) %{
 5658   predicate(UseAVX == 0);
 5659   match(Set dst (AddVF dst src));
 5660   format %{ "addps   $dst,$src\t! add packedF" %}
 5661   ins_encode %{
 5662     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5663   %}
 5664   ins_pipe( pipe_slow );
 5665 %}
 5666 
 5667 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5668   predicate(UseAVX > 0);
 5669   match(Set dst (AddVF src1 src2));
 5670   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5671   ins_encode %{
 5672     int vlen_enc = vector_length_encoding(this);
 5673     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5674   %}
 5675   ins_pipe( pipe_slow );
 5676 %}
 5677 
 5678 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5679   predicate((UseAVX > 0) &&
 5680             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5681   match(Set dst (AddVF src (LoadVector mem)));
 5682   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5683   ins_encode %{
 5684     int vlen_enc = vector_length_encoding(this);
 5685     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5686   %}
 5687   ins_pipe( pipe_slow );
 5688 %}
 5689 
 5690 // Doubles vector add
 5691 instruct vaddD(vec dst, vec src) %{
 5692   predicate(UseAVX == 0);
 5693   match(Set dst (AddVD dst src));
 5694   format %{ "addpd   $dst,$src\t! add packedD" %}
 5695   ins_encode %{
 5696     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5697   %}
 5698   ins_pipe( pipe_slow );
 5699 %}
 5700 
 5701 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5702   predicate(UseAVX > 0);
 5703   match(Set dst (AddVD src1 src2));
 5704   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5705   ins_encode %{
 5706     int vlen_enc = vector_length_encoding(this);
 5707     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5708   %}
 5709   ins_pipe( pipe_slow );
 5710 %}
 5711 
 5712 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5713   predicate((UseAVX > 0) &&
 5714             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5715   match(Set dst (AddVD src (LoadVector mem)));
 5716   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5717   ins_encode %{
 5718     int vlen_enc = vector_length_encoding(this);
 5719     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5720   %}
 5721   ins_pipe( pipe_slow );
 5722 %}
 5723 
 5724 // --------------------------------- SUB --------------------------------------
 5725 
 5726 // Bytes vector sub
 5727 instruct vsubB(vec dst, vec src) %{
 5728   predicate(UseAVX == 0);
 5729   match(Set dst (SubVB dst src));
 5730   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5731   ins_encode %{
 5732     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5733   %}
 5734   ins_pipe( pipe_slow );
 5735 %}
 5736 
 5737 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5738   predicate(UseAVX > 0);
 5739   match(Set dst (SubVB src1 src2));
 5740   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5741   ins_encode %{
 5742     int vlen_enc = vector_length_encoding(this);
 5743     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5744   %}
 5745   ins_pipe( pipe_slow );
 5746 %}
 5747 
 5748 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5749   predicate((UseAVX > 0) &&
 5750             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5751   match(Set dst (SubVB src (LoadVector mem)));
 5752   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5753   ins_encode %{
 5754     int vlen_enc = vector_length_encoding(this);
 5755     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5756   %}
 5757   ins_pipe( pipe_slow );
 5758 %}
 5759 
 5760 // Shorts/Chars vector sub
 5761 instruct vsubS(vec dst, vec src) %{
 5762   predicate(UseAVX == 0);
 5763   match(Set dst (SubVS dst src));
 5764   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5765   ins_encode %{
 5766     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5767   %}
 5768   ins_pipe( pipe_slow );
 5769 %}
 5770 
 5771 
 5772 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5773   predicate(UseAVX > 0);
 5774   match(Set dst (SubVS src1 src2));
 5775   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5776   ins_encode %{
 5777     int vlen_enc = vector_length_encoding(this);
 5778     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5779   %}
 5780   ins_pipe( pipe_slow );
 5781 %}
 5782 
 5783 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5784   predicate((UseAVX > 0) &&
 5785             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5786   match(Set dst (SubVS src (LoadVector mem)));
 5787   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5788   ins_encode %{
 5789     int vlen_enc = vector_length_encoding(this);
 5790     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5791   %}
 5792   ins_pipe( pipe_slow );
 5793 %}
 5794 
 5795 // Integers vector sub
 5796 instruct vsubI(vec dst, vec src) %{
 5797   predicate(UseAVX == 0);
 5798   match(Set dst (SubVI dst src));
 5799   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5800   ins_encode %{
 5801     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5802   %}
 5803   ins_pipe( pipe_slow );
 5804 %}
 5805 
 5806 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5807   predicate(UseAVX > 0);
 5808   match(Set dst (SubVI src1 src2));
 5809   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5810   ins_encode %{
 5811     int vlen_enc = vector_length_encoding(this);
 5812     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5813   %}
 5814   ins_pipe( pipe_slow );
 5815 %}
 5816 
 5817 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5818   predicate((UseAVX > 0) &&
 5819             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5820   match(Set dst (SubVI src (LoadVector mem)));
 5821   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5822   ins_encode %{
 5823     int vlen_enc = vector_length_encoding(this);
 5824     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5825   %}
 5826   ins_pipe( pipe_slow );
 5827 %}
 5828 
 5829 // Longs vector sub
 5830 instruct vsubL(vec dst, vec src) %{
 5831   predicate(UseAVX == 0);
 5832   match(Set dst (SubVL dst src));
 5833   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5834   ins_encode %{
 5835     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5836   %}
 5837   ins_pipe( pipe_slow );
 5838 %}
 5839 
 5840 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5841   predicate(UseAVX > 0);
 5842   match(Set dst (SubVL src1 src2));
 5843   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5844   ins_encode %{
 5845     int vlen_enc = vector_length_encoding(this);
 5846     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5847   %}
 5848   ins_pipe( pipe_slow );
 5849 %}
 5850 
 5851 
 5852 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5853   predicate((UseAVX > 0) &&
 5854             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5855   match(Set dst (SubVL src (LoadVector mem)));
 5856   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5857   ins_encode %{
 5858     int vlen_enc = vector_length_encoding(this);
 5859     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5860   %}
 5861   ins_pipe( pipe_slow );
 5862 %}
 5863 
 5864 // Floats vector sub
 5865 instruct vsubF(vec dst, vec src) %{
 5866   predicate(UseAVX == 0);
 5867   match(Set dst (SubVF dst src));
 5868   format %{ "subps   $dst,$src\t! sub packedF" %}
 5869   ins_encode %{
 5870     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5871   %}
 5872   ins_pipe( pipe_slow );
 5873 %}
 5874 
 5875 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5876   predicate(UseAVX > 0);
 5877   match(Set dst (SubVF src1 src2));
 5878   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5879   ins_encode %{
 5880     int vlen_enc = vector_length_encoding(this);
 5881     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5882   %}
 5883   ins_pipe( pipe_slow );
 5884 %}
 5885 
 5886 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5887   predicate((UseAVX > 0) &&
 5888             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5889   match(Set dst (SubVF src (LoadVector mem)));
 5890   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5891   ins_encode %{
 5892     int vlen_enc = vector_length_encoding(this);
 5893     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5894   %}
 5895   ins_pipe( pipe_slow );
 5896 %}
 5897 
 5898 // Doubles vector sub
 5899 instruct vsubD(vec dst, vec src) %{
 5900   predicate(UseAVX == 0);
 5901   match(Set dst (SubVD dst src));
 5902   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5903   ins_encode %{
 5904     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5905   %}
 5906   ins_pipe( pipe_slow );
 5907 %}
 5908 
 5909 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5910   predicate(UseAVX > 0);
 5911   match(Set dst (SubVD src1 src2));
 5912   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5913   ins_encode %{
 5914     int vlen_enc = vector_length_encoding(this);
 5915     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5916   %}
 5917   ins_pipe( pipe_slow );
 5918 %}
 5919 
 5920 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5921   predicate((UseAVX > 0) &&
 5922             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5923   match(Set dst (SubVD src (LoadVector mem)));
 5924   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5925   ins_encode %{
 5926     int vlen_enc = vector_length_encoding(this);
 5927     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5928   %}
 5929   ins_pipe( pipe_slow );
 5930 %}
 5931 
 5932 // --------------------------------- MUL --------------------------------------
 5933 
 5934 // Byte vector mul
 5935 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5936   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5937   match(Set dst (MulVB src1 src2));
 5938   effect(TEMP dst, TEMP xtmp);
 5939   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5940   ins_encode %{
 5941     assert(UseSSE > 3, "required");
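          // Sign-extend the byte lanes to 16-bit words, multiply as words, zero
          // the high byte of each 16-bit product, then pack back down to bytes.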
 5942     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5943     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5944     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5945     __ psllw($dst$$XMMRegister, 8);
 5946     __ psrlw($dst$$XMMRegister, 8);
 5947     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5948   %}
 5949   ins_pipe( pipe_slow );
 5950 %}
 5951 
 5952 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5953   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5954   match(Set dst (MulVB src1 src2));
 5955   effect(TEMP dst, TEMP xtmp);
 5956   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5957   ins_encode %{
 5958     assert(UseSSE > 3, "required");
 5959     // Odd-index elements
 5960     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5961     __ psrlw($dst$$XMMRegister, 8);
 5962     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5963     __ psrlw($xtmp$$XMMRegister, 8);
 5964     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5965     __ psllw($dst$$XMMRegister, 8);
 5966     // Even-index elements
 5967     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5968     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5969     __ psllw($xtmp$$XMMRegister, 8);
 5970     __ psrlw($xtmp$$XMMRegister, 8);
 5971     // Combine
 5972     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5973   %}
 5974   ins_pipe( pipe_slow );
 5975 %}
 5976 
 5977 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5978   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5979   match(Set dst (MulVB src1 src2));
 5980   effect(TEMP xtmp1, TEMP xtmp2);
 5981   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5982   ins_encode %{
 5983     int vlen_enc = vector_length_encoding(this);
 5984     // Odd-index elements
 5985     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5986     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5987     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5988     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5989     // Even-index elements
 5990     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5991     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5992     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5993     // Combine
 5994     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5995   %}
 5996   ins_pipe( pipe_slow );
 5997 %}
 5998 
 5999 // Shorts/Chars vector mul
 6000 instruct vmulS(vec dst, vec src) %{
 6001   predicate(UseAVX == 0);
 6002   match(Set dst (MulVS dst src));
 6003   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6004   ins_encode %{
 6005     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6006   %}
 6007   ins_pipe( pipe_slow );
 6008 %}
 6009 
 6010 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6011   predicate(UseAVX > 0);
 6012   match(Set dst (MulVS src1 src2));
 6013   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6014   ins_encode %{
 6015     int vlen_enc = vector_length_encoding(this);
 6016     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6017   %}
 6018   ins_pipe( pipe_slow );
 6019 %}
 6020 
 6021 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6022   predicate((UseAVX > 0) &&
 6023             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6024   match(Set dst (MulVS src (LoadVector mem)));
 6025   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6026   ins_encode %{
 6027     int vlen_enc = vector_length_encoding(this);
 6028     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6029   %}
 6030   ins_pipe( pipe_slow );
 6031 %}
 6032 
 6033 // Integers vector mul
 6034 instruct vmulI(vec dst, vec src) %{
 6035   predicate(UseAVX == 0);
 6036   match(Set dst (MulVI dst src));
 6037   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6038   ins_encode %{
 6039     assert(UseSSE > 3, "required");
 6040     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6041   %}
 6042   ins_pipe( pipe_slow );
 6043 %}
 6044 
 6045 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6046   predicate(UseAVX > 0);
 6047   match(Set dst (MulVI src1 src2));
 6048   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6049   ins_encode %{
 6050     int vlen_enc = vector_length_encoding(this);
 6051     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6052   %}
 6053   ins_pipe( pipe_slow );
 6054 %}
 6055 
 6056 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6057   predicate((UseAVX > 0) &&
 6058             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6059   match(Set dst (MulVI src (LoadVector mem)));
 6060   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6061   ins_encode %{
 6062     int vlen_enc = vector_length_encoding(this);
 6063     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6064   %}
 6065   ins_pipe( pipe_slow );
 6066 %}
 6067 
 6068 // Longs vector mul
 6069 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6070   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6071              VM_Version::supports_avx512dq()) ||
 6072             VM_Version::supports_avx512vldq());
 6073   match(Set dst (MulVL src1 src2));
 6074   ins_cost(500);
 6075   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6076   ins_encode %{
 6077     assert(UseAVX > 2, "required");
 6078     int vlen_enc = vector_length_encoding(this);
 6079     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6080   %}
 6081   ins_pipe( pipe_slow );
 6082 %}
 6083 
 6084 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6085   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6086              VM_Version::supports_avx512dq()) ||
 6087             (Matcher::vector_length_in_bytes(n) > 8 &&
 6088              VM_Version::supports_avx512vldq()));
 6089   match(Set dst (MulVL src (LoadVector mem)));
 6090   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6091   ins_cost(500);
 6092   ins_encode %{
 6093     assert(UseAVX > 2, "required");
 6094     int vlen_enc = vector_length_encoding(this);
 6095     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6096   %}
 6097   ins_pipe( pipe_slow );
 6098 %}
 6099 
 6100 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6101   predicate(UseAVX == 0);
 6102   match(Set dst (MulVL src1 src2));
 6103   ins_cost(500);
 6104   effect(TEMP dst, TEMP xtmp);
 6105   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6106   ins_encode %{
 6107     assert(VM_Version::supports_sse4_1(), "required");
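          // 64x64->64 bit multiply built from 32-bit pieces (modulo 2^64):
          //   a*b = ((a_lo*b_hi + a_hi*b_lo) << 32) + a_lo*b_lo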
 6108     // Get the lo-hi cross products; only their lower 32 bits matter
 6109     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6110     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6111     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6112     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6113     __ psllq($dst$$XMMRegister, 32);
 6114     // Get the lo-lo products
 6115     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6116     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6117     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6118   %}
 6119   ins_pipe( pipe_slow );
 6120 %}
 6121 
 6122 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6123   predicate(UseAVX > 0 &&
 6124             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6125               !VM_Version::supports_avx512dq()) ||
 6126              (Matcher::vector_length_in_bytes(n) < 64 &&
 6127               !VM_Version::supports_avx512vldq())));
 6128   match(Set dst (MulVL src1 src2));
 6129   effect(TEMP xtmp1, TEMP xtmp2);
 6130   ins_cost(500);
 6131   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6132   ins_encode %{
 6133     int vlen_enc = vector_length_encoding(this);
 6134     // Get the lo-hi cross products; only their lower 32 bits matter
 6135     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6136     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6137     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6138     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6139     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6140     // Get the lo-lo products
 6141     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6142     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6143   %}
 6144   ins_pipe( pipe_slow );
 6145 %}
 6146 
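      // When both MulVL inputs are known to be zero-extended (vmuludq_reg) or
      // sign-extended (vmuldq_reg) 32-bit values, a single 32x32->64 multiply
      // yields the full 64-bit product, avoiding the longer sequences above.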
 6147 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6148   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6149   match(Set dst (MulVL src1 src2));
 6150   ins_cost(100);
 6151   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6152   ins_encode %{
 6153     int vlen_enc = vector_length_encoding(this);
 6154     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6155   %}
 6156   ins_pipe( pipe_slow );
 6157 %}
 6158 
 6159 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6160   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6161   match(Set dst (MulVL src1 src2));
 6162   ins_cost(100);
 6163   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6164   ins_encode %{
 6165     int vlen_enc = vector_length_encoding(this);
 6166     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6167   %}
 6168   ins_pipe( pipe_slow );
 6169 %}
 6170 
 6171 // Floats vector mul
 6172 instruct vmulF(vec dst, vec src) %{
 6173   predicate(UseAVX == 0);
 6174   match(Set dst (MulVF dst src));
 6175   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6176   ins_encode %{
 6177     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6178   %}
 6179   ins_pipe( pipe_slow );
 6180 %}
 6181 
 6182 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6183   predicate(UseAVX > 0);
 6184   match(Set dst (MulVF src1 src2));
 6185   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6186   ins_encode %{
 6187     int vlen_enc = vector_length_encoding(this);
 6188     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6189   %}
 6190   ins_pipe( pipe_slow );
 6191 %}
 6192 
 6193 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6194   predicate((UseAVX > 0) &&
 6195             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6196   match(Set dst (MulVF src (LoadVector mem)));
 6197   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6198   ins_encode %{
 6199     int vlen_enc = vector_length_encoding(this);
 6200     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6201   %}
 6202   ins_pipe( pipe_slow );
 6203 %}
 6204 
 6205 // Doubles vector mul
 6206 instruct vmulD(vec dst, vec src) %{
 6207   predicate(UseAVX == 0);
 6208   match(Set dst (MulVD dst src));
 6209   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6210   ins_encode %{
 6211     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6212   %}
 6213   ins_pipe( pipe_slow );
 6214 %}
 6215 
 6216 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6217   predicate(UseAVX > 0);
 6218   match(Set dst (MulVD src1 src2));
 6219   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6220   ins_encode %{
 6221     int vlen_enc = vector_length_encoding(this);
 6222     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6223   %}
 6224   ins_pipe( pipe_slow );
 6225 %}
 6226 
 6227 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6228   predicate((UseAVX > 0) &&
 6229             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6230   match(Set dst (MulVD src (LoadVector mem)));
 6231   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6232   ins_encode %{
 6233     int vlen_enc = vector_length_encoding(this);
 6234     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6235   %}
 6236   ins_pipe( pipe_slow );
 6237 %}
 6238 
 6239 // --------------------------------- DIV --------------------------------------
 6240 
 6241 // Floats vector div
 6242 instruct vdivF(vec dst, vec src) %{
 6243   predicate(UseAVX == 0);
 6244   match(Set dst (DivVF dst src));
 6245   format %{ "divps   $dst,$src\t! div packedF" %}
 6246   ins_encode %{
 6247     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6248   %}
 6249   ins_pipe( pipe_slow );
 6250 %}
 6251 
 6252 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6253   predicate(UseAVX > 0);
 6254   match(Set dst (DivVF src1 src2));
 6255   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6256   ins_encode %{
 6257     int vlen_enc = vector_length_encoding(this);
 6258     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6259   %}
 6260   ins_pipe( pipe_slow );
 6261 %}
 6262 
 6263 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6264   predicate((UseAVX > 0) &&
 6265             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6266   match(Set dst (DivVF src (LoadVector mem)));
 6267   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6268   ins_encode %{
 6269     int vlen_enc = vector_length_encoding(this);
 6270     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6271   %}
 6272   ins_pipe( pipe_slow );
 6273 %}
 6274 
 6275 // Doubles vector div
 6276 instruct vdivD(vec dst, vec src) %{
 6277   predicate(UseAVX == 0);
 6278   match(Set dst (DivVD dst src));
 6279   format %{ "divpd   $dst,$src\t! div packedD" %}
 6280   ins_encode %{
 6281     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6282   %}
 6283   ins_pipe( pipe_slow );
 6284 %}
 6285 
 6286 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6287   predicate(UseAVX > 0);
 6288   match(Set dst (DivVD src1 src2));
 6289   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6290   ins_encode %{
 6291     int vlen_enc = vector_length_encoding(this);
 6292     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6293   %}
 6294   ins_pipe( pipe_slow );
 6295 %}
 6296 
 6297 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6298   predicate((UseAVX > 0) &&
 6299             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6300   match(Set dst (DivVD src (LoadVector mem)));
 6301   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6302   ins_encode %{
 6303     int vlen_enc = vector_length_encoding(this);
 6304     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6305   %}
 6306   ins_pipe( pipe_slow );
 6307 %}
 6308 
 6309 // ------------------------------ MinMax ---------------------------------------
 6310 
 6311 // Byte, Short, Int vector Min/Max
 6312 instruct minmax_reg_sse(vec dst, vec src) %{
 6313   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6314             UseAVX == 0);
 6315   match(Set dst (MinV dst src));
 6316   match(Set dst (MaxV dst src));
 6317   format %{ "vector_minmax  $dst,$src\t!  " %}
 6318   ins_encode %{
 6319     assert(UseSSE >= 4, "required");
 6320 
 6321     int opcode = this->ideal_Opcode();
 6322     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6323     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6324   %}
 6325   ins_pipe( pipe_slow );
 6326 %}
 6327 
 6328 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6329   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6330             UseAVX > 0);
 6331   match(Set dst (MinV src1 src2));
 6332   match(Set dst (MaxV src1 src2));
 6333   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6334   ins_encode %{
 6335     int opcode = this->ideal_Opcode();
 6336     int vlen_enc = vector_length_encoding(this);
 6337     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6338 
 6339     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6340   %}
 6341   ins_pipe( pipe_slow );
 6342 %}
 6343 
 6344 // Long vector Min/Max
 6345 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6346   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6347             UseAVX == 0);
 6348   match(Set dst (MinV dst src));
 6349   match(Set dst (MaxV src dst));
 6350   effect(TEMP dst, TEMP tmp);
  format %{ "vector_minmaxL  $dst,$src\t! using $tmp as TEMP" %}
 6352   ins_encode %{
 6353     assert(UseSSE >= 4, "required");
 6354 
 6355     int opcode = this->ideal_Opcode();
 6356     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6357     assert(elem_bt == T_LONG, "sanity");
 6358 
 6359     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6360   %}
 6361   ins_pipe( pipe_slow );
 6362 %}
 6363 
 6364 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6365   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6366             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6367   match(Set dst (MinV src1 src2));
 6368   match(Set dst (MaxV src1 src2));
 6369   effect(TEMP dst);
 6370   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6371   ins_encode %{
 6372     int vlen_enc = vector_length_encoding(this);
 6373     int opcode = this->ideal_Opcode();
 6374     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6375     assert(elem_bt == T_LONG, "sanity");
 6376 
 6377     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6378   %}
 6379   ins_pipe( pipe_slow );
 6380 %}
 6381 
 6382 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6383   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6384             Matcher::vector_element_basic_type(n) == T_LONG);
 6385   match(Set dst (MinV src1 src2));
 6386   match(Set dst (MaxV src1 src2));
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6388   ins_encode %{
 6389     assert(UseAVX > 2, "required");
 6390 
 6391     int vlen_enc = vector_length_encoding(this);
 6392     int opcode = this->ideal_Opcode();
 6393     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6394     assert(elem_bt == T_LONG, "sanity");
 6395 
 6396     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6397   %}
 6398   ins_pipe( pipe_slow );
 6399 %}
 6400 
 6401 // Float/Double vector Min/Max
 6402 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6403   predicate(VM_Version::supports_avx10_2() &&
 6404             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6405   match(Set dst (MinV a b));
 6406   match(Set dst (MaxV a b));
 6407   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6408   ins_encode %{
 6409     int vlen_enc = vector_length_encoding(this);
 6410     int opcode = this->ideal_Opcode();
 6411     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6412     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6413   %}
 6414   ins_pipe( pipe_slow );
 6415 %}
 6416 
 6417 // Float/Double vector Min/Max
 6418 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6419   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6420             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6421             UseAVX > 0);
 6422   match(Set dst (MinV a b));
 6423   match(Set dst (MaxV a b));
 6424   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6425   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6426   ins_encode %{
 6427     assert(UseAVX > 0, "required");
 6428 
 6429     int opcode = this->ideal_Opcode();
 6430     int vlen_enc = vector_length_encoding(this);
 6431     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6432 
 6433     __ vminmax_fp(opcode, elem_bt,
 6434                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6435                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6436   %}
 6437   ins_pipe( pipe_slow );
 6438 %}
 6439 
 6440 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6441   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6442             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6443   match(Set dst (MinV a b));
 6444   match(Set dst (MaxV a b));
 6445   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6446   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6447   ins_encode %{
 6448     assert(UseAVX > 2, "required");
 6449 
 6450     int opcode = this->ideal_Opcode();
 6451     int vlen_enc = vector_length_encoding(this);
 6452     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6453 
 6454     __ evminmax_fp(opcode, elem_bt,
 6455                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6456                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6457   %}
 6458   ins_pipe( pipe_slow );
 6459 %}
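
// Note: Java Math.min/max semantics are why the pre-AVX10.2 rules above need
// temporaries and blend sequences: a plain vminps/vmaxps does not reliably
// propagate NaN from either operand and does not treat -0.0 as smaller than
// +0.0. Scalar sketch of the behaviour being implemented (illustrative only):
//
//   float java_min(float a, float b) {
//     if (a != a) return a;                    // NaN propagates
//     if (b != b) return b;
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? a : b;        // min(+0.0, -0.0) == -0.0
//     }
//     return a < b ? a : b;
//   }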
 6460 
 6461 // ------------------------------ Unsigned vector Min/Max ----------------------
 6462 
 6463 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6464   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6465   match(Set dst (UMinV a b));
 6466   match(Set dst (UMaxV a b));
 6467   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6468   ins_encode %{
 6469     int opcode = this->ideal_Opcode();
 6470     int vlen_enc = vector_length_encoding(this);
 6471     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6472     assert(is_integral_type(elem_bt), "");
 6473     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6474   %}
 6475   ins_pipe( pipe_slow );
 6476 %}
 6477 
 6478 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6479   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6480   match(Set dst (UMinV a (LoadVector b)));
 6481   match(Set dst (UMaxV a (LoadVector b)));
 6482   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6483   ins_encode %{
 6484     int opcode = this->ideal_Opcode();
 6485     int vlen_enc = vector_length_encoding(this);
 6486     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6487     assert(is_integral_type(elem_bt), "");
 6488     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6489   %}
 6490   ins_pipe( pipe_slow );
 6491 %}
 6492 
 6493 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6494   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6495   match(Set dst (UMinV a b));
 6496   match(Set dst (UMaxV a b));
 6497   effect(TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6499   ins_encode %{
 6500     int opcode = this->ideal_Opcode();
 6501     int vlen_enc = vector_length_encoding(this);
 6502     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6503   %}
 6504   ins_pipe( pipe_slow );
 6505 %}
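
// Note: the rule above covers targets where the EVEX unsigned 64-bit min/max
// forms are not usable for this vector length, so it goes through a helper
// with two temporaries. A standard way to get an unsigned compare out of
// signed-compare hardware is to flip the sign bits first (scalar sketch,
// illustrative only; the exact helper sequence may differ):
//
//   // unsigned a < b  <==>  signed (a ^ MIN) < (b ^ MIN),  MIN = 1ULL << 63
//   uint64_t umin(uint64_t a, uint64_t b) {
//     int64_t af = (int64_t)(a ^ (1ULL << 63));
//     int64_t bf = (int64_t)(b ^ (1ULL << 63));
//     return (af < bf) ? a : b;
//   }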
 6506 
 6507 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6508   match(Set dst (UMinV (Binary dst src2) mask));
 6509   match(Set dst (UMaxV (Binary dst src2) mask));
 6510   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6511   ins_encode %{
 6512     int vlen_enc = vector_length_encoding(this);
 6513     BasicType bt = Matcher::vector_element_basic_type(this);
 6514     int opc = this->ideal_Opcode();
 6515     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6516                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6517   %}
 6518   ins_pipe( pipe_slow );
 6519 %}
 6520 
 6521 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6522   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6523   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6524   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6525   ins_encode %{
 6526     int vlen_enc = vector_length_encoding(this);
 6527     BasicType bt = Matcher::vector_element_basic_type(this);
 6528     int opc = this->ideal_Opcode();
 6529     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6530                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6531   %}
 6532   ins_pipe( pipe_slow );
 6533 %}
 6534 
 6535 // --------------------------------- Signum/CopySign ---------------------------
 6536 
 6537 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6538   match(Set dst (SignumF dst (Binary zero one)));
 6539   effect(KILL cr);
 6540   format %{ "signumF $dst, $dst" %}
 6541   ins_encode %{
 6542     int opcode = this->ideal_Opcode();
 6543     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6544   %}
 6545   ins_pipe( pipe_slow );
 6546 %}
 6547 
 6548 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6549   match(Set dst (SignumD dst (Binary zero one)));
 6550   effect(KILL cr);
 6551   format %{ "signumD $dst, $dst" %}
 6552   ins_encode %{
 6553     int opcode = this->ideal_Opcode();
 6554     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6555   %}
 6556   ins_pipe( pipe_slow );
 6557 %}
 6558 
 6559 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6560   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6561   match(Set dst (SignumVF src (Binary zero one)));
 6562   match(Set dst (SignumVD src (Binary zero one)));
 6563   effect(TEMP dst, TEMP xtmp1);
 6564   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6565   ins_encode %{
 6566     int opcode = this->ideal_Opcode();
 6567     int vec_enc = vector_length_encoding(this);
 6568     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6569                          $xtmp1$$XMMRegister, vec_enc);
 6570   %}
 6571   ins_pipe( pipe_slow );
 6572 %}
 6573 
 6574 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6575   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6576   match(Set dst (SignumVF src (Binary zero one)));
 6577   match(Set dst (SignumVD src (Binary zero one)));
 6578   effect(TEMP dst, TEMP ktmp1);
 6579   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6580   ins_encode %{
 6581     int opcode = this->ideal_Opcode();
 6582     int vec_enc = vector_length_encoding(this);
 6583     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6584                           $ktmp1$$KRegister, vec_enc);
 6585   %}
 6586   ins_pipe( pipe_slow );
 6587 %}
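
// Note: both signum rules above implement Java Math.signum semantics: 1.0 for
// a positive input, -1.0 for a negative input, and the input itself for
// zeroes (preserving the sign of zero) and NaN. Scalar sketch (illustrative
// only):
//
//   double signum(double x) {
//     if (x > 0.0) return 1.0;
//     if (x < 0.0) return -1.0;
//     return x;                 // +/-0.0 and NaN pass through unchanged
//   }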
 6588 
 6589 // ---------------------------------------
// For copySign use 0xE4 as the immediate (truth-table selector) for vpternlog
 6591 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6592 // C (xmm2) is set to 0x7FFFFFFF
 6593 // Wherever xmm2 is 0, we want to pick from B (sign)
 6594 // Wherever xmm2 is 1, we want to pick from A (src)
 6595 //
 6596 // A B C Result
 6597 // 0 0 0 0
 6598 // 0 0 1 0
 6599 // 0 1 0 1
 6600 // 0 1 1 0
 6601 // 1 0 0 0
 6602 // 1 0 1 1
 6603 // 1 1 0 1
 6604 // 1 1 1 1
 6605 //
// Result read from high bit to low bit is 0b11100100 = 0xE4
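//
// The same immediate can be derived mechanically: vpternlog uses bit
// (A << 2 | B << 1 | C) of the imm8 as the result for that input combination.
// Sketch of the derivation for "C ? A : B" (illustrative only):
//
//   int imm = 0;
//   for (int a = 0; a <= 1; a++)
//     for (int b = 0; b <= 1; b++)
//       for (int c = 0; c <= 1; c++)
//         if (c ? a : b) imm |= 1 << ((a << 2) | (b << 1) | c);
//   // imm == 0xE4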
 6607 // ---------------------------------------
 6608 
 6609 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6610   match(Set dst (CopySignF dst src));
 6611   effect(TEMP tmp1, TEMP tmp2);
 6612   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6613   ins_encode %{
 6614     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6615     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6616     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6617   %}
 6618   ins_pipe( pipe_slow );
 6619 %}
 6620 
 6621 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6622   match(Set dst (CopySignD dst (Binary src zero)));
 6623   ins_cost(100);
 6624   effect(TEMP tmp1, TEMP tmp2);
 6625   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6626   ins_encode %{
 6627     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6628     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6629     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6630   %}
 6631   ins_pipe( pipe_slow );
 6632 %}
 6633 
 6634 //----------------------------- CompressBits/ExpandBits ------------------------
 6635 
 6636 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6637   predicate(n->bottom_type()->isa_int());
 6638   match(Set dst (CompressBits src mask));
 6639   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6640   ins_encode %{
 6641     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6642   %}
 6643   ins_pipe( pipe_slow );
 6644 %}
 6645 
 6646 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6647   predicate(n->bottom_type()->isa_int());
 6648   match(Set dst (ExpandBits src mask));
 6649   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6650   ins_encode %{
 6651     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6652   %}
 6653   ins_pipe( pipe_slow );
 6654 %}
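
// Worked example of the two BMI2 primitives used above (illustrative only):
// pext gathers the src bits selected by mask into the low bits of the result,
// and pdep scatters the low bits of src to the positions selected by mask.
//
//   pext(src = 0b10110010, mask = 0b11001100) == 0b00001000
//     (mask selects src bits 2,3,6,7 = 0,0,0,1, packed from the LSB up)
//   pdep(src = 0b00001011, mask = 0b11001100) == 0b10001100
//     (src bits 0,1,2,3 = 1,1,0,1, deposited into mask positions 2,3,6,7)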
 6655 
 6656 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6657   predicate(n->bottom_type()->isa_int());
 6658   match(Set dst (CompressBits src (LoadI mask)));
 6659   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6660   ins_encode %{
 6661     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6662   %}
 6663   ins_pipe( pipe_slow );
 6664 %}
 6665 
 6666 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6667   predicate(n->bottom_type()->isa_int());
 6668   match(Set dst (ExpandBits src (LoadI mask)));
 6669   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6670   ins_encode %{
 6671     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6672   %}
 6673   ins_pipe( pipe_slow );
 6674 %}
 6675 
 6676 // --------------------------------- Sqrt --------------------------------------
 6677 
 6678 instruct vsqrtF_reg(vec dst, vec src) %{
 6679   match(Set dst (SqrtVF src));
 6680   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6681   ins_encode %{
 6682     assert(UseAVX > 0, "required");
 6683     int vlen_enc = vector_length_encoding(this);
 6684     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6685   %}
 6686   ins_pipe( pipe_slow );
 6687 %}
 6688 
 6689 instruct vsqrtF_mem(vec dst, memory mem) %{
 6690   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6691   match(Set dst (SqrtVF (LoadVector mem)));
 6692   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6693   ins_encode %{
 6694     assert(UseAVX > 0, "required");
 6695     int vlen_enc = vector_length_encoding(this);
 6696     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6697   %}
 6698   ins_pipe( pipe_slow );
 6699 %}
 6700 
 6701 // Floating point vector sqrt
 6702 instruct vsqrtD_reg(vec dst, vec src) %{
 6703   match(Set dst (SqrtVD src));
 6704   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6705   ins_encode %{
 6706     assert(UseAVX > 0, "required");
 6707     int vlen_enc = vector_length_encoding(this);
 6708     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6709   %}
 6710   ins_pipe( pipe_slow );
 6711 %}
 6712 
 6713 instruct vsqrtD_mem(vec dst, memory mem) %{
 6714   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6715   match(Set dst (SqrtVD (LoadVector mem)));
 6716   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6717   ins_encode %{
 6718     assert(UseAVX > 0, "required");
 6719     int vlen_enc = vector_length_encoding(this);
 6720     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6721   %}
 6722   ins_pipe( pipe_slow );
 6723 %}
 6724 
 6725 // ------------------------------ Shift ---------------------------------------
 6726 
 6727 // Left and right shift count vectors are the same on x86
 6728 // (only lowest bits of xmm reg are used for count).
 6729 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6730   match(Set dst (LShiftCntV cnt));
 6731   match(Set dst (RShiftCntV cnt));
 6732   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6733   ins_encode %{
 6734     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6735   %}
 6736   ins_pipe( pipe_slow );
 6737 %}
 6738 
 6739 // Byte vector shift
 6740 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6741   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6742   match(Set dst ( LShiftVB src shift));
 6743   match(Set dst ( RShiftVB src shift));
 6744   match(Set dst (URShiftVB src shift));
 6745   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6746   format %{"vector_byte_shift $dst,$src,$shift" %}
 6747   ins_encode %{
 6748     assert(UseSSE > 3, "required");
 6749     int opcode = this->ideal_Opcode();
 6750     bool sign = (opcode != Op_URShiftVB);
 6751     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6752     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6753     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6754     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6755     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6756   %}
 6757   ins_pipe( pipe_slow );
 6758 %}
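
// Note: x86 has no 8-bit vector shift instructions, so the byte-shift rules
// here widen each byte to a 16-bit lane, shift, mask the lane back down to
// 8 significant bits and re-pack. Scalar sketch of one lane for a logical
// right shift (illustrative only):
//
//   uint16_t w = (uint16_t)(uint8_t)src;   // vextendbw (zero-extend here)
//   w = w >> shift;                        // vshiftw
//   w = w & 0x00FF;                        // pand with short->byte mask
//   uint8_t r = (uint8_t)w;                // packuswb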
 6759 
 6760 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6761   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6762             UseAVX <= 1);
 6763   match(Set dst ( LShiftVB src shift));
 6764   match(Set dst ( RShiftVB src shift));
 6765   match(Set dst (URShiftVB src shift));
 6766   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6767   format %{"vector_byte_shift $dst,$src,$shift" %}
 6768   ins_encode %{
 6769     assert(UseSSE > 3, "required");
 6770     int opcode = this->ideal_Opcode();
 6771     bool sign = (opcode != Op_URShiftVB);
 6772     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6773     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6774     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6775     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6776     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6777     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6778     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6779     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6780     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6781   %}
 6782   ins_pipe( pipe_slow );
 6783 %}
 6784 
 6785 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6786   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6787             UseAVX > 1);
 6788   match(Set dst ( LShiftVB src shift));
 6789   match(Set dst ( RShiftVB src shift));
 6790   match(Set dst (URShiftVB src shift));
 6791   effect(TEMP dst, TEMP tmp);
 6792   format %{"vector_byte_shift $dst,$src,$shift" %}
 6793   ins_encode %{
 6794     int opcode = this->ideal_Opcode();
 6795     bool sign = (opcode != Op_URShiftVB);
 6796     int vlen_enc = Assembler::AVX_256bit;
 6797     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6798     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6799     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6800     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6801     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6802   %}
 6803   ins_pipe( pipe_slow );
 6804 %}
 6805 
 6806 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6807   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6808   match(Set dst ( LShiftVB src shift));
 6809   match(Set dst ( RShiftVB src shift));
 6810   match(Set dst (URShiftVB src shift));
 6811   effect(TEMP dst, TEMP tmp);
 6812   format %{"vector_byte_shift $dst,$src,$shift" %}
 6813   ins_encode %{
 6814     assert(UseAVX > 1, "required");
 6815     int opcode = this->ideal_Opcode();
 6816     bool sign = (opcode != Op_URShiftVB);
 6817     int vlen_enc = Assembler::AVX_256bit;
 6818     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6819     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6820     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6821     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6822     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6823     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6824     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6825     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6826     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6827   %}
 6828   ins_pipe( pipe_slow );
 6829 %}
 6830 
 6831 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6832   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6833   match(Set dst ( LShiftVB src shift));
 6834   match(Set dst  (RShiftVB src shift));
 6835   match(Set dst (URShiftVB src shift));
 6836   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6837   format %{"vector_byte_shift $dst,$src,$shift" %}
 6838   ins_encode %{
 6839     assert(UseAVX > 2, "required");
 6840     int opcode = this->ideal_Opcode();
 6841     bool sign = (opcode != Op_URShiftVB);
 6842     int vlen_enc = Assembler::AVX_512bit;
 6843     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6844     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6845     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6846     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6847     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6848     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6849     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6850     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6851     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6852     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6853     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6854     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6855   %}
 6856   ins_pipe( pipe_slow );
 6857 %}
 6858 
// A 16-bit vector logical right shift produces an incorrect Java result for
// negative shorts because Java promotes a short to an int with sign extension
// before shifting. Char vectors are fine since chars are unsigned.
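// For example, with a short lane holding -1 and a shift of 2, Java computes
// ((int)(short)-1) >>> 2 == 0x3FFFFFFF, which narrows back to (short)-1,
// whereas a 16-bit lane shift would produce 0x3FFF (16383).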
// Shorts/Chars vector shift
 6864 instruct vshiftS(vec dst, vec src, vec shift) %{
 6865   predicate(!n->as_ShiftV()->is_var_shift());
 6866   match(Set dst ( LShiftVS src shift));
 6867   match(Set dst ( RShiftVS src shift));
 6868   match(Set dst (URShiftVS src shift));
 6869   effect(TEMP dst, USE src, USE shift);
 6870   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6871   ins_encode %{
 6872     int opcode = this->ideal_Opcode();
 6873     if (UseAVX > 0) {
 6874       int vlen_enc = vector_length_encoding(this);
 6875       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6876     } else {
 6877       int vlen = Matcher::vector_length(this);
 6878       if (vlen == 2) {
 6879         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6880         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6881       } else if (vlen == 4) {
 6882         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6883         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6884       } else {
 6885         assert (vlen == 8, "sanity");
 6886         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6887         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6888       }
 6889     }
 6890   %}
 6891   ins_pipe( pipe_slow );
 6892 %}
 6893 
// Integers vector shift
 6895 instruct vshiftI(vec dst, vec src, vec shift) %{
 6896   predicate(!n->as_ShiftV()->is_var_shift());
 6897   match(Set dst ( LShiftVI src shift));
 6898   match(Set dst ( RShiftVI src shift));
 6899   match(Set dst (URShiftVI src shift));
 6900   effect(TEMP dst, USE src, USE shift);
 6901   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6902   ins_encode %{
 6903     int opcode = this->ideal_Opcode();
 6904     if (UseAVX > 0) {
 6905       int vlen_enc = vector_length_encoding(this);
 6906       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6907     } else {
 6908       int vlen = Matcher::vector_length(this);
 6909       if (vlen == 2) {
 6910         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6911         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6912       } else {
 6913         assert(vlen == 4, "sanity");
 6914         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6915         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6916       }
 6917     }
 6918   %}
 6919   ins_pipe( pipe_slow );
 6920 %}
 6921 
// Integers vector constant shift
 6923 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6924   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6925   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6926   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6927   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6928   ins_encode %{
 6929     int opcode = this->ideal_Opcode();
 6930     if (UseAVX > 0) {
 6931       int vector_len = vector_length_encoding(this);
 6932       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6933     } else {
 6934       int vlen = Matcher::vector_length(this);
 6935       if (vlen == 2) {
 6936         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6937         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6938       } else {
 6939         assert(vlen == 4, "sanity");
 6940         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6941         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6942       }
 6943     }
 6944   %}
 6945   ins_pipe( pipe_slow );
 6946 %}
 6947 
 6948 // Longs vector shift
 6949 instruct vshiftL(vec dst, vec src, vec shift) %{
 6950   predicate(!n->as_ShiftV()->is_var_shift());
 6951   match(Set dst ( LShiftVL src shift));
 6952   match(Set dst (URShiftVL src shift));
 6953   effect(TEMP dst, USE src, USE shift);
 6954   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6955   ins_encode %{
 6956     int opcode = this->ideal_Opcode();
 6957     if (UseAVX > 0) {
 6958       int vlen_enc = vector_length_encoding(this);
 6959       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6960     } else {
 6961       assert(Matcher::vector_length(this) == 2, "");
 6962       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6963       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6964     }
 6965   %}
 6966   ins_pipe( pipe_slow );
 6967 %}
 6968 
 6969 // Longs vector constant shift
 6970 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6971   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6972   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6973   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6974   ins_encode %{
 6975     int opcode = this->ideal_Opcode();
 6976     if (UseAVX > 0) {
 6977       int vector_len = vector_length_encoding(this);
 6978       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6979     } else {
 6980       assert(Matcher::vector_length(this) == 2, "");
 6981       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6982       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6983     }
 6984   %}
 6985   ins_pipe( pipe_slow );
 6986 %}
 6987 
 6988 // -------------------ArithmeticRightShift -----------------------------------
 6989 // Long vector arithmetic right shift
 6990 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6991   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6992   match(Set dst (RShiftVL src shift));
 6993   effect(TEMP dst, TEMP tmp);
 6994   format %{ "vshiftq $dst,$src,$shift" %}
 6995   ins_encode %{
 6996     uint vlen = Matcher::vector_length(this);
 6997     if (vlen == 2) {
 6998       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6999       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7000       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7001       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7002       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7003       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7004     } else {
 7005       assert(vlen == 4, "sanity");
 7006       assert(UseAVX > 1, "required");
 7007       int vlen_enc = Assembler::AVX_256bit;
 7008       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7009       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7010       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7011       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7012       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7013     }
 7014   %}
 7015   ins_pipe( pipe_slow );
 7016 %}
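
// Note: before AVX-512 there is no packed 64-bit arithmetic right shift, so
// the rule above builds it from the logical shift using the identity
// sra(x, s) == (srl(x, s) ^ m) - m, with m == srl(0x8000000000000000, s).
// Scalar sketch (illustrative only):
//
//   uint64_t m = 0x8000000000000000ULL >> s;   // shifted sign bit
//   int64_t  r = (int64_t)((((uint64_t)x) >> s) ^ m) - (int64_t)m;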
 7017 
 7018 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7019   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7020   match(Set dst (RShiftVL src shift));
 7021   format %{ "vshiftq $dst,$src,$shift" %}
 7022   ins_encode %{
 7023     int vlen_enc = vector_length_encoding(this);
 7024     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7025   %}
 7026   ins_pipe( pipe_slow );
 7027 %}
 7028 
 7029 // ------------------- Variable Shift -----------------------------
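//
// Unlike the fixed-count rules above, each lane here is shifted by its own
// count, i.e. per element (scalar sketch, illustrative only):
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = src[i] << shift[i];       // likewise for >> and >>>
//   }
//
// Before AVX512BW there are no variable 8-bit or 16-bit vector shifts, so the
// byte and short rules below widen the lanes to a width that has one, shift,
// and narrow the results back down.
//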
 7030 // Byte variable shift
 7031 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7032   predicate(Matcher::vector_length(n) <= 8 &&
 7033             n->as_ShiftV()->is_var_shift() &&
 7034             !VM_Version::supports_avx512bw());
 7035   match(Set dst ( LShiftVB src shift));
 7036   match(Set dst ( RShiftVB src shift));
 7037   match(Set dst (URShiftVB src shift));
 7038   effect(TEMP dst, TEMP vtmp);
 7039   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7040   ins_encode %{
 7041     assert(UseAVX >= 2, "required");
 7042 
 7043     int opcode = this->ideal_Opcode();
 7044     int vlen_enc = Assembler::AVX_128bit;
 7045     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7046     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7047   %}
 7048   ins_pipe( pipe_slow );
 7049 %}
 7050 
 7051 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7052   predicate(Matcher::vector_length(n) == 16 &&
 7053             n->as_ShiftV()->is_var_shift() &&
 7054             !VM_Version::supports_avx512bw());
 7055   match(Set dst ( LShiftVB src shift));
 7056   match(Set dst ( RShiftVB src shift));
 7057   match(Set dst (URShiftVB src shift));
 7058   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7059   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7060   ins_encode %{
 7061     assert(UseAVX >= 2, "required");
 7062 
 7063     int opcode = this->ideal_Opcode();
 7064     int vlen_enc = Assembler::AVX_128bit;
 7065     // Shift lower half and get word result in dst
 7066     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7067 
 7068     // Shift upper half and get word result in vtmp1
 7069     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7070     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7071     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7072 
 7073     // Merge and down convert the two word results to byte in dst
 7074     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7075   %}
 7076   ins_pipe( pipe_slow );
 7077 %}
 7078 
 7079 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7080   predicate(Matcher::vector_length(n) == 32 &&
 7081             n->as_ShiftV()->is_var_shift() &&
 7082             !VM_Version::supports_avx512bw());
 7083   match(Set dst ( LShiftVB src shift));
 7084   match(Set dst ( RShiftVB src shift));
 7085   match(Set dst (URShiftVB src shift));
 7086   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7088   ins_encode %{
 7089     assert(UseAVX >= 2, "required");
 7090 
 7091     int opcode = this->ideal_Opcode();
 7092     int vlen_enc = Assembler::AVX_128bit;
 7093     // Process lower 128 bits and get result in dst
 7094     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7095     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7096     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7097     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7098     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7099 
 7100     // Process higher 128 bits and get result in vtmp3
 7101     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7102     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7103     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7104     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7105     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7106     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7107     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7108 
 7109     // Merge the two results in dst
 7110     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7111   %}
 7112   ins_pipe( pipe_slow );
 7113 %}
 7114 
 7115 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7116   predicate(Matcher::vector_length(n) <= 32 &&
 7117             n->as_ShiftV()->is_var_shift() &&
 7118             VM_Version::supports_avx512bw());
 7119   match(Set dst ( LShiftVB src shift));
 7120   match(Set dst ( RShiftVB src shift));
 7121   match(Set dst (URShiftVB src shift));
 7122   effect(TEMP dst, TEMP vtmp);
 7123   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7124   ins_encode %{
 7125     assert(UseAVX > 2, "required");
 7126 
 7127     int opcode = this->ideal_Opcode();
 7128     int vlen_enc = vector_length_encoding(this);
 7129     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7130   %}
 7131   ins_pipe( pipe_slow );
 7132 %}
 7133 
 7134 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7135   predicate(Matcher::vector_length(n) == 64 &&
 7136             n->as_ShiftV()->is_var_shift() &&
 7137             VM_Version::supports_avx512bw());
 7138   match(Set dst ( LShiftVB src shift));
 7139   match(Set dst ( RShiftVB src shift));
 7140   match(Set dst (URShiftVB src shift));
 7141   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7142   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7143   ins_encode %{
 7144     assert(UseAVX > 2, "required");
 7145 
 7146     int opcode = this->ideal_Opcode();
 7147     int vlen_enc = Assembler::AVX_256bit;
 7148     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7149     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7150     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7151     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7152     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7153   %}
 7154   ins_pipe( pipe_slow );
 7155 %}
 7156 
 7157 // Short variable shift
 7158 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7159   predicate(Matcher::vector_length(n) <= 8 &&
 7160             n->as_ShiftV()->is_var_shift() &&
 7161             !VM_Version::supports_avx512bw());
 7162   match(Set dst ( LShiftVS src shift));
 7163   match(Set dst ( RShiftVS src shift));
 7164   match(Set dst (URShiftVS src shift));
 7165   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp as TEMP" %}
 7167   ins_encode %{
 7168     assert(UseAVX >= 2, "required");
 7169 
 7170     int opcode = this->ideal_Opcode();
 7171     bool sign = (opcode != Op_URShiftVS);
 7172     int vlen_enc = Assembler::AVX_256bit;
 7173     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7174     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7175     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7176     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7177     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7178     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7179   %}
 7180   ins_pipe( pipe_slow );
 7181 %}
 7182 
 7183 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7184   predicate(Matcher::vector_length(n) == 16 &&
 7185             n->as_ShiftV()->is_var_shift() &&
 7186             !VM_Version::supports_avx512bw());
 7187   match(Set dst ( LShiftVS src shift));
 7188   match(Set dst ( RShiftVS src shift));
 7189   match(Set dst (URShiftVS src shift));
 7190   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 as TEMP" %}
 7192   ins_encode %{
 7193     assert(UseAVX >= 2, "required");
 7194 
 7195     int opcode = this->ideal_Opcode();
 7196     bool sign = (opcode != Op_URShiftVS);
 7197     int vlen_enc = Assembler::AVX_256bit;
 7198     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7199     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7200     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7201     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7202     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7203 
 7204     // Shift upper half, with result in dst using vtmp1 as TEMP
 7205     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7206     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7207     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7208     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7209     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7210     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7211 
 7212     // Merge lower and upper half result into dst
 7213     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7214     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7215   %}
 7216   ins_pipe( pipe_slow );
 7217 %}
 7218 
 7219 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7220   predicate(n->as_ShiftV()->is_var_shift() &&
 7221             VM_Version::supports_avx512bw());
 7222   match(Set dst ( LShiftVS src shift));
 7223   match(Set dst ( RShiftVS src shift));
 7224   match(Set dst (URShiftVS src shift));
 7225   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7226   ins_encode %{
 7227     assert(UseAVX > 2, "required");
 7228 
 7229     int opcode = this->ideal_Opcode();
 7230     int vlen_enc = vector_length_encoding(this);
 7231     if (!VM_Version::supports_avx512vl()) {
 7232       vlen_enc = Assembler::AVX_512bit;
 7233     }
 7234     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7235   %}
 7236   ins_pipe( pipe_slow );
 7237 %}
 7238 
// Integer variable shift
 7240 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7241   predicate(n->as_ShiftV()->is_var_shift());
 7242   match(Set dst ( LShiftVI src shift));
 7243   match(Set dst ( RShiftVI src shift));
 7244   match(Set dst (URShiftVI src shift));
 7245   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7246   ins_encode %{
 7247     assert(UseAVX >= 2, "required");
 7248 
 7249     int opcode = this->ideal_Opcode();
 7250     int vlen_enc = vector_length_encoding(this);
 7251     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7252   %}
 7253   ins_pipe( pipe_slow );
 7254 %}
 7255 
// Long variable shift
 7257 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7258   predicate(n->as_ShiftV()->is_var_shift());
 7259   match(Set dst ( LShiftVL src shift));
 7260   match(Set dst (URShiftVL src shift));
 7261   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7262   ins_encode %{
 7263     assert(UseAVX >= 2, "required");
 7264 
 7265     int opcode = this->ideal_Opcode();
 7266     int vlen_enc = vector_length_encoding(this);
 7267     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7268   %}
 7269   ins_pipe( pipe_slow );
 7270 %}
 7271 
// Long variable arithmetic right shift
 7273 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7274   predicate(Matcher::vector_length(n) <= 4 &&
 7275             n->as_ShiftV()->is_var_shift() &&
 7276             UseAVX == 2);
 7277   match(Set dst (RShiftVL src shift));
 7278   effect(TEMP dst, TEMP vtmp);
 7279   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7280   ins_encode %{
 7281     int opcode = this->ideal_Opcode();
 7282     int vlen_enc = vector_length_encoding(this);
 7283     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7284                  $vtmp$$XMMRegister);
 7285   %}
 7286   ins_pipe( pipe_slow );
 7287 %}
 7288 
 7289 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7290   predicate(n->as_ShiftV()->is_var_shift() &&
 7291             UseAVX > 2);
 7292   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7294   ins_encode %{
 7295     int opcode = this->ideal_Opcode();
 7296     int vlen_enc = vector_length_encoding(this);
 7297     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7298   %}
 7299   ins_pipe( pipe_slow );
 7300 %}
 7301 
 7302 // --------------------------------- AND --------------------------------------
 7303 
 7304 instruct vand(vec dst, vec src) %{
 7305   predicate(UseAVX == 0);
 7306   match(Set dst (AndV dst src));
 7307   format %{ "pand    $dst,$src\t! and vectors" %}
 7308   ins_encode %{
 7309     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7310   %}
 7311   ins_pipe( pipe_slow );
 7312 %}
 7313 
 7314 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7315   predicate(UseAVX > 0);
 7316   match(Set dst (AndV src1 src2));
 7317   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7318   ins_encode %{
 7319     int vlen_enc = vector_length_encoding(this);
 7320     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7321   %}
 7322   ins_pipe( pipe_slow );
 7323 %}
 7324 
 7325 instruct vand_mem(vec dst, vec src, memory mem) %{
 7326   predicate((UseAVX > 0) &&
 7327             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7328   match(Set dst (AndV src (LoadVector mem)));
 7329   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7330   ins_encode %{
 7331     int vlen_enc = vector_length_encoding(this);
 7332     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7333   %}
 7334   ins_pipe( pipe_slow );
 7335 %}
 7336 
 7337 // --------------------------------- OR ---------------------------------------
 7338 
 7339 instruct vor(vec dst, vec src) %{
 7340   predicate(UseAVX == 0);
 7341   match(Set dst (OrV dst src));
 7342   format %{ "por     $dst,$src\t! or vectors" %}
 7343   ins_encode %{
 7344     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7345   %}
 7346   ins_pipe( pipe_slow );
 7347 %}
 7348 
 7349 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7350   predicate(UseAVX > 0);
 7351   match(Set dst (OrV src1 src2));
 7352   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7353   ins_encode %{
 7354     int vlen_enc = vector_length_encoding(this);
 7355     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7356   %}
 7357   ins_pipe( pipe_slow );
 7358 %}
 7359 
 7360 instruct vor_mem(vec dst, vec src, memory mem) %{
 7361   predicate((UseAVX > 0) &&
 7362             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7363   match(Set dst (OrV src (LoadVector mem)));
 7364   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7365   ins_encode %{
 7366     int vlen_enc = vector_length_encoding(this);
 7367     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7368   %}
 7369   ins_pipe( pipe_slow );
 7370 %}
 7371 
 7372 // --------------------------------- XOR --------------------------------------
 7373 
 7374 instruct vxor(vec dst, vec src) %{
 7375   predicate(UseAVX == 0);
 7376   match(Set dst (XorV dst src));
 7377   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7378   ins_encode %{
 7379     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7380   %}
 7381   ins_pipe( pipe_slow );
 7382 %}
 7383 
 7384 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7385   predicate(UseAVX > 0);
 7386   match(Set dst (XorV src1 src2));
 7387   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7388   ins_encode %{
 7389     int vlen_enc = vector_length_encoding(this);
 7390     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7391   %}
 7392   ins_pipe( pipe_slow );
 7393 %}
 7394 
 7395 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7396   predicate((UseAVX > 0) &&
 7397             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7398   match(Set dst (XorV src (LoadVector mem)));
 7399   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7400   ins_encode %{
 7401     int vlen_enc = vector_length_encoding(this);
 7402     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7403   %}
 7404   ins_pipe( pipe_slow );
 7405 %}
 7406 
 7407 // --------------------------------- VectorCast --------------------------------------
 7408 
 7409 instruct vcastBtoX(vec dst, vec src) %{
 7410   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7411   match(Set dst (VectorCastB2X src));
 7412   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7413   ins_encode %{
 7414     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7415     int vlen_enc = vector_length_encoding(this);
 7416     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7417   %}
 7418   ins_pipe( pipe_slow );
 7419 %}
 7420 
 7421 instruct vcastBtoD(legVec dst, legVec src) %{
 7422   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7423   match(Set dst (VectorCastB2X src));
 7424   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7425   ins_encode %{
 7426     int vlen_enc = vector_length_encoding(this);
 7427     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7428   %}
 7429   ins_pipe( pipe_slow );
 7430 %}
 7431 
 7432 instruct castStoX(vec dst, vec src) %{
 7433   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7434             Matcher::vector_length(n->in(1)) <= 8 && // src
 7435             Matcher::vector_element_basic_type(n) == T_BYTE);
 7436   match(Set dst (VectorCastS2X src));
 7437   format %{ "vector_cast_s2x $dst,$src" %}
 7438   ins_encode %{
 7439     assert(UseAVX > 0, "required");
 7440 
 7441     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7442     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7443   %}
 7444   ins_pipe( pipe_slow );
 7445 %}
 7446 
 7447 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7448   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7449             Matcher::vector_length(n->in(1)) == 16 && // src
 7450             Matcher::vector_element_basic_type(n) == T_BYTE);
 7451   effect(TEMP dst, TEMP vtmp);
 7452   match(Set dst (VectorCastS2X src));
 7453   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7454   ins_encode %{
 7455     assert(UseAVX > 0, "required");
 7456 
 7457     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7458     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7459     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7460     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7461   %}
 7462   ins_pipe( pipe_slow );
 7463 %}
 7464 
 7465 instruct vcastStoX_evex(vec dst, vec src) %{
 7466   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7467             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7468   match(Set dst (VectorCastS2X src));
 7469   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7470   ins_encode %{
 7471     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7472     int src_vlen_enc = vector_length_encoding(this, $src);
 7473     int vlen_enc = vector_length_encoding(this);
 7474     switch (to_elem_bt) {
 7475       case T_BYTE:
 7476         if (!VM_Version::supports_avx512vl()) {
 7477           vlen_enc = Assembler::AVX_512bit;
 7478         }
 7479         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7480         break;
 7481       case T_INT:
 7482         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7483         break;
 7484       case T_FLOAT:
 7485         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7486         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7487         break;
 7488       case T_LONG:
 7489         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7490         break;
 7491       case T_DOUBLE: {
 7492         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7493         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7494         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7495         break;
 7496       }
 7497       default:
 7498         ShouldNotReachHere();
 7499     }
 7500   %}
 7501   ins_pipe( pipe_slow );
 7502 %}
 7503 
 7504 instruct castItoX(vec dst, vec src) %{
 7505   predicate(UseAVX <= 2 &&
 7506             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7507             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7508   match(Set dst (VectorCastI2X src));
 7509   format %{ "vector_cast_i2x $dst,$src" %}
 7510   ins_encode %{
 7511     assert(UseAVX > 0, "required");
 7512 
 7513     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7514     int vlen_enc = vector_length_encoding(this, $src);
 7515 
 7516     if (to_elem_bt == T_BYTE) {
 7517       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7518       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7519       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7520     } else {
 7521       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7522       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7523       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7524     }
 7525   %}
 7526   ins_pipe( pipe_slow );
 7527 %}
 7528 
 7529 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7530   predicate(UseAVX <= 2 &&
 7531             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7532             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7533   match(Set dst (VectorCastI2X src));
 7534   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7535   effect(TEMP dst, TEMP vtmp);
 7536   ins_encode %{
 7537     assert(UseAVX > 0, "required");
 7538 
 7539     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7540     int vlen_enc = vector_length_encoding(this, $src);
 7541 
 7542     if (to_elem_bt == T_BYTE) {
 7543       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7544       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7545       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7546       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7547     } else {
 7548       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7549       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7550       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7551       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7552     }
 7553   %}
 7554   ins_pipe( pipe_slow );
 7555 %}
 7556 
 7557 instruct vcastItoX_evex(vec dst, vec src) %{
 7558   predicate(UseAVX > 2 ||
 7559             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7560   match(Set dst (VectorCastI2X src));
 7561   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7562   ins_encode %{
 7563     assert(UseAVX > 0, "required");
 7564 
 7565     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7566     int src_vlen_enc = vector_length_encoding(this, $src);
 7567     int dst_vlen_enc = vector_length_encoding(this);
 7568     switch (dst_elem_bt) {
 7569       case T_BYTE:
 7570         if (!VM_Version::supports_avx512vl()) {
 7571           src_vlen_enc = Assembler::AVX_512bit;
 7572         }
 7573         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7574         break;
 7575       case T_SHORT:
 7576         if (!VM_Version::supports_avx512vl()) {
 7577           src_vlen_enc = Assembler::AVX_512bit;
 7578         }
 7579         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7580         break;
 7581       case T_FLOAT:
 7582         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7583         break;
 7584       case T_LONG:
 7585         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7586         break;
 7587       case T_DOUBLE:
 7588         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7589         break;
 7590       default:
 7591         ShouldNotReachHere();
 7592     }
 7593   %}
 7594   ins_pipe( pipe_slow );
 7595 %}
 7596 
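// Long -> byte/short narrowing without AVX-512: the low dword of every long is first
// gathered into the low lane(s) (vpshufd, or vpermilps + vpermpd for 256-bit sources),
// after which the usual int -> byte/short mask-and-pack sequence finishes the narrowing.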
 7597 instruct vcastLtoBS(vec dst, vec src) %{
 7598   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7599             UseAVX <= 2);
 7600   match(Set dst (VectorCastL2X src));
 7601   format %{ "vector_cast_l2x  $dst,$src" %}
 7602   ins_encode %{
 7603     assert(UseAVX > 0, "required");
 7604 
 7605     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7606     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7607     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7608                                                       : ExternalAddress(vector_int_to_short_mask());
 7609     if (vlen <= 16) {
 7610       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7611       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7612       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7613     } else {
 7614       assert(vlen <= 32, "required");
 7615       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7616       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7617       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7618       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7619     }
 7620     if (to_elem_bt == T_BYTE) {
 7621       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7622     }
 7623   %}
 7624   ins_pipe( pipe_slow );
 7625 %}
 7626 
 7627 instruct vcastLtoX_evex(vec dst, vec src) %{
 7628   predicate(UseAVX > 2 ||
 7629             (Matcher::vector_element_basic_type(n) == T_INT ||
 7630              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7631              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7632   match(Set dst (VectorCastL2X src));
 7633   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7634   ins_encode %{
 7635     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7636     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7637     int vlen_enc = vector_length_encoding(this, $src);
 7638     switch (to_elem_bt) {
 7639       case T_BYTE:
 7640         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7641           vlen_enc = Assembler::AVX_512bit;
 7642         }
 7643         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7644         break;
 7645       case T_SHORT:
 7646         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7647           vlen_enc = Assembler::AVX_512bit;
 7648         }
 7649         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7650         break;
 7651       case T_INT:
 7652         if (vlen == 8) {
 7653           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7654             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7655           }
 7656         } else if (vlen == 16) {
 7657           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7658         } else if (vlen == 32) {
 7659           if (UseAVX > 2) {
 7660             if (!VM_Version::supports_avx512vl()) {
 7661               vlen_enc = Assembler::AVX_512bit;
 7662             }
 7663             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7664           } else {
 7665             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7666             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7667           }
 7668         } else { // vlen == 64
 7669           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7670         }
 7671         break;
 7672       case T_FLOAT:
 7673         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7674         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7675         break;
 7676       case T_DOUBLE:
 7677         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7678         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7679         break;
 7680 
 7681       default: assert(false, "%s", type2name(to_elem_bt));
 7682     }
 7683   %}
 7684   ins_pipe( pipe_slow );
 7685 %}
 7686 
 7687 instruct vcastFtoD_reg(vec dst, vec src) %{
 7688   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7689   match(Set dst (VectorCastF2X src));
 7690   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7691   ins_encode %{
 7692     int vlen_enc = vector_length_encoding(this);
 7693     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7694   %}
 7695   ins_pipe( pipe_slow );
 7696 %}
 7697 
 7698 
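// Float -> integral casts: a plain truncating conversion does not match Java's scalar cast
// semantics for NaN and out-of-range inputs, so the helpers below use the sign-flip
// constant and the temporaries to fix up those lanes (NaN -> 0, saturation at the target
// type's min/max).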
 7699 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7700   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7701             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7702   match(Set dst (VectorCastF2X src));
 7703   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7704   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7705   ins_encode %{
 7706     int vlen_enc = vector_length_encoding(this, $src);
 7707     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7708     // JDK-8292878 removed the need for an explicit scratch register when loading addresses
 7709     // wider than 32 bits in register-indirect addressing mode, since stub constants are part
 7710     // of the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7711     // Targets are free to raise this limit, but a code cache larger than 2G is unreasonable
 7712     // in practical scenarios; on the flip side, with the given cap we save a temporary
 7713     // register allocation, which in the limiting case can prevent spilling in blocks with
 7714     // high register pressure.
 7715     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7716                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7717                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7718   %}
 7719   ins_pipe( pipe_slow );
 7720 %}
 7721 
 7722 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7723   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7724             is_integral_type(Matcher::vector_element_basic_type(n)));
 7725   match(Set dst (VectorCastF2X src));
 7726   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7727   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7728   ins_encode %{
 7729     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7730     if (to_elem_bt == T_LONG) {
 7731       int vlen_enc = vector_length_encoding(this);
 7732       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7733                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7734                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7735     } else {
 7736       int vlen_enc = vector_length_encoding(this, $src);
 7737       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7738                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7739                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7740     }
 7741   %}
 7742   ins_pipe( pipe_slow );
 7743 %}
 7744 
 7745 instruct vcastDtoF_reg(vec dst, vec src) %{
 7746   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7747   match(Set dst (VectorCastD2X src));
 7748   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7749   ins_encode %{
 7750     int vlen_enc = vector_length_encoding(this, $src);
 7751     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7752   %}
 7753   ins_pipe( pipe_slow );
 7754 %}
 7755 
 7756 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7757   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7758             is_integral_type(Matcher::vector_element_basic_type(n)));
 7759   match(Set dst (VectorCastD2X src));
 7760   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7761   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7762   ins_encode %{
 7763     int vlen_enc = vector_length_encoding(this, $src);
 7764     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7765     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7766                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7767                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7768   %}
 7769   ins_pipe( pipe_slow );
 7770 %}
 7771 
 7772 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7773   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7774             is_integral_type(Matcher::vector_element_basic_type(n)));
 7775   match(Set dst (VectorCastD2X src));
 7776   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7777   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7778   ins_encode %{
 7779     int vlen_enc = vector_length_encoding(this, $src);
 7780     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7781     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7782                               ExternalAddress(vector_float_signflip());
 7783     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7784                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7785   %}
 7786   ins_pipe( pipe_slow );
 7787 %}
 7788 
 7789 instruct vucast(vec dst, vec src) %{
 7790   match(Set dst (VectorUCastB2X src));
 7791   match(Set dst (VectorUCastS2X src));
 7792   match(Set dst (VectorUCastI2X src));
 7793   format %{ "vector_ucast $dst,$src\t!" %}
 7794   ins_encode %{
 7795     assert(UseAVX > 0, "required");
 7796 
 7797     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7798     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7799     int vlen_enc = vector_length_encoding(this);
 7800     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7801   %}
 7802   ins_pipe( pipe_slow );
 7803 %}
 7804 
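// RoundVF/RoundVD (Math.round semantics): the helpers below load a custom MXCSR value from
// the constant table, with the rounding-control field set to round-down (toward negative
// infinity), for the duration of the conversion; $tmp and the flags register are clobbered
// in the process.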
 7805 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7806   predicate(!VM_Version::supports_avx512vl() &&
 7807             Matcher::vector_length_in_bytes(n) < 64 &&
 7808             Matcher::vector_element_basic_type(n) == T_INT);
 7809   match(Set dst (RoundVF src));
 7810   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7811   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7812   ins_encode %{
 7813     int vlen_enc = vector_length_encoding(this);
 7814     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7815     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7816                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7817                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7818   %}
 7819   ins_pipe( pipe_slow );
 7820 %}
 7821 
 7822 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7823   predicate((VM_Version::supports_avx512vl() ||
 7824              Matcher::vector_length_in_bytes(n) == 64) &&
 7825              Matcher::vector_element_basic_type(n) == T_INT);
 7826   match(Set dst (RoundVF src));
 7827   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7828   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7829   ins_encode %{
 7830     int vlen_enc = vector_length_encoding(this);
 7831     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7832     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7833                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7834                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7835   %}
 7836   ins_pipe( pipe_slow );
 7837 %}
 7838 
 7839 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7840   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7841   match(Set dst (RoundVD src));
 7842   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7843   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7844   ins_encode %{
 7845     int vlen_enc = vector_length_encoding(this);
 7846     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7847     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7848                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7849                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7850   %}
 7851   ins_pipe( pipe_slow );
 7852 %}
 7853 
 7854 // --------------------------------- VectorMaskCmp --------------------------------------
 7855 
 7856 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7857   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7858             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7859             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7860             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7861   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7862   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7863   ins_encode %{
 7864     int vlen_enc = vector_length_encoding(this, $src1);
 7865     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7866     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7867       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7868     } else {
 7869       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7870     }
 7871   %}
 7872   ins_pipe( pipe_slow );
 7873 %}
 7874 
 7875 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7876   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7877             n->bottom_type()->isa_vectmask() == nullptr &&
 7878             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7879   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7880   effect(TEMP ktmp);
 7881   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7882   ins_encode %{
 7883     int vlen_enc = Assembler::AVX_512bit;
 7884     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7885     KRegister mask = k0; // The comparison itself is not being masked.
 7886     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7887       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7888       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7889     } else {
 7890       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7891       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7892     }
 7893   %}
 7894   ins_pipe( pipe_slow );
 7895 %}
 7896 
 7897 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7898   predicate(n->bottom_type()->isa_vectmask() &&
 7899             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7900   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7901   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7902   ins_encode %{
 7903     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7904     int vlen_enc = vector_length_encoding(this, $src1);
 7905     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7906     KRegister mask = k0; // The comparison itself is not being masked.
 7907     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7908       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7909     } else {
 7910       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7911     }
 7912   %}
 7913   ins_pipe( pipe_slow );
 7914 %}
 7915 
 7916 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7917   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7918             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7919             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7920             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7921             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7922             (n->in(2)->get_int() == BoolTest::eq ||
 7923              n->in(2)->get_int() == BoolTest::lt ||
 7924              n->in(2)->get_int() == BoolTest::gt)); // cond
 7925   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7926   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7927   ins_encode %{
 7928     int vlen_enc = vector_length_encoding(this, $src1);
 7929     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7930     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7931     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7932   %}
 7933   ins_pipe( pipe_slow );
 7934 %}
 7935 
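// BoolTest::ne/le/ge have no direct SSE/AVX integer compare encoding; they are derived from
// the complementary predicate (eq/gt/lt), with $xtmp handed to the macro assembler so the
// compare result can be inverted.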
 7936 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7937   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7938             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7939             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7940             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7941             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7942             (n->in(2)->get_int() == BoolTest::ne ||
 7943              n->in(2)->get_int() == BoolTest::le ||
 7944              n->in(2)->get_int() == BoolTest::ge)); // cond
 7945   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7946   effect(TEMP dst, TEMP xtmp);
 7947   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7948   ins_encode %{
 7949     int vlen_enc = vector_length_encoding(this, $src1);
 7950     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7951     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7952     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7953   %}
 7954   ins_pipe( pipe_slow );
 7955 %}
 7956 
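// Unsigned integer compares below AVX-512: flip the sign bit of both operands (XOR with a
// broadcast of the element type's high bit) and then issue the ordinary signed compare.
// Roughly, per element: cmp_unsigned(a, b) == cmp_signed(a ^ MIN_VALUE, b ^ MIN_VALUE).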
 7957 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7958   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7959             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7960             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7961             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7962             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7963   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7964   effect(TEMP dst, TEMP xtmp);
 7965   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7966   ins_encode %{
 7967     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7968     int vlen_enc = vector_length_encoding(this, $src1);
 7969     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7970     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7971 
 7972     if (vlen_enc == Assembler::AVX_128bit) {
 7973       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7974     } else {
 7975       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7976     }
 7977     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7978     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7979     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7980   %}
 7981   ins_pipe( pipe_slow );
 7982 %}
 7983 
 7984 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7985   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7986              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7987              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7988   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7989   effect(TEMP ktmp);
 7990   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7991   ins_encode %{
 7992     assert(UseAVX > 2, "required");
 7993 
 7994     int vlen_enc = vector_length_encoding(this, $src1);
 7995     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7996     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7997     KRegister mask = k0; // The comparison itself is not being masked.
 7998     bool merge = false;
 7999     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8000 
 8001     switch (src1_elem_bt) {
 8002       case T_INT: {
 8003         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8004         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8005         break;
 8006       }
 8007       case T_LONG: {
 8008         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8009         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8010         break;
 8011       }
 8012       default: assert(false, "%s", type2name(src1_elem_bt));
 8013     }
 8014   %}
 8015   ins_pipe( pipe_slow );
 8016 %}
 8017 
 8018 
 8019 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8020   predicate(n->bottom_type()->isa_vectmask() &&
 8021             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8022   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8023   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 8024   ins_encode %{
 8025     assert(UseAVX > 2, "required");
 8026     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8027 
 8028     int vlen_enc = vector_length_encoding(this, $src1);
 8029     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8030     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8031     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8032 
 8033     // Compare the two source vectors element-wise; the result is an opmask in $dst.
 8034     switch (src1_elem_bt) {
 8035       case T_BYTE: {
 8036         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8037         break;
 8038       }
 8039       case T_SHORT: {
 8040         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8041         break;
 8042       }
 8043       case T_INT: {
 8044         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8045         break;
 8046       }
 8047       case T_LONG: {
 8048         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8049         break;
 8050       }
 8051       default: assert(false, "%s", type2name(src1_elem_bt));
 8052     }
 8053   %}
 8054   ins_pipe( pipe_slow );
 8055 %}
 8056 
 8057 // Extract
 8058 
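// Element extraction: for sources of up to 128 bits the element is read directly from the
// source register (get_elem). For 256/512-bit sources the 128-bit lane containing the
// element is first isolated into $vtmp (get_lane) and the element is then read from there.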
 8059 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8060   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8061   match(Set dst (ExtractI src idx));
 8062   match(Set dst (ExtractS src idx));
 8063   match(Set dst (ExtractB src idx));
 8064   format %{ "extractI $dst,$src,$idx\t!" %}
 8065   ins_encode %{
 8066     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8067 
 8068     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8069     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8070   %}
 8071   ins_pipe( pipe_slow );
 8072 %}
 8073 
 8074 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8075   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8076             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8077   match(Set dst (ExtractI src idx));
 8078   match(Set dst (ExtractS src idx));
 8079   match(Set dst (ExtractB src idx));
 8080   effect(TEMP vtmp);
 8081   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8082   ins_encode %{
 8083     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8084 
 8085     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8086     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8087     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8088   %}
 8089   ins_pipe( pipe_slow );
 8090 %}
 8091 
 8092 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8093   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8094   match(Set dst (ExtractL src idx));
 8095   format %{ "extractL $dst,$src,$idx\t!" %}
 8096   ins_encode %{
 8097     assert(UseSSE >= 4, "required");
 8098     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8099 
 8100     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8101   %}
 8102   ins_pipe( pipe_slow );
 8103 %}
 8104 
 8105 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8106   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8107             Matcher::vector_length(n->in(1)) == 8);  // src
 8108   match(Set dst (ExtractL src idx));
 8109   effect(TEMP vtmp);
 8110   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8111   ins_encode %{
 8112     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8113 
 8114     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8115     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8116   %}
 8117   ins_pipe( pipe_slow );
 8118 %}
 8119 
 8120 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8121   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8122   match(Set dst (ExtractF src idx));
 8123   effect(TEMP dst, TEMP vtmp);
 8124   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8125   ins_encode %{
 8126     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8127 
 8128     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8129   %}
 8130   ins_pipe( pipe_slow );
 8131 %}
 8132 
 8133 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8134   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8135             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8136   match(Set dst (ExtractF src idx));
 8137   effect(TEMP vtmp);
 8138   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8139   ins_encode %{
 8140     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8141 
 8142     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8143     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8144   %}
 8145   ins_pipe( pipe_slow );
 8146 %}
 8147 
 8148 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8149   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8150   match(Set dst (ExtractD src idx));
 8151   format %{ "extractD $dst,$src,$idx\t!" %}
 8152   ins_encode %{
 8153     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8154 
 8155     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8156   %}
 8157   ins_pipe( pipe_slow );
 8158 %}
 8159 
 8160 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8161   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8162             Matcher::vector_length(n->in(1)) == 8);  // src
 8163   match(Set dst (ExtractD src idx));
 8164   effect(TEMP vtmp);
 8165   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8166   ins_encode %{
 8167     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8168 
 8169     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8170     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8171   %}
 8172   ins_pipe( pipe_slow );
 8173 %}
 8174 
 8175 // --------------------------------- Vector Blend --------------------------------------
 8176 
 8177 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8178   predicate(UseAVX == 0);
 8179   match(Set dst (VectorBlend (Binary dst src) mask));
 8180   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8181   effect(TEMP tmp);
 8182   ins_encode %{
 8183     assert(UseSSE >= 4, "required");
 8184 
 8185     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8186       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8187     }
 8188     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8189   %}
 8190   ins_pipe( pipe_slow );
 8191 %}
 8192 
 8193 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8194   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8195             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8196             Matcher::vector_length_in_bytes(n) <= 32 &&
 8197             is_integral_type(Matcher::vector_element_basic_type(n)));
 8198   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8199   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8200   ins_encode %{
 8201     int vlen_enc = vector_length_encoding(this);
 8202     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8203   %}
 8204   ins_pipe( pipe_slow );
 8205 %}
 8206 
 8207 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8208   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8209             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8210             Matcher::vector_length_in_bytes(n) <= 32 &&
 8211             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8212   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8213   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8214   ins_encode %{
 8215     int vlen_enc = vector_length_encoding(this);
 8216     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8217   %}
 8218   ins_pipe( pipe_slow );
 8219 %}
 8220 
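// When EnableX86ECoreOpts is set, the variable blend instructions are avoided and the blend
// is composed from cheaper bitwise ops instead; in effect:
//   dst = (src1 & ~mask) | (src2 & mask)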
 8221 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8222   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8223             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8224             Matcher::vector_length_in_bytes(n) <= 32);
 8225   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8226   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8227   effect(TEMP vtmp, TEMP dst);
 8228   ins_encode %{
 8229     int vlen_enc = vector_length_encoding(this);
 8230     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8231     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8232     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8233   %}
 8234   ins_pipe( pipe_slow );
 8235 %}
 8236 
 8237 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8238   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8239             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8240   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8241   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8242   effect(TEMP ktmp);
 8243   ins_encode %{
 8244     int vlen_enc = Assembler::AVX_512bit;
 8245     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8246     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8247     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8248   %}
 8249   ins_pipe( pipe_slow );
 8250 %}
 8251 
 8252 
 8253 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8254   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8255             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8256              VM_Version::supports_avx512bw()));
 8257   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8258   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8259   ins_encode %{
 8260     int vlen_enc = vector_length_encoding(this);
 8261     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8262     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8263   %}
 8264   ins_pipe( pipe_slow );
 8265 %}
 8266 
 8267 // --------------------------------- ABS --------------------------------------
 8268 // a = |a|
 8269 instruct vabsB_reg(vec dst, vec src) %{
 8270   match(Set dst (AbsVB  src));
 8271   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8272   ins_encode %{
 8273     uint vlen = Matcher::vector_length(this);
 8274     if (vlen <= 16) {
 8275       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8276     } else {
 8277       int vlen_enc = vector_length_encoding(this);
 8278       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8279     }
 8280   %}
 8281   ins_pipe( pipe_slow );
 8282 %}
 8283 
 8284 instruct vabsS_reg(vec dst, vec src) %{
 8285   match(Set dst (AbsVS  src));
 8286   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8287   ins_encode %{
 8288     uint vlen = Matcher::vector_length(this);
 8289     if (vlen <= 8) {
 8290       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8291     } else {
 8292       int vlen_enc = vector_length_encoding(this);
 8293       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8294     }
 8295   %}
 8296   ins_pipe( pipe_slow );
 8297 %}
 8298 
 8299 instruct vabsI_reg(vec dst, vec src) %{
 8300   match(Set dst (AbsVI  src));
 8301   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8302   ins_encode %{
 8303     uint vlen = Matcher::vector_length(this);
 8304     if (vlen <= 4) {
 8305       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8306     } else {
 8307       int vlen_enc = vector_length_encoding(this);
 8308       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8309     }
 8310   %}
 8311   ins_pipe( pipe_slow );
 8312 %}
 8313 
 8314 instruct vabsL_reg(vec dst, vec src) %{
 8315   match(Set dst (AbsVL  src));
 8316   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8317   ins_encode %{
 8318     assert(UseAVX > 2, "required");
 8319     int vlen_enc = vector_length_encoding(this);
 8320     if (!VM_Version::supports_avx512vl()) {
 8321       vlen_enc = Assembler::AVX_512bit;
 8322     }
 8323     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8324   %}
 8325   ins_pipe( pipe_slow );
 8326 %}
 8327 
 8328 // --------------------------------- ABSNEG --------------------------------------
 8329 
 8330 instruct vabsnegF(vec dst, vec src) %{
 8331   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8332   match(Set dst (AbsVF src));
 8333   match(Set dst (NegVF src));
 8334   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8335   ins_cost(150);
 8336   ins_encode %{
 8337     int opcode = this->ideal_Opcode();
 8338     int vlen = Matcher::vector_length(this);
 8339     if (vlen == 2) {
 8340       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8341     } else {
 8342       assert(vlen == 8 || vlen == 16, "required");
 8343       int vlen_enc = vector_length_encoding(this);
 8344       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8345     }
 8346   %}
 8347   ins_pipe( pipe_slow );
 8348 %}
 8349 
 8350 instruct vabsneg4F(vec dst) %{
 8351   predicate(Matcher::vector_length(n) == 4);
 8352   match(Set dst (AbsVF dst));
 8353   match(Set dst (NegVF dst));
 8354   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8355   ins_cost(150);
 8356   ins_encode %{
 8357     int opcode = this->ideal_Opcode();
 8358     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8359   %}
 8360   ins_pipe( pipe_slow );
 8361 %}
 8362 
 8363 instruct vabsnegD(vec dst, vec src) %{
 8364   match(Set dst (AbsVD  src));
 8365   match(Set dst (NegVD  src));
 8366   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8367   ins_encode %{
 8368     int opcode = this->ideal_Opcode();
 8369     uint vlen = Matcher::vector_length(this);
 8370     if (vlen == 2) {
 8371       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8372     } else {
 8373       int vlen_enc = vector_length_encoding(this);
 8374       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8375     }
 8376   %}
 8377   ins_pipe( pipe_slow );
 8378 %}
 8379 
 8380 //------------------------------------- VectorTest --------------------------------------------
 8381 
 8382 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8383   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8384   match(Set cr (VectorTest src1 src2));
 8385   effect(TEMP vtmp);
 8386   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8387   ins_encode %{
 8388     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8389     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8390     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8391   %}
 8392   ins_pipe( pipe_slow );
 8393 %}
 8394 
 8395 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8396   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8397   match(Set cr (VectorTest src1 src2));
 8398   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8399   ins_encode %{
 8400     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8401     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8402     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8403   %}
 8404   ins_pipe( pipe_slow );
 8405 %}
 8406 
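// Mask-register tests for short masks: the k register is moved to a GPR and masked down to
// the live lanes; "alltrue" additionally compares against the all-ones pattern, while
// "anytrue" only needs the flags produced by the AND.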
 8407 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8408   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8409              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8410             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8411   match(Set cr (VectorTest src1 src2));
 8412   effect(TEMP tmp);
 8413   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8414   ins_encode %{
 8415     uint masklen = Matcher::vector_length(this, $src1);
 8416     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8417     __ andl($tmp$$Register, (1 << masklen) - 1);
 8418     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8423 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8424   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8425              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8426             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8427   match(Set cr (VectorTest src1 src2));
 8428   effect(TEMP tmp);
 8429   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8430   ins_encode %{
 8431     uint masklen = Matcher::vector_length(this, $src1);
 8432     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8433     __ andl($tmp$$Register, (1 << masklen) - 1);
 8434   %}
 8435   ins_pipe( pipe_slow );
 8436 %}
 8437 
 8438 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8439   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8440             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8441   match(Set cr (VectorTest src1 src2));
 8442   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8443   ins_encode %{
 8444     uint masklen = Matcher::vector_length(this, $src1);
 8445     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8446   %}
 8447   ins_pipe( pipe_slow );
 8448 %}
 8449 
 8450 //------------------------------------- LoadMask --------------------------------------------
 8451 
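// VectorLoadMask widens a vector of boolean bytes (0/1) into a mask representation: either
// a vector whose lanes are 0 or all-ones, or, for the vectmask type on AVX-512, an opmask
// (k) register.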
 8452 instruct loadMask(legVec dst, legVec src) %{
 8453   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8454   match(Set dst (VectorLoadMask src));
 8455   effect(TEMP dst);
 8456   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8457   ins_encode %{
 8458     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8459     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8460     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8461   %}
 8462   ins_pipe( pipe_slow );
 8463 %}
 8464 
 8465 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8466   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8467   match(Set dst (VectorLoadMask src));
 8468   effect(TEMP xtmp);
 8469   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8470   ins_encode %{
 8471     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8472                         true, Assembler::AVX_512bit);
 8473   %}
 8474   ins_pipe( pipe_slow );
 8475 %}
 8476 
 8477 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8478   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8479   match(Set dst (VectorLoadMask src));
 8480   effect(TEMP xtmp);
 8481   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8482   ins_encode %{
 8483     int vlen_enc = vector_length_encoding(in(1));
 8484     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8485                         false, vlen_enc);
 8486   %}
 8487   ins_pipe( pipe_slow );
 8488 %}
 8489 
 8490 //------------------------------------- StoreMask --------------------------------------------
 8491 
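// VectorStoreMask is the inverse mapping: each mask lane (0 or all-ones) is narrowed to a
// single byte holding 0 or 1, which is why the sequences below finish with pabsb/vpabsb
// (|-1| == 1).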
 8492 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8493   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8494   match(Set dst (VectorStoreMask src size));
 8495   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8496   ins_encode %{
 8497     int vlen = Matcher::vector_length(this);
 8498     if (vlen <= 16 && UseAVX <= 2) {
 8499       assert(UseSSE >= 3, "required");
 8500       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8501     } else {
 8502       assert(UseAVX > 0, "required");
 8503       int src_vlen_enc = vector_length_encoding(this, $src);
 8504       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8505     }
 8506   %}
 8507   ins_pipe( pipe_slow );
 8508 %}
 8509 
 8510 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8511   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8512   match(Set dst (VectorStoreMask src size));
 8513   effect(TEMP_DEF dst, TEMP xtmp);
 8514   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8515   ins_encode %{
 8516     int vlen_enc = Assembler::AVX_128bit;
 8517     int vlen = Matcher::vector_length(this);
 8518     if (vlen <= 8) {
 8519       assert(UseSSE >= 3, "required");
 8520       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8521       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8522       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8523     } else {
 8524       assert(UseAVX > 0, "required");
 8525       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8526       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8527       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8528     }
 8529   %}
 8530   ins_pipe( pipe_slow );
 8531 %}
 8532 
 8533 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8534   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8535   match(Set dst (VectorStoreMask src size));
 8536   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8537   effect(TEMP_DEF dst, TEMP xtmp);
 8538   ins_encode %{
 8539     int vlen_enc = Assembler::AVX_128bit;
 8540     int vlen = Matcher::vector_length(this);
 8541     if (vlen <= 4) {
 8542       assert(UseSSE >= 3, "required");
 8543       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8544       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8545       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8546       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8547     } else {
 8548       assert(UseAVX > 0, "required");
 8549       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8550       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8551       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8552       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8553       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8554     }
 8555   %}
 8556   ins_pipe( pipe_slow );
 8557 %}
 8558 
 8559 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8560   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8561   match(Set dst (VectorStoreMask src size));
 8562   effect(TEMP_DEF dst, TEMP xtmp);
 8563   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8564   ins_encode %{
 8565     assert(UseSSE >= 3, "required");
 8566     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8567     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8568     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8569     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8570     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8571   %}
 8572   ins_pipe( pipe_slow );
 8573 %}
 8574 
 8575 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8576   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8577   match(Set dst (VectorStoreMask src size));
 8578   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8579   effect(TEMP_DEF dst, TEMP vtmp);
 8580   ins_encode %{
 8581     int vlen_enc = Assembler::AVX_128bit;
 8582     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8583     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8584     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8585     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8586     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8587     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8588     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8589   %}
 8590   ins_pipe( pipe_slow );
 8591 %}
 8592 
 8593 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8594   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8595   match(Set dst (VectorStoreMask src size));
 8596   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8597   ins_encode %{
 8598     int src_vlen_enc = vector_length_encoding(this, $src);
 8599     int dst_vlen_enc = vector_length_encoding(this);
 8600     if (!VM_Version::supports_avx512vl()) {
 8601       src_vlen_enc = Assembler::AVX_512bit;
 8602     }
 8603     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8604     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8605   %}
 8606   ins_pipe( pipe_slow );
 8607 %}
 8608 
 8609 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8610   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8611   match(Set dst (VectorStoreMask src size));
 8612   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8613   ins_encode %{
 8614     int src_vlen_enc = vector_length_encoding(this, $src);
 8615     int dst_vlen_enc = vector_length_encoding(this);
 8616     if (!VM_Version::supports_avx512vl()) {
 8617       src_vlen_enc = Assembler::AVX_512bit;
 8618     }
 8619     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8620     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8621   %}
 8622   ins_pipe( pipe_slow );
 8623 %}
 8624 
 8625 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8626   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8627   match(Set dst (VectorStoreMask mask size));
 8628   effect(TEMP_DEF dst);
 8629   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8630   ins_encode %{
 8631     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8632     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8633                  false, Assembler::AVX_512bit, noreg);
 8634     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8635   %}
 8636   ins_pipe( pipe_slow );
 8637 %}
 8638 
 8639 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8640   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8641   match(Set dst (VectorStoreMask mask size));
 8642   effect(TEMP_DEF dst);
 8643   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8644   ins_encode %{
 8645     int dst_vlen_enc = vector_length_encoding(this);
 8646     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8647     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8648   %}
 8649   ins_pipe( pipe_slow );
 8650 %}
 8651 
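      // VectorMaskCast between masks of the same size in bytes is a register-reuse no-op;
      // only size-changing casts (vmaskcast_avx below) emit a real conversion.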
 8652 instruct vmaskcast_evex(kReg dst) %{
 8653   match(Set dst (VectorMaskCast dst));
 8654   ins_cost(0);
 8655   format %{ "vector_mask_cast $dst" %}
 8656   ins_encode %{
 8657     // empty
 8658   %}
 8659   ins_pipe(empty);
 8660 %}
 8661 
 8662 instruct vmaskcast(vec dst) %{
 8663   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8664   match(Set dst (VectorMaskCast dst));
 8665   ins_cost(0);
 8666   format %{ "vector_mask_cast $dst" %}
 8667   ins_encode %{
 8668     // empty
 8669   %}
 8670   ins_pipe(empty);
 8671 %}
 8672 
 8673 instruct vmaskcast_avx(vec dst, vec src) %{
 8674   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8675   match(Set dst (VectorMaskCast src));
 8676   format %{ "vector_mask_cast $dst, $src" %}
 8677   ins_encode %{
 8678     int vlen = Matcher::vector_length(this);
 8679     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8680     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8681     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8682   %}
 8683   ins_pipe(pipe_slow);
 8684 %}
 8685 
 8686 //-------------------------------- Load Iota Indices ----------------------------------
 8687 
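      // VectorLoadConst with a zero source loads the per-type iota table (0, 1, 2, ...)
      // from constant memory.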
 8688 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8689   match(Set dst (VectorLoadConst src));
 8690   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8691   ins_encode %{
 8692      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8693      BasicType bt = Matcher::vector_element_basic_type(this);
 8694      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8695   %}
 8696   ins_pipe( pipe_slow );
 8697 %}
 8698 
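      // PopulateIndex broadcasts the scalar start value and adds the iota constant,
      // yielding [start, start+1, start+2, ...]; the stride $src2 must be 1.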
 8699 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8700   match(Set dst (PopulateIndex src1 src2));
 8701   effect(TEMP dst, TEMP vtmp);
 8702   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8703   ins_encode %{
 8704      assert($src2$$constant == 1, "required");
 8705      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8706      int vlen_enc = vector_length_encoding(this);
 8707      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8708      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8709      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8710      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8711   %}
 8712   ins_pipe( pipe_slow );
 8713 %}
 8714 
 8715 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8716   match(Set dst (PopulateIndex src1 src2));
 8717   effect(TEMP dst, TEMP vtmp);
 8718   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8719   ins_encode %{
 8720      assert($src2$$constant == 1, "required");
 8721      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8722      int vlen_enc = vector_length_encoding(this);
 8723      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8724      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8725      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8726      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8727   %}
 8728   ins_pipe( pipe_slow );
 8729 %}
 8730 
 8731 //-------------------------------- Rearrange ----------------------------------
 8732 
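      // VectorRearrange permutes the elements of src according to the indices held in the
      // shuffle vector.  The LoadShuffle rules expand narrow shuffle indices into the
      // byte-index form expected by pshufb/vpshufb on targets that lack wider permutes.
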
 8733 // LoadShuffle/Rearrange for Byte
 8734 instruct rearrangeB(vec dst, vec shuffle) %{
 8735   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8736             Matcher::vector_length(n) < 32);
 8737   match(Set dst (VectorRearrange dst shuffle));
 8738   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8739   ins_encode %{
 8740     assert(UseSSE >= 4, "required");
 8741     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8742   %}
 8743   ins_pipe( pipe_slow );
 8744 %}
 8745 
 8746 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8747   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8748             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8749   match(Set dst (VectorRearrange src shuffle));
 8750   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8751   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8752   ins_encode %{
 8753     assert(UseAVX >= 2, "required");
 8754     // Swap src into vtmp1
 8755     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8756     // Shuffle swapped src to get entries from the other 128 bit lane
 8757     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8758     // Shuffle original src to get entries from its own 128 bit lane
 8759     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8760     // Create a blend mask by setting high bits for entries coming from the other lane in the shuffle
 8761     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8762     // Perform the blend
 8763     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8764   %}
 8765   ins_pipe( pipe_slow );
 8766 %}
 8767 
 8768 
 8769 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8770   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8771             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8772   match(Set dst (VectorRearrange src shuffle));
 8773   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8774   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8775   ins_encode %{
 8776     int vlen_enc = vector_length_encoding(this);
 8777     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8778                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8779                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8780   %}
 8781   ins_pipe( pipe_slow );
 8782 %}
 8783 
 8784 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8785   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8786             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8787   match(Set dst (VectorRearrange src shuffle));
 8788   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8789   ins_encode %{
 8790     int vlen_enc = vector_length_encoding(this);
 8791     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 // LoadShuffle/Rearrange for Short
 8797 
 8798 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8799   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8800             !VM_Version::supports_avx512bw());
 8801   match(Set dst (VectorLoadShuffle src));
 8802   effect(TEMP dst, TEMP vtmp);
 8803   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8804   ins_encode %{
 8805     // Create a byte shuffle mask from the short shuffle mask;
 8806     // only a byte shuffle instruction is available on these platforms.
 8807     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8808     if (UseAVX == 0) {
 8809       assert(vlen_in_bytes <= 16, "required");
 8810       // Multiply each shuffle by two to get byte index
 8811       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8812       __ psllw($vtmp$$XMMRegister, 1);
 8813 
 8814       // Duplicate to create 2 copies of byte index
 8815       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8816       __ psllw($dst$$XMMRegister, 8);
 8817       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8818 
 8819       // Add one to get alternate byte index
 8820       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8821       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8822     } else {
 8823       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8824       int vlen_enc = vector_length_encoding(this);
 8825       // Multiply each shuffle by two to get byte index
 8826       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8827 
 8828       // Duplicate to create 2 copies of byte index
 8829       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8830       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8831 
 8832       // Add one to get alternate byte index
 8833       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8834     }
 8835   %}
 8836   ins_pipe( pipe_slow );
 8837 %}
 8838 
 8839 instruct rearrangeS(vec dst, vec shuffle) %{
 8840   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8841             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8842   match(Set dst (VectorRearrange dst shuffle));
 8843   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8844   ins_encode %{
 8845     assert(UseSSE >= 4, "required");
 8846     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8847   %}
 8848   ins_pipe( pipe_slow );
 8849 %}
 8850 
 8851 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8852   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8853             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8854   match(Set dst (VectorRearrange src shuffle));
 8855   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8856   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8857   ins_encode %{
 8858     assert(UseAVX >= 2, "required");
 8859     // Swap src into vtmp1
 8860     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8861     // Shuffle swapped src to get entries from the other 128 bit lane
 8862     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8863     // Shuffle original src to get entries from its own 128 bit lane
 8864     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8865     // Create a blend mask by setting high bits for entries coming from the other lane in the shuffle
 8866     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8867     // Perform the blend
 8868     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8869   %}
 8870   ins_pipe( pipe_slow );
 8871 %}
 8872 
 8873 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8874   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8875             VM_Version::supports_avx512bw());
 8876   match(Set dst (VectorRearrange src shuffle));
 8877   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8878   ins_encode %{
 8879     int vlen_enc = vector_length_encoding(this);
 8880     if (!VM_Version::supports_avx512vl()) {
 8881       vlen_enc = Assembler::AVX_512bit;
 8882     }
 8883     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8884   %}
 8885   ins_pipe( pipe_slow );
 8886 %}
 8887 
 8888 // LoadShuffle/Rearrange for Integer and Float
 8889 
 8890 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8891   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8892             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8893   match(Set dst (VectorLoadShuffle src));
 8894   effect(TEMP dst, TEMP vtmp);
 8895   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8896   ins_encode %{
 8897     assert(UseSSE >= 4, "required");
 8898 
 8899     // Create a byte shuffle mask from the int shuffle mask;
 8900     // only a byte shuffle instruction is available on these platforms.
 8901 
 8902     // Duplicate and multiply each shuffle by 4
 8903     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8904     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8905     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8906     __ psllw($vtmp$$XMMRegister, 2);
 8907 
 8908     // Duplicate again to create 4 copies of byte index
 8909     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8910     __ psllw($dst$$XMMRegister, 8);
 8911     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8912 
 8913     // Add 3,2,1,0 to get alternate byte index
 8914     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8915     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8916   %}
 8917   ins_pipe( pipe_slow );
 8918 %}
 8919 
 8920 instruct rearrangeI(vec dst, vec shuffle) %{
 8921   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8922             UseAVX == 0);
 8923   match(Set dst (VectorRearrange dst shuffle));
 8924   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8925   ins_encode %{
 8926     assert(UseSSE >= 4, "required");
 8927     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8928   %}
 8929   ins_pipe( pipe_slow );
 8930 %}
 8931 
 8932 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8933   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8934             UseAVX > 0);
 8935   match(Set dst (VectorRearrange src shuffle));
 8936   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8937   ins_encode %{
 8938     int vlen_enc = vector_length_encoding(this);
 8939     BasicType bt = Matcher::vector_element_basic_type(this);
 8940     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8941   %}
 8942   ins_pipe( pipe_slow );
 8943 %}
 8944 
 8945 // LoadShuffle/Rearrange for Long and Double
 8946 
 8947 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8948   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8949             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8950   match(Set dst (VectorLoadShuffle src));
 8951   effect(TEMP dst, TEMP vtmp);
 8952   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8953   ins_encode %{
 8954     assert(UseAVX >= 2, "required");
 8955 
 8956     int vlen_enc = vector_length_encoding(this);
 8957     // Create a double word shuffle mask from the long shuffle mask;
 8958     // only a double word shuffle instruction is available on these platforms.
 8959 
 8960     // Multiply each shuffle by two to get double word index
 8961     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8962 
 8963     // Duplicate each double word shuffle
 8964     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8965     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8966 
 8967     // Add one to get alternate double word index
 8968     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8969   %}
 8970   ins_pipe( pipe_slow );
 8971 %}
 8972 
 8973 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8974   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8975             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8976   match(Set dst (VectorRearrange src shuffle));
 8977   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8978   ins_encode %{
 8979     assert(UseAVX >= 2, "required");
 8980 
 8981     int vlen_enc = vector_length_encoding(this);
 8982     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8983   %}
 8984   ins_pipe( pipe_slow );
 8985 %}
 8986 
 8987 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8988   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8989             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8990   match(Set dst (VectorRearrange src shuffle));
 8991   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8992   ins_encode %{
 8993     assert(UseAVX > 2, "required");
 8994 
 8995     int vlen_enc = vector_length_encoding(this);
 8996     if (vlen_enc == Assembler::AVX_128bit) {
 8997       vlen_enc = Assembler::AVX_256bit;
 8998     }
 8999     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9000   %}
 9001   ins_pipe( pipe_slow );
 9002 %}
 9003 
 9004 // --------------------------------- FMA --------------------------------------
 9005 // a * b + c
 9006 
 9007 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9008   match(Set c (FmaVF  c (Binary a b)));
 9009   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9010   ins_cost(150);
 9011   ins_encode %{
 9012     assert(UseFMA, "not enabled");
 9013     int vlen_enc = vector_length_encoding(this);
 9014     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9015   %}
 9016   ins_pipe( pipe_slow );
 9017 %}
 9018 
 9019 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9020   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9021   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9022   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9023   ins_cost(150);
 9024   ins_encode %{
 9025     assert(UseFMA, "not enabled");
 9026     int vlen_enc = vector_length_encoding(this);
 9027     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9028   %}
 9029   ins_pipe( pipe_slow );
 9030 %}
 9031 
 9032 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9033   match(Set c (FmaVD  c (Binary a b)));
 9034   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9035   ins_cost(150);
 9036   ins_encode %{
 9037     assert(UseFMA, "not enabled");
 9038     int vlen_enc = vector_length_encoding(this);
 9039     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9040   %}
 9041   ins_pipe( pipe_slow );
 9042 %}
 9043 
 9044 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9045   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9046   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9047   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9048   ins_cost(150);
 9049   ins_encode %{
 9050     assert(UseFMA, "not enabled");
 9051     int vlen_enc = vector_length_encoding(this);
 9052     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9053   %}
 9054   ins_pipe( pipe_slow );
 9055 %}
 9056 
 9057 // --------------------------------- Vector Multiply Add --------------------------------------
 9058 
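      // MulAddVS2VI maps to pmaddwd: multiply adjacent pairs of 16-bit elements and add
      // each pair of products into one 32-bit lane, e.g. dst[0] = a0*b0 + a1*b1.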
 9059 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9060   predicate(UseAVX == 0);
 9061   match(Set dst (MulAddVS2VI dst src1));
 9062   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9063   ins_encode %{
 9064     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9065   %}
 9066   ins_pipe( pipe_slow );
 9067 %}
 9068 
 9069 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9070   predicate(UseAVX > 0);
 9071   match(Set dst (MulAddVS2VI src1 src2));
 9072   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9073   ins_encode %{
 9074     int vlen_enc = vector_length_encoding(this);
 9075     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9076   %}
 9077   ins_pipe( pipe_slow );
 9078 %}
 9079 
 9080 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9081 
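      // With AVX512_VNNI the multiply-add and the accumulating add fuse into a single
      // evpdpwssd, i.e. dst += pmaddwd(src1, src2).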
 9082 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9083   predicate(VM_Version::supports_avx512_vnni());
 9084   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9085   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9086   ins_encode %{
 9087     assert(UseAVX > 2, "required");
 9088     int vlen_enc = vector_length_encoding(this);
 9089     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9090   %}
 9091   ins_pipe( pipe_slow );
 9092   ins_cost(10);
 9093 %}
 9094 
 9095 // --------------------------------- PopCount --------------------------------------
 9096 
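      // PopCountVI/PopCountVL use the EVEX popcount forms when the element-type predicate
      // allows it; otherwise they fall back to the AVX helper in the macro assembler.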
 9097 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9098   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9099   match(Set dst (PopCountVI src));
 9100   match(Set dst (PopCountVL src));
 9101   format %{ "vector_popcount_integral $dst, $src" %}
 9102   ins_encode %{
 9103     int opcode = this->ideal_Opcode();
 9104     int vlen_enc = vector_length_encoding(this, $src);
 9105     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9106     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9107   %}
 9108   ins_pipe( pipe_slow );
 9109 %}
 9110 
 9111 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9112   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9113   match(Set dst (PopCountVI src mask));
 9114   match(Set dst (PopCountVL src mask));
 9115   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9116   ins_encode %{
 9117     int vlen_enc = vector_length_encoding(this, $src);
 9118     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9119     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9120     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9126   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9127   match(Set dst (PopCountVI src));
 9128   match(Set dst (PopCountVL src));
 9129   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9130   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9131   ins_encode %{
 9132     int opcode = this->ideal_Opcode();
 9133     int vlen_enc = vector_length_encoding(this, $src);
 9134     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9135     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9136                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9137   %}
 9138   ins_pipe( pipe_slow );
 9139 %}
 9140 
 9141 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9142 
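      // CountTrailingZerosV is delegated to macro-assembler helpers; the rule chosen
      // depends on the element type and on AVX512CD/VL/BW support.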
 9143 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9144   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9145                                               Matcher::vector_length_in_bytes(n->in(1))));
 9146   match(Set dst (CountTrailingZerosV src));
 9147   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9148   ins_cost(400);
 9149   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9150   ins_encode %{
 9151     int vlen_enc = vector_length_encoding(this, $src);
 9152     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9153     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9154                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9155   %}
 9156   ins_pipe( pipe_slow );
 9157 %}
 9158 
 9159 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9160   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9161             VM_Version::supports_avx512cd() &&
 9162             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9163   match(Set dst (CountTrailingZerosV src));
 9164   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9165   ins_cost(400);
 9166   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9167   ins_encode %{
 9168     int vlen_enc = vector_length_encoding(this, $src);
 9169     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9170     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9171                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9172   %}
 9173   ins_pipe( pipe_slow );
 9174 %}
 9175 
 9176 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9177   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9178   match(Set dst (CountTrailingZerosV src));
 9179   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9180   ins_cost(400);
 9181   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9182   ins_encode %{
 9183     int vlen_enc = vector_length_encoding(this, $src);
 9184     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9185     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9186                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9187                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9193   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9194   match(Set dst (CountTrailingZerosV src));
 9195   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9196   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9197   ins_encode %{
 9198     int vlen_enc = vector_length_encoding(this, $src);
 9199     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9200     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9201                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9202   %}
 9203   ins_pipe( pipe_slow );
 9204 %}
 9205 
 9206 
 9207 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9208 
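      // MacroLogicV maps to vpternlogd: the immediate $func is a bitwise truth table over
      // the three inputs (dst, src2, src3), e.g. 0xE8 computes the majority function.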
 9209 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9210   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9211   effect(TEMP dst);
 9212   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9213   ins_encode %{
 9214     int vector_len = vector_length_encoding(this);
 9215     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9216   %}
 9217   ins_pipe( pipe_slow );
 9218 %}
 9219 
 9220 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9221   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9222   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9223   effect(TEMP dst);
 9224   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9225   ins_encode %{
 9226     int vector_len = vector_length_encoding(this);
 9227     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9228   %}
 9229   ins_pipe( pipe_slow );
 9230 %}
 9231 
 9232 // --------------------------------- Rotation Operations ----------------------------------
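      // Both the immediate and the variable rotate forms dispatch on the ideal opcode
      // (RotateLeftV / RotateRightV) inside the macro-assembler helpers.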
 9233 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9234   match(Set dst (RotateLeftV src shift));
 9235   match(Set dst (RotateRightV src shift));
 9236   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9237   ins_encode %{
 9238     int opcode      = this->ideal_Opcode();
 9239     int vector_len  = vector_length_encoding(this);
 9240     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9241     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9242   %}
 9243   ins_pipe( pipe_slow );
 9244 %}
 9245 
 9246 instruct vprorate(vec dst, vec src, vec shift) %{
 9247   match(Set dst (RotateLeftV src shift));
 9248   match(Set dst (RotateRightV src shift));
 9249   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9250   ins_encode %{
 9251     int opcode      = this->ideal_Opcode();
 9252     int vector_len  = vector_length_encoding(this);
 9253     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9254     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9255   %}
 9256   ins_pipe( pipe_slow );
 9257 %}
 9258 
 9259 // ---------------------------------- Masked Operations ------------------------------------
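      // Masked loads/stores: with plain AVX the mask lives in a vector register and is
      // consumed by vmovmask; with AVX-512 vector masks it is an opmask (kReg) consumed by
      // the predicated EVEX move.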
 9260 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9261   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9262   match(Set dst (LoadVectorMasked mem mask));
 9263   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9264   ins_encode %{
 9265     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9266     int vlen_enc = vector_length_encoding(this);
 9267     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9268   %}
 9269   ins_pipe( pipe_slow );
 9270 %}
 9271 
 9272 
 9273 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9274   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9275   match(Set dst (LoadVectorMasked mem mask));
 9276   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9277   ins_encode %{
 9278     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9279     int vector_len = vector_length_encoding(this);
 9280     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9281   %}
 9282   ins_pipe( pipe_slow );
 9283 %}
 9284 
 9285 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9286   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9287   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9288   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9289   ins_encode %{
 9290     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9291     int vlen_enc = vector_length_encoding(src_node);
 9292     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9293     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9294   %}
 9295   ins_pipe( pipe_slow );
 9296 %}
 9297 
 9298 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9299   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9300   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9301   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9302   ins_encode %{
 9303     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9304     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9305     int vlen_enc = vector_length_encoding(src_node);
 9306     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9307   %}
 9308   ins_pipe( pipe_slow );
 9309 %}
 9310 
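      // Debug check: trap if any of the $mask bits of the address are set, i.e. the vector
      // memory access is not aligned to the expected boundary.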
 9311 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9312   match(Set addr (VerifyVectorAlignment addr mask));
 9313   effect(KILL cr);
 9314   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9315   ins_encode %{
 9316     Label Lskip;
 9317     // check if masked bits of addr are zero
 9318     __ testq($addr$$Register, $mask$$constant);
 9319     __ jccb(Assembler::equal, Lskip);
 9320     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9321     __ bind(Lskip);
 9322   %}
 9323   ins_pipe(pipe_slow);
 9324 %}
 9325 
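      // VectorCmpMasked returns -1 when every lane selected by $mask compares equal,
      // otherwise the index of the first lane that fails the masked equality test.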
 9326 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9327   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9328   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9329   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9330   ins_encode %{
 9331     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9332     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9333 
 9334     Label DONE;
 9335     int vlen_enc = vector_length_encoding(this, $src1);
 9336     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9337 
 9338     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9339     __ mov64($dst$$Register, -1L);
 9340     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9341     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9342     __ jccb(Assembler::carrySet, DONE);
 9343     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9344     __ notq($dst$$Register);
 9345     __ tzcntq($dst$$Register, $dst$$Register);
 9346     __ bind(DONE);
 9347   %}
 9348   ins_pipe( pipe_slow );
 9349 %}
 9350 
 9351 
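      // VectorMaskGen produces an opmask with the low $len bits set, i.e. (1 << len) - 1.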
 9352 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9353   match(Set dst (VectorMaskGen len));
 9354   effect(TEMP temp, KILL cr);
 9355   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9356   ins_encode %{
 9357     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9358   %}
 9359   ins_pipe( pipe_slow );
 9360 %}
 9361 
 9362 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9363   match(Set dst (VectorMaskGen len));
 9364   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9365   effect(TEMP temp);
 9366   ins_encode %{
 9367     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9368     __ kmovql($dst$$KRegister, $temp$$Register);
 9369   %}
 9370   ins_pipe( pipe_slow );
 9371 %}
 9372 
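      // VectorMaskToLong, true-count, first-true and last-true all funnel into
      // vector_mask_operation; the variants differ only in where the mask lives (an opmask,
      // a boolean vector, or a VectorStoreMask result that is folded away).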
 9373 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9374   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9375   match(Set dst (VectorMaskToLong mask));
 9376   effect(TEMP dst, KILL cr);
 9377   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9378   ins_encode %{
 9379     int opcode = this->ideal_Opcode();
 9380     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9381     int mask_len = Matcher::vector_length(this, $mask);
 9382     int mask_size = mask_len * type2aelembytes(mbt);
 9383     int vlen_enc = vector_length_encoding(this, $mask);
 9384     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9385                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9386   %}
 9387   ins_pipe( pipe_slow );
 9388 %}
 9389 
 9390 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9391   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9392   match(Set dst (VectorMaskToLong mask));
 9393   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9394   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9395   ins_encode %{
 9396     int opcode = this->ideal_Opcode();
 9397     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9398     int mask_len = Matcher::vector_length(this, $mask);
 9399     int vlen_enc = vector_length_encoding(this, $mask);
 9400     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9401                              $dst$$Register, mask_len, mbt, vlen_enc);
 9402   %}
 9403   ins_pipe( pipe_slow );
 9404 %}
 9405 
 9406 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9407   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9408   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9409   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9410   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9411   ins_encode %{
 9412     int opcode = this->ideal_Opcode();
 9413     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9414     int mask_len = Matcher::vector_length(this, $mask);
 9415     int vlen_enc = vector_length_encoding(this, $mask);
 9416     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9417                              $dst$$Register, mask_len, mbt, vlen_enc);
 9418   %}
 9419   ins_pipe( pipe_slow );
 9420 %}
 9421 
 9422 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9423   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9424   match(Set dst (VectorMaskTrueCount mask));
 9425   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9426   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9427   ins_encode %{
 9428     int opcode = this->ideal_Opcode();
 9429     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9430     int mask_len = Matcher::vector_length(this, $mask);
 9431     int mask_size = mask_len * type2aelembytes(mbt);
 9432     int vlen_enc = vector_length_encoding(this, $mask);
 9433     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9434                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9435   %}
 9436   ins_pipe( pipe_slow );
 9437 %}
 9438 
 9439 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9440   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9441   match(Set dst (VectorMaskTrueCount mask));
 9442   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9443   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9444   ins_encode %{
 9445     int opcode = this->ideal_Opcode();
 9446     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9447     int mask_len = Matcher::vector_length(this, $mask);
 9448     int vlen_enc = vector_length_encoding(this, $mask);
 9449     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9450                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9451   %}
 9452   ins_pipe( pipe_slow );
 9453 %}
 9454 
 9455 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9456   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9457   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9458   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9459   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9460   ins_encode %{
 9461     int opcode = this->ideal_Opcode();
 9462     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9463     int mask_len = Matcher::vector_length(this, $mask);
 9464     int vlen_enc = vector_length_encoding(this, $mask);
 9465     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9466                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9467   %}
 9468   ins_pipe( pipe_slow );
 9469 %}
 9470 
 9471 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9472   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9473   match(Set dst (VectorMaskFirstTrue mask));
 9474   match(Set dst (VectorMaskLastTrue mask));
 9475   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9476   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9477   ins_encode %{
 9478     int opcode = this->ideal_Opcode();
 9479     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9480     int mask_len = Matcher::vector_length(this, $mask);
 9481     int mask_size = mask_len * type2aelembytes(mbt);
 9482     int vlen_enc = vector_length_encoding(this, $mask);
 9483     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9484                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9485   %}
 9486   ins_pipe( pipe_slow );
 9487 %}
 9488 
 9489 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9490   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9491   match(Set dst (VectorMaskFirstTrue mask));
 9492   match(Set dst (VectorMaskLastTrue mask));
 9493   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9494   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9495   ins_encode %{
 9496     int opcode = this->ideal_Opcode();
 9497     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9498     int mask_len = Matcher::vector_length(this, $mask);
 9499     int vlen_enc = vector_length_encoding(this, $mask);
 9500     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9501                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9502   %}
 9503   ins_pipe( pipe_slow );
 9504 %}
 9505 
 9506 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9507   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9508   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9509   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9510   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9511   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9512   ins_encode %{
 9513     int opcode = this->ideal_Opcode();
 9514     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9515     int mask_len = Matcher::vector_length(this, $mask);
 9516     int vlen_enc = vector_length_encoding(this, $mask);
 9517     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9518                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9519   %}
 9520   ins_pipe( pipe_slow );
 9521 %}
 9522 
 9523 // --------------------------------- Compress/Expand Operations ---------------------------
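      // CompressV/ExpandV map onto the AVX-512 compress/expand instructions when AVX512VL
      // (or a 64-byte vector) is available; smaller pre-VL vectors are emulated on AVX2
      // using a computed permutation (note the $perm temporary).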
 9524 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9525   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9526   match(Set dst (CompressV src mask));
 9527   match(Set dst (ExpandV src mask));
 9528   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9529   format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9530   ins_encode %{
 9531     int opcode = this->ideal_Opcode();
 9532     int vlen_enc = vector_length_encoding(this);
 9533     BasicType bt  = Matcher::vector_element_basic_type(this);
 9534     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9535                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9536   %}
 9537   ins_pipe( pipe_slow );
 9538 %}
 9539 
 9540 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9541   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9542   match(Set dst (CompressV src mask));
 9543   match(Set dst (ExpandV src mask));
 9544   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9545   ins_encode %{
 9546     int opcode = this->ideal_Opcode();
 9547     int vector_len = vector_length_encoding(this);
 9548     BasicType bt  = Matcher::vector_element_basic_type(this);
 9549     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9550   %}
 9551   ins_pipe( pipe_slow );
 9552 %}
 9553 
 9554 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9555   match(Set dst (CompressM mask));
 9556   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9557   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9558   ins_encode %{
 9559     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9560     int mask_len = Matcher::vector_length(this);
 9561     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9562   %}
 9563   ins_pipe( pipe_slow );
 9564 %}
 9565 
 9566 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9567 
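      // ReverseV (bit reversal) uses a GFNI affine transform with the bit-reversal matrix
      // 0x8040201008040201 when GFNI is available, and a generic macro-assembler sequence
      // otherwise.  ReverseBytesV is handled by the byte-reversal rules that follow.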
 9568 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9569   predicate(!VM_Version::supports_gfni());
 9570   match(Set dst (ReverseV src));
 9571   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9572   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9573   ins_encode %{
 9574     int vec_enc = vector_length_encoding(this);
 9575     BasicType bt = Matcher::vector_element_basic_type(this);
 9576     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9577                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9578   %}
 9579   ins_pipe( pipe_slow );
 9580 %}
 9581 
 9582 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9583   predicate(VM_Version::supports_gfni());
 9584   match(Set dst (ReverseV src));
 9585   effect(TEMP dst, TEMP xtmp);
 9586   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9587   ins_encode %{
 9588     int vec_enc = vector_length_encoding(this);
 9589     BasicType bt  = Matcher::vector_element_basic_type(this);
 9590     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9591     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9592                                $xtmp$$XMMRegister);
 9593   %}
 9594   ins_pipe( pipe_slow );
 9595 %}
 9596 
 9597 instruct vreverse_byte_reg(vec dst, vec src) %{
 9598   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9599   match(Set dst (ReverseBytesV src));
 9600   effect(TEMP dst);
 9601   format %{ "vector_reverse_byte $dst, $src" %}
 9602   ins_encode %{
 9603     int vec_enc = vector_length_encoding(this);
 9604     BasicType bt = Matcher::vector_element_basic_type(this);
 9605     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9606   %}
 9607   ins_pipe( pipe_slow );
 9608 %}
 9609 
 9610 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9611   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9612   match(Set dst (ReverseBytesV src));
 9613   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9614   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9615   ins_encode %{
 9616     int vec_enc = vector_length_encoding(this);
 9617     BasicType bt = Matcher::vector_element_basic_type(this);
 9618     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9619                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9620   %}
 9621   ins_pipe( pipe_slow );
 9622 %}
 9623 
 9624 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9625 
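      // CountLeadingZerosV prefers the EVEX leading-zero-count forms (AVX512CD) where the
      // predicate allows; other element types and pre-EVEX targets use macro-assembler
      // fallbacks.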
 9626 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9627   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9628                                               Matcher::vector_length_in_bytes(n->in(1))));
 9629   match(Set dst (CountLeadingZerosV src));
 9630   format %{ "vector_count_leading_zeros $dst, $src" %}
 9631   ins_encode %{
 9632      int vlen_enc = vector_length_encoding(this, $src);
 9633      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9634      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9635                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9636   %}
 9637   ins_pipe( pipe_slow );
 9638 %}
 9639 
 9640 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9641   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9642                                               Matcher::vector_length_in_bytes(n->in(1))));
 9643   match(Set dst (CountLeadingZerosV src mask));
 9644   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9645   ins_encode %{
 9646     int vlen_enc = vector_length_encoding(this, $src);
 9647     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9648     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9649     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9650                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9651   %}
 9652   ins_pipe( pipe_slow );
 9653 %}
 9654 
 9655 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9656   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9657             VM_Version::supports_avx512cd() &&
 9658             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9659   match(Set dst (CountLeadingZerosV src));
 9660   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9661   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9662   ins_encode %{
 9663     int vlen_enc = vector_length_encoding(this, $src);
 9664     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9665     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9666                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9667   %}
 9668   ins_pipe( pipe_slow );
 9669 %}
 9670 
 9671 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9672   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9673   match(Set dst (CountLeadingZerosV src));
 9674   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9675   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9676   ins_encode %{
 9677     int vlen_enc = vector_length_encoding(this, $src);
 9678     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9679     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9680                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9681                                        $rtmp$$Register, true, vlen_enc);
 9682   %}
 9683   ins_pipe( pipe_slow );
 9684 %}
 9685 
 9686 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9687   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9688             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9689   match(Set dst (CountLeadingZerosV src));
 9690   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9691   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9692   ins_encode %{
 9693     int vlen_enc = vector_length_encoding(this, $src);
 9694     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9695     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9696                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9697   %}
 9698   ins_pipe( pipe_slow );
 9699 %}
 9700 
 9701 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9702   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9703             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9704   match(Set dst (CountLeadingZerosV src));
 9705   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9706   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9707   ins_encode %{
 9708     int vlen_enc = vector_length_encoding(this, $src);
 9709     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9710     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9711                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9712   %}
 9713   ins_pipe( pipe_slow );
 9714 %}
 9715 
 9716 // ---------------------------------- Vector Masked Operations ------------------------------------
 9717 
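      // Predicated (write-masked) arithmetic: each rule forwards to evmasked_op, which picks
      // the EVEX instruction from the ideal opcode and merges the result under $mask.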
 9718 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9719   match(Set dst (AddVB (Binary dst src2) mask));
 9720   match(Set dst (AddVS (Binary dst src2) mask));
 9721   match(Set dst (AddVI (Binary dst src2) mask));
 9722   match(Set dst (AddVL (Binary dst src2) mask));
 9723   match(Set dst (AddVF (Binary dst src2) mask));
 9724   match(Set dst (AddVD (Binary dst src2) mask));
 9725   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9726   ins_encode %{
 9727     int vlen_enc = vector_length_encoding(this);
 9728     BasicType bt = Matcher::vector_element_basic_type(this);
 9729     int opc = this->ideal_Opcode();
 9730     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9731                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9732   %}
 9733   ins_pipe( pipe_slow );
 9734 %}
 9735 
 9736 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9737   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9738   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9739   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9740   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9741   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9742   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9743   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9744   ins_encode %{
 9745     int vlen_enc = vector_length_encoding(this);
 9746     BasicType bt = Matcher::vector_element_basic_type(this);
 9747     int opc = this->ideal_Opcode();
 9748     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9749                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9750   %}
 9751   ins_pipe( pipe_slow );
 9752 %}
 9753 
 9754 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9755   match(Set dst (XorV (Binary dst src2) mask));
 9756   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9757   ins_encode %{
 9758     int vlen_enc = vector_length_encoding(this);
 9759     BasicType bt = Matcher::vector_element_basic_type(this);
 9760     int opc = this->ideal_Opcode();
 9761     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9762                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9763   %}
 9764   ins_pipe( pipe_slow );
 9765 %}
 9766 
 9767 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9768   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9769   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9770   ins_encode %{
 9771     int vlen_enc = vector_length_encoding(this);
 9772     BasicType bt = Matcher::vector_element_basic_type(this);
 9773     int opc = this->ideal_Opcode();
 9774     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9775                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9776   %}
 9777   ins_pipe( pipe_slow );
 9778 %}
 9779 
 9780 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9781   match(Set dst (OrV (Binary dst src2) mask));
 9782   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9783   ins_encode %{
 9784     int vlen_enc = vector_length_encoding(this);
 9785     BasicType bt = Matcher::vector_element_basic_type(this);
 9786     int opc = this->ideal_Opcode();
 9787     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9788                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9789   %}
 9790   ins_pipe( pipe_slow );
 9791 %}
 9792 
 9793 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9794   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9795   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9796   ins_encode %{
 9797     int vlen_enc = vector_length_encoding(this);
 9798     BasicType bt = Matcher::vector_element_basic_type(this);
 9799     int opc = this->ideal_Opcode();
 9800     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9801                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9802   %}
 9803   ins_pipe( pipe_slow );
 9804 %}
 9805 
 9806 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9807   match(Set dst (AndV (Binary dst src2) mask));
 9808   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9809   ins_encode %{
 9810     int vlen_enc = vector_length_encoding(this);
 9811     BasicType bt = Matcher::vector_element_basic_type(this);
 9812     int opc = this->ideal_Opcode();
 9813     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9814                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9815   %}
 9816   ins_pipe( pipe_slow );
 9817 %}
 9818 
 9819 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9820   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9821   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9822   ins_encode %{
 9823     int vlen_enc = vector_length_encoding(this);
 9824     BasicType bt = Matcher::vector_element_basic_type(this);
 9825     int opc = this->ideal_Opcode();
 9826     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9827                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9828   %}
 9829   ins_pipe( pipe_slow );
 9830 %}
 9831 
 9832 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9833   match(Set dst (SubVB (Binary dst src2) mask));
 9834   match(Set dst (SubVS (Binary dst src2) mask));
 9835   match(Set dst (SubVI (Binary dst src2) mask));
 9836   match(Set dst (SubVL (Binary dst src2) mask));
 9837   match(Set dst (SubVF (Binary dst src2) mask));
 9838   match(Set dst (SubVD (Binary dst src2) mask));
 9839   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9840   ins_encode %{
 9841     int vlen_enc = vector_length_encoding(this);
 9842     BasicType bt = Matcher::vector_element_basic_type(this);
 9843     int opc = this->ideal_Opcode();
 9844     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9845                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9846   %}
 9847   ins_pipe( pipe_slow );
 9848 %}
 9849 
 9850 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9851   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9852   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9853   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9854   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9855   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9856   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9857   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9858   ins_encode %{
 9859     int vlen_enc = vector_length_encoding(this);
 9860     BasicType bt = Matcher::vector_element_basic_type(this);
 9861     int opc = this->ideal_Opcode();
 9862     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9863                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9864   %}
 9865   ins_pipe( pipe_slow );
 9866 %}
 9867 
 9868 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9869   match(Set dst (MulVS (Binary dst src2) mask));
 9870   match(Set dst (MulVI (Binary dst src2) mask));
 9871   match(Set dst (MulVL (Binary dst src2) mask));
 9872   match(Set dst (MulVF (Binary dst src2) mask));
 9873   match(Set dst (MulVD (Binary dst src2) mask));
 9874   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9875   ins_encode %{
 9876     int vlen_enc = vector_length_encoding(this);
 9877     BasicType bt = Matcher::vector_element_basic_type(this);
 9878     int opc = this->ideal_Opcode();
 9879     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9880                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9881   %}
 9882   ins_pipe( pipe_slow );
 9883 %}
 9884 
 9885 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9886   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9887   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9888   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9889   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9890   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9891   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9892   ins_encode %{
 9893     int vlen_enc = vector_length_encoding(this);
 9894     BasicType bt = Matcher::vector_element_basic_type(this);
 9895     int opc = this->ideal_Opcode();
 9896     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9897                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9898   %}
 9899   ins_pipe( pipe_slow );
 9900 %}
 9901 
 9902 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9903   match(Set dst (SqrtVF dst mask));
 9904   match(Set dst (SqrtVD dst mask));
 9905   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9906   ins_encode %{
 9907     int vlen_enc = vector_length_encoding(this);
 9908     BasicType bt = Matcher::vector_element_basic_type(this);
 9909     int opc = this->ideal_Opcode();
 9910     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9911                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9912   %}
 9913   ins_pipe( pipe_slow );
 9914 %}
 9915 
 9916 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9917   match(Set dst (DivVF (Binary dst src2) mask));
 9918   match(Set dst (DivVD (Binary dst src2) mask));
 9919   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9920   ins_encode %{
 9921     int vlen_enc = vector_length_encoding(this);
 9922     BasicType bt = Matcher::vector_element_basic_type(this);
 9923     int opc = this->ideal_Opcode();
 9924     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9925                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9926   %}
 9927   ins_pipe( pipe_slow );
 9928 %}
 9929 
 9930 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9931   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9932   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9933   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9934   ins_encode %{
 9935     int vlen_enc = vector_length_encoding(this);
 9936     BasicType bt = Matcher::vector_element_basic_type(this);
 9937     int opc = this->ideal_Opcode();
 9938     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9939                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9940   %}
 9941   ins_pipe( pipe_slow );
 9942 %}
 9943 
 9944 
 9945 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9946   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9947   match(Set dst (RotateRightV (Binary dst shift) mask));
 9948   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9949   ins_encode %{
 9950     int vlen_enc = vector_length_encoding(this);
 9951     BasicType bt = Matcher::vector_element_basic_type(this);
 9952     int opc = this->ideal_Opcode();
 9953     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9954                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9955   %}
 9956   ins_pipe( pipe_slow );
 9957 %}
 9958 
 9959 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9960   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9961   match(Set dst (RotateRightV (Binary dst src2) mask));
 9962   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9963   ins_encode %{
 9964     int vlen_enc = vector_length_encoding(this);
 9965     BasicType bt = Matcher::vector_element_basic_type(this);
 9966     int opc = this->ideal_Opcode();
 9967     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9968                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9969   %}
 9970   ins_pipe( pipe_slow );
 9971 %}
 9972 
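      // Masked shifts come in three flavours: an immediate count, a uniform count held
      // in a vector register (!is_var_shift), and a per-lane variable count
      // (is_var_shift).  The extra trailing boolean passed to evmasked_op() below
      // selects the variable-shift form.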
 9973 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9974   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9975   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9976   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9977   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9978   ins_encode %{
 9979     int vlen_enc = vector_length_encoding(this);
 9980     BasicType bt = Matcher::vector_element_basic_type(this);
 9981     int opc = this->ideal_Opcode();
 9982     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9983                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9984   %}
 9985   ins_pipe( pipe_slow );
 9986 %}
 9987 
 9988 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9989   predicate(!n->as_ShiftV()->is_var_shift());
 9990   match(Set dst (LShiftVS (Binary dst src2) mask));
 9991   match(Set dst (LShiftVI (Binary dst src2) mask));
 9992   match(Set dst (LShiftVL (Binary dst src2) mask));
 9993   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9994   ins_encode %{
 9995     int vlen_enc = vector_length_encoding(this);
 9996     BasicType bt = Matcher::vector_element_basic_type(this);
 9997     int opc = this->ideal_Opcode();
 9998     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9999                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10000   %}
10001   ins_pipe( pipe_slow );
10002 %}
10003 
10004 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10005   predicate(n->as_ShiftV()->is_var_shift());
10006   match(Set dst (LShiftVS (Binary dst src2) mask));
10007   match(Set dst (LShiftVI (Binary dst src2) mask));
10008   match(Set dst (LShiftVL (Binary dst src2) mask));
10009   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10010   ins_encode %{
10011     int vlen_enc = vector_length_encoding(this);
10012     BasicType bt = Matcher::vector_element_basic_type(this);
10013     int opc = this->ideal_Opcode();
10014     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10015                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10016   %}
10017   ins_pipe( pipe_slow );
10018 %}
10019 
10020 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10021   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10022   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10023   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10024   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10025   ins_encode %{
10026     int vlen_enc = vector_length_encoding(this);
10027     BasicType bt = Matcher::vector_element_basic_type(this);
10028     int opc = this->ideal_Opcode();
10029     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10030                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10031   %}
10032   ins_pipe( pipe_slow );
10033 %}
10034 
10035 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10036   predicate(!n->as_ShiftV()->is_var_shift());
10037   match(Set dst (RShiftVS (Binary dst src2) mask));
10038   match(Set dst (RShiftVI (Binary dst src2) mask));
10039   match(Set dst (RShiftVL (Binary dst src2) mask));
10040   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10041   ins_encode %{
10042     int vlen_enc = vector_length_encoding(this);
10043     BasicType bt = Matcher::vector_element_basic_type(this);
10044     int opc = this->ideal_Opcode();
10045     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10046                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10047   %}
10048   ins_pipe( pipe_slow );
10049 %}
10050 
10051 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10052   predicate(n->as_ShiftV()->is_var_shift());
10053   match(Set dst (RShiftVS (Binary dst src2) mask));
10054   match(Set dst (RShiftVI (Binary dst src2) mask));
10055   match(Set dst (RShiftVL (Binary dst src2) mask));
10056   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10057   ins_encode %{
10058     int vlen_enc = vector_length_encoding(this);
10059     BasicType bt = Matcher::vector_element_basic_type(this);
10060     int opc = this->ideal_Opcode();
10061     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10062                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10063   %}
10064   ins_pipe( pipe_slow );
10065 %}
10066 
10067 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10068   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10069   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10070   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10071   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10072   ins_encode %{
10073     int vlen_enc = vector_length_encoding(this);
10074     BasicType bt = Matcher::vector_element_basic_type(this);
10075     int opc = this->ideal_Opcode();
10076     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10077                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10078   %}
10079   ins_pipe( pipe_slow );
10080 %}
10081 
10082 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10083   predicate(!n->as_ShiftV()->is_var_shift());
10084   match(Set dst (URShiftVS (Binary dst src2) mask));
10085   match(Set dst (URShiftVI (Binary dst src2) mask));
10086   match(Set dst (URShiftVL (Binary dst src2) mask));
10087   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10088   ins_encode %{
10089     int vlen_enc = vector_length_encoding(this);
10090     BasicType bt = Matcher::vector_element_basic_type(this);
10091     int opc = this->ideal_Opcode();
10092     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10093                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10094   %}
10095   ins_pipe( pipe_slow );
10096 %}
10097 
10098 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10099   predicate(n->as_ShiftV()->is_var_shift());
10100   match(Set dst (URShiftVS (Binary dst src2) mask));
10101   match(Set dst (URShiftVI (Binary dst src2) mask));
10102   match(Set dst (URShiftVL (Binary dst src2) mask));
10103   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10104   ins_encode %{
10105     int vlen_enc = vector_length_encoding(this);
10106     BasicType bt = Matcher::vector_element_basic_type(this);
10107     int opc = this->ideal_Opcode();
10108     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10109                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10110   %}
10111   ins_pipe( pipe_slow );
10112 %}
10113 
10114 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10115   match(Set dst (MaxV (Binary dst src2) mask));
10116   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10117   ins_encode %{
10118     int vlen_enc = vector_length_encoding(this);
10119     BasicType bt = Matcher::vector_element_basic_type(this);
10120     int opc = this->ideal_Opcode();
10121     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10122                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10123   %}
10124   ins_pipe( pipe_slow );
10125 %}
10126 
10127 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10128   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10129   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10130   ins_encode %{
10131     int vlen_enc = vector_length_encoding(this);
10132     BasicType bt = Matcher::vector_element_basic_type(this);
10133     int opc = this->ideal_Opcode();
10134     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10135                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10136   %}
10137   ins_pipe( pipe_slow );
10138 %}
10139 
10140 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10141   match(Set dst (MinV (Binary dst src2) mask));
10142   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10143   ins_encode %{
10144     int vlen_enc = vector_length_encoding(this);
10145     BasicType bt = Matcher::vector_element_basic_type(this);
10146     int opc = this->ideal_Opcode();
10147     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10148                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10149   %}
10150   ins_pipe( pipe_slow );
10151 %}
10152 
10153 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10154   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10155   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10156   ins_encode %{
10157     int vlen_enc = vector_length_encoding(this);
10158     BasicType bt = Matcher::vector_element_basic_type(this);
10159     int opc = this->ideal_Opcode();
10160     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10161                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10162   %}
10163   ins_pipe( pipe_slow );
10164 %}
10165 
10166 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10167   match(Set dst (VectorRearrange (Binary dst src2) mask));
10168   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10169   ins_encode %{
10170     int vlen_enc = vector_length_encoding(this);
10171     BasicType bt = Matcher::vector_element_basic_type(this);
10172     int opc = this->ideal_Opcode();
10173     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10174                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10175   %}
10176   ins_pipe( pipe_slow );
10177 %}
10178 
10179 instruct vabs_masked(vec dst, kReg mask) %{
10180   match(Set dst (AbsVB dst mask));
10181   match(Set dst (AbsVS dst mask));
10182   match(Set dst (AbsVI dst mask));
10183   match(Set dst (AbsVL dst mask));
10184   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10185   ins_encode %{
10186     int vlen_enc = vector_length_encoding(this);
10187     BasicType bt = Matcher::vector_element_basic_type(this);
10188     int opc = this->ideal_Opcode();
10189     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10190                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10191   %}
10192   ins_pipe( pipe_slow );
10193 %}
10194 
10195 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10196   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10197   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10198   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10199   ins_encode %{
10200     assert(UseFMA, "Needs FMA instruction support.");
10201     int vlen_enc = vector_length_encoding(this);
10202     BasicType bt = Matcher::vector_element_basic_type(this);
10203     int opc = this->ideal_Opcode();
10204     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10205                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10206   %}
10207   ins_pipe( pipe_slow );
10208 %}
10209 
10210 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10211   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10212   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10213   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10214   ins_encode %{
10215     assert(UseFMA, "Needs FMA instruction support.");
10216     int vlen_enc = vector_length_encoding(this);
10217     BasicType bt = Matcher::vector_element_basic_type(this);
10218     int opc = this->ideal_Opcode();
10219     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10220                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10221   %}
10222   ins_pipe( pipe_slow );
10223 %}
10224 
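      // Masked vector compare: produces an opmask result and is itself predicated on an
      // incoming mask.  The element type of the first source selects the instruction:
      // integral types use evpcmpb/w/d/q with a signed or unsigned predicate derived from
      // the BoolTest condition, floating-point types use evcmpps/evcmppd.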
10225 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10226   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10227   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10228   ins_encode %{
10229     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10230     int vlen_enc = vector_length_encoding(this, $src1);
10231     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10232 
10233     // Compare the two sources under $mask, dispatching on the element type of $src1.
10234     switch (src1_elem_bt) {
10235       case T_BYTE: {
10236         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10237         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10238         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10239         break;
10240       }
10241       case T_SHORT: {
10242         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10243         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10244         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10245         break;
10246       }
10247       case T_INT: {
10248         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10249         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10250         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10251         break;
10252       }
10253       case T_LONG: {
10254         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10255         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10256         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10257         break;
10258       }
10259       case T_FLOAT: {
10260         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10261         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10262         break;
10263       }
10264       case T_DOUBLE: {
10265         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10266         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10267         break;
10268       }
10269       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10270     }
10271   %}
10272   ins_pipe( pipe_slow );
10273 %}
10274 
10275 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10276   predicate(Matcher::vector_length(n) <= 32);
10277   match(Set dst (MaskAll src));
10278   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10279   ins_encode %{
10280     int mask_len = Matcher::vector_length(this);
10281     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10282   %}
10283   ins_pipe( pipe_slow );
10284 %}
10285 
10286 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10287   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10288   match(Set dst (XorVMask src (MaskAll cnt)));
10289   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10290   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10291   ins_encode %{
10292     uint masklen = Matcher::vector_length(this);
10293     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10294   %}
10295   ins_pipe( pipe_slow );
10296 %}
10297 
10298 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10299   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10300             (Matcher::vector_length(n) == 16) ||
10301             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10302   match(Set dst (XorVMask src (MaskAll cnt)));
10303   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10304   ins_encode %{
10305     uint masklen = Matcher::vector_length(this);
10306     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10307   %}
10308   ins_pipe( pipe_slow );
10309 %}
10310 
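      // VectorLongToMask turns a long bit mask into a vector mask.  On targets without a
      // true predicate register type the bits are expanded into a byte-vector mask; on
      // EVEX targets with vectmask support the value is simply moved into a kRegister.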
10311 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10312   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10313   match(Set dst (VectorLongToMask src));
10314   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10315   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10316   ins_encode %{
10317     int mask_len = Matcher::vector_length(this);
10318     int vec_enc  = vector_length_encoding(mask_len);
10319     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10320                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10321   %}
10322   ins_pipe( pipe_slow );
10323 %}
10324 
10325 
10326 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10327   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10328   match(Set dst (VectorLongToMask src));
10329   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10330   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10331   ins_encode %{
10332     int mask_len = Matcher::vector_length(this);
10333     assert(mask_len <= 32, "invalid mask length");
10334     int vec_enc  = vector_length_encoding(mask_len);
10335     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10336                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10337   %}
10338   ins_pipe( pipe_slow );
10339 %}
10340 
10341 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10342   predicate(n->bottom_type()->isa_vectmask());
10343   match(Set dst (VectorLongToMask src));
10344   format %{ "long_to_mask_evex $dst, $src\t!" %}
10345   ins_encode %{
10346     __ kmov($dst$$KRegister, $src$$Register);
10347   %}
10348   ins_pipe( pipe_slow );
10349 %}
10350 
10351 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10352   match(Set dst (AndVMask src1 src2));
10353   match(Set dst (OrVMask src1 src2));
10354   match(Set dst (XorVMask src1 src2));
10355   effect(TEMP kscratch);
10356   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10357   ins_encode %{
10358     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10359     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10360     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10361     uint masklen = Matcher::vector_length(this);
10362     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10363     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10364   %}
10365   ins_pipe( pipe_slow );
10366 %}
10367 
10368 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10369   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10370   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10371   ins_encode %{
10372     int vlen_enc = vector_length_encoding(this);
10373     BasicType bt = Matcher::vector_element_basic_type(this);
10374     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10375                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10376   %}
10377   ins_pipe( pipe_slow );
10378 %}
10379 
10380 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10381   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10382   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10383   ins_encode %{
10384     int vlen_enc = vector_length_encoding(this);
10385     BasicType bt = Matcher::vector_element_basic_type(this);
10386     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10387                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10388   %}
10389   ins_pipe( pipe_slow );
10390 %}
10391 
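      // The CastVV patterns below are compiler bookkeeping only: they emit no machine
      // code (size(0), empty encoding) and exist in kReg, vec and legVec flavours.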
10392 instruct castMM(kReg dst)
10393 %{
10394   match(Set dst (CastVV dst));
10395 
10396   size(0);
10397   format %{ "# castVV of $dst" %}
10398   ins_encode(/* empty encoding */);
10399   ins_cost(0);
10400   ins_pipe(empty);
10401 %}
10402 
10403 instruct castVV(vec dst)
10404 %{
10405   match(Set dst (CastVV dst));
10406 
10407   size(0);
10408   format %{ "# castVV of $dst" %}
10409   ins_encode(/* empty encoding */);
10410   ins_cost(0);
10411   ins_pipe(empty);
10412 %}
10413 
10414 instruct castVVLeg(legVec dst)
10415 %{
10416   match(Set dst (CastVV dst));
10417 
10418   size(0);
10419   format %{ "# castVV of $dst" %}
10420   ins_encode(/* empty encoding */);
10421   ins_cost(0);
10422   ins_pipe(empty);
10423 %}
10424 
10425 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10426 %{
10427   match(Set dst (IsInfiniteF src));
10428   effect(TEMP ktmp, KILL cr);
10429   format %{ "float_class_check $dst, $src" %}
10430   ins_encode %{
10431     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10432     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10433   %}
10434   ins_pipe(pipe_slow);
10435 %}
10436 
10437 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10438 %{
10439   match(Set dst (IsInfiniteD src));
10440   effect(TEMP ktmp, KILL cr);
10441   format %{ "double_class_check $dst, $src" %}
10442   ins_encode %{
10443     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10444     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10445   %}
10446   ins_pipe(pipe_slow);
10447 %}
10448 
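      // Saturating vector add/sub.  Byte and short elements map to the native x86
      // saturating instructions via vector_saturating_op(); int and long elements have no
      // direct hardware equivalent, so the *_dq_saturating_* helpers below emulate the
      // saturation, using opmask registers on EVEX targets and vector temporaries on AVX.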
10449 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10450 %{
10451   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10452             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10453   match(Set dst (SaturatingAddV src1 src2));
10454   match(Set dst (SaturatingSubV src1 src2));
10455   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10456   ins_encode %{
10457     int vlen_enc = vector_length_encoding(this);
10458     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10459     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10460                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10461   %}
10462   ins_pipe(pipe_slow);
10463 %}
10464 
10465 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10466 %{
10467   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10468             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10469   match(Set dst (SaturatingAddV src1 src2));
10470   match(Set dst (SaturatingSubV src1 src2));
10471   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10472   ins_encode %{
10473     int vlen_enc = vector_length_encoding(this);
10474     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10475     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10476                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10477   %}
10478   ins_pipe(pipe_slow);
10479 %}
10480 
10481 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10482 %{
10483   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10484             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10485             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10486   match(Set dst (SaturatingAddV src1 src2));
10487   match(Set dst (SaturatingSubV src1 src2));
10488   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10489   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10490   ins_encode %{
10491     int vlen_enc = vector_length_encoding(this);
10492     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10493     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10494                                         $src1$$XMMRegister, $src2$$XMMRegister,
10495                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10496                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10497   %}
10498   ins_pipe(pipe_slow);
10499 %}
10500 
10501 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10502 %{
10503   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10504             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10505             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10506   match(Set dst (SaturatingAddV src1 src2));
10507   match(Set dst (SaturatingSubV src1 src2));
10508   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10509   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10510   ins_encode %{
10511     int vlen_enc = vector_length_encoding(this);
10512     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10513     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10514                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10515                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10516   %}
10517   ins_pipe(pipe_slow);
10518 %}
10519 
10520 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10521 %{
10522   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10523             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10524             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10525   match(Set dst (SaturatingAddV src1 src2));
10526   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10527   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10528   ins_encode %{
10529     int vlen_enc = vector_length_encoding(this);
10530     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10531     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10532                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10533   %}
10534   ins_pipe(pipe_slow);
10535 %}
10536 
10537 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10538 %{
10539   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10540             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10541             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10542   match(Set dst (SaturatingAddV src1 src2));
10543   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10544   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10545   ins_encode %{
10546     int vlen_enc = vector_length_encoding(this);
10547     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10548     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10549                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10550   %}
10551   ins_pipe(pipe_slow);
10552 %}
10553 
10554 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10555 %{
10556   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10557             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10558             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10559   match(Set dst (SaturatingSubV src1 src2));
10560   effect(TEMP ktmp);
10561   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10562   ins_encode %{
10563     int vlen_enc = vector_length_encoding(this);
10564     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10565     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10566                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10567   %}
10568   ins_pipe(pipe_slow);
10569 %}
10570 
10571 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10572 %{
10573   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10574             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10575             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10576   match(Set dst (SaturatingSubV src1 src2));
10577   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10578   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10579   ins_encode %{
10580     int vlen_enc = vector_length_encoding(this);
10581     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10582     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10583                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10584   %}
10585   ins_pipe(pipe_slow);
10586 %}
10587 
10588 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10589 %{
10590   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10591             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10592   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10593   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10594   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10595   ins_encode %{
10596     int vlen_enc = vector_length_encoding(this);
10597     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10598     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10599                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10600   %}
10601   ins_pipe(pipe_slow);
10602 %}
10603 
10604 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10605 %{
10606   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10607             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10608   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10609   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10610   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10611   ins_encode %{
10612     int vlen_enc = vector_length_encoding(this);
10613     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10614     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10615                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10616   %}
10617   ins_pipe(pipe_slow);
10618 %}
10619 
10620 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10621   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10622             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10623   match(Set dst (SaturatingAddV (Binary dst src) mask));
10624   match(Set dst (SaturatingSubV (Binary dst src) mask));
10625   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10626   ins_encode %{
10627     int vlen_enc = vector_length_encoding(this);
10628     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10629     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10630                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10631   %}
10632   ins_pipe( pipe_slow );
10633 %}
10634 
10635 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10636   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10637             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10638   match(Set dst (SaturatingAddV (Binary dst src) mask));
10639   match(Set dst (SaturatingSubV (Binary dst src) mask));
10640   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10641   ins_encode %{
10642     int vlen_enc = vector_length_encoding(this);
10643     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10644     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10645                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10646   %}
10647   ins_pipe( pipe_slow );
10648 %}
10649 
10650 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10651   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10652             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10653   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10654   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10655   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10656   ins_encode %{
10657     int vlen_enc = vector_length_encoding(this);
10658     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10659     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10660                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10661   %}
10662   ins_pipe( pipe_slow );
10663 %}
10664 
10665 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10666   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10667             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10668   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10669   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10670   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10671   ins_encode %{
10672     int vlen_enc = vector_length_encoding(this);
10673     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10674     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10675                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10676   %}
10677   ins_pipe( pipe_slow );
10678 %}
10679 
10680 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10681 %{
10682   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10683   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10684   ins_encode %{
10685     int vlen_enc = vector_length_encoding(this);
10686     BasicType bt = Matcher::vector_element_basic_type(this);
10687     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10688   %}
10689   ins_pipe(pipe_slow);
10690 %}
10691 
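      // Scalar and packed Float16 operations.  ReinterpretS2HF/HF2S move the 16-bit value
      // between a GPR and an XMM register with vmovw; the arithmetic, sqrt, fma and
      // min/max patterns below rely on AVX-512 FP16 instructions, with dedicated AVX10.2
      // forms for min/max where available.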
10692 instruct reinterpretS2HF(regF dst, rRegI src)
10693 %{
10694   match(Set dst (ReinterpretS2HF src));
10695   format %{ "vmovw $dst, $src" %}
10696   ins_encode %{
10697     __ vmovw($dst$$XMMRegister, $src$$Register);
10698   %}
10699   ins_pipe(pipe_slow);
10700 %}
10701 
10702 instruct reinterpretHF2S(rRegI dst, regF src)
10703 %{
10704   match(Set dst (ReinterpretHF2S src));
10705   format %{ "vmovw $dst, $src" %}
10706   ins_encode %{
10707     __ vmovw($dst$$Register, $src$$XMMRegister);
10708   %}
10709   ins_pipe(pipe_slow);
10710 %}
10711 
10712 instruct convF2HFAndS2HF(regF dst, regF src)
10713 %{
10714   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10715   format %{ "convF2HFAndS2HF $dst, $src" %}
10716   ins_encode %{
10717     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10718   %}
10719   ins_pipe(pipe_slow);
10720 %}
10721 
10722 instruct convHF2SAndHF2F(regF dst, regF src)
10723 %{
10724   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10725   format %{ "convHF2SAndHF2F $dst, $src" %}
10726   ins_encode %{
10727     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10728   %}
10729   ins_pipe(pipe_slow);
10730 %}
10731 
10732 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10733 %{
10734   match(Set dst (SqrtHF src));
10735   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10736   ins_encode %{
10737     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10738   %}
10739   ins_pipe(pipe_slow);
10740 %}
10741 
10742 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10743 %{
10744   match(Set dst (AddHF src1 src2));
10745   match(Set dst (DivHF src1 src2));
10746   match(Set dst (MulHF src1 src2));
10747   match(Set dst (SubHF src1 src2));
10748   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10749   ins_encode %{
10750     int opcode = this->ideal_Opcode();
10751     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10752   %}
10753   ins_pipe(pipe_slow);
10754 %}
10755 
10756 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10757 %{
10758   predicate(VM_Version::supports_avx10_2());
10759   match(Set dst (MaxHF src1 src2));
10760   match(Set dst (MinHF src1 src2));
10761   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10762   ins_encode %{
10763     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10764     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10765   %}
10766   ins_pipe( pipe_slow );
10767 %}
10768 
10769 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10770 %{
10771   predicate(!VM_Version::supports_avx10_2());
10772   match(Set dst (MaxHF src1 src2));
10773   match(Set dst (MinHF src1 src2));
10774   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10775   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10776   ins_encode %{
10777     int opcode = this->ideal_Opcode();
10778     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10779                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10780   %}
10781   ins_pipe( pipe_slow );
10782 %}
10783 
10784 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10785 %{
10786   match(Set dst (FmaHF  src2 (Binary dst src1)));
10787   effect(DEF dst);
10788   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma scalarH" %}
10789   ins_encode %{
10790     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10791   %}
10792   ins_pipe( pipe_slow );
10793 %}
10794 
10795 
10796 instruct vector_sqrt_HF_reg(vec dst, vec src)
10797 %{
10798   match(Set dst (SqrtVHF src));
10799   format %{ "vector_sqrt_fp16 $dst, $src" %}
10800   ins_encode %{
10801     int vlen_enc = vector_length_encoding(this);
10802     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10803   %}
10804   ins_pipe(pipe_slow);
10805 %}
10806 
10807 instruct vector_sqrt_HF_mem(vec dst, memory src)
10808 %{
10809   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10810   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10811   ins_encode %{
10812     int vlen_enc = vector_length_encoding(this);
10813     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10814   %}
10815   ins_pipe(pipe_slow);
10816 %}
10817 
10818 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10819 %{
10820   match(Set dst (AddVHF src1 src2));
10821   match(Set dst (DivVHF src1 src2));
10822   match(Set dst (MulVHF src1 src2));
10823   match(Set dst (SubVHF src1 src2));
10824   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10825   ins_encode %{
10826     int vlen_enc = vector_length_encoding(this);
10827     int opcode = this->ideal_Opcode();
10828     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10829   %}
10830   ins_pipe(pipe_slow);
10831 %}
10832 
10833 
10834 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10835 %{
10836   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10837   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10838   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10839   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10840   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10841   ins_encode %{
10842     int vlen_enc = vector_length_encoding(this);
10843     int opcode = this->ideal_Opcode();
10844     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10845   %}
10846   ins_pipe(pipe_slow);
10847 %}
10848 
10849 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10850 %{
10851   match(Set dst (FmaVHF src2 (Binary dst src1)));
10852   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10853   ins_encode %{
10854     int vlen_enc = vector_length_encoding(this);
10855     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10856   %}
10857   ins_pipe( pipe_slow );
10858 %}
10859 
10860 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10861 %{
10862   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10863   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10864   ins_encode %{
10865     int vlen_enc = vector_length_encoding(this);
10866     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10867   %}
10868   ins_pipe( pipe_slow );
10869 %}
10870 
10871 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10872 %{
10873   predicate(VM_Version::supports_avx10_2());
10874   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10875   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10876   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10877   ins_encode %{
10878     int vlen_enc = vector_length_encoding(this);
10879     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10880     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10881   %}
10882   ins_pipe( pipe_slow );
10883 %}
10884 
10885 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
10886 %{
10887   predicate(VM_Version::supports_avx10_2());
10888   match(Set dst (MinVHF src1 src2));
10889   match(Set dst (MaxVHF src1 src2));
10890   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
10891   ins_encode %{
10892     int vlen_enc = vector_length_encoding(this);
10893     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10894     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
10895   %}
10896   ins_pipe( pipe_slow );
10897 %}
10898 
10899 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10900 %{
10901   predicate(!VM_Version::supports_avx10_2());
10902   match(Set dst (MinVHF src1 src2));
10903   match(Set dst (MaxVHF src1 src2));
10904   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10905   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10906   ins_encode %{
10907     int vlen_enc = vector_length_encoding(this);
10908     int opcode = this->ideal_Opcode();
10909     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10910                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10911   %}
10912   ins_pipe( pipe_slow );
10913 %}