//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

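// For example, the first definition below,
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// declares the first 32-bit word of XMM0 as Save-On-Call for both the
// register allocator and the C calling convention, spilled and filled as a
// Float (Op_RegF), with hardware encoding 0, and backed by the VMReg slot
// returned by xmm0->as_VMReg().
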
// XMM registers.  512-bit registers, 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());

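// Allocation classes group the registers defined above into chunks for the
// register allocator; the register classes defined further below name the
// subsets of those registers that individual instruction operands may use.
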
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

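// reg_class_dynamic selects one of two statically defined classes based on
// the predicate in %{ ... %}, evaluated against the CPU features detected at
// VM startup: when the predicate holds, the first (EVEX, XMM0-XMM31) class
// is used, otherwise the second (legacy, XMM0-XMM15) class.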
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,
                      XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d);

// Class for all 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,
                      XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d);

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for all 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5 byte instructions plus one move for unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
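
// Illustrative use of the helpers above (a sketch, not copied from a particular instruct block;
// the operand names are hypothetical): an encoding typically derives the AVX/EVEX length bits
// from the node being emitted, e.g.
//   int vlen_enc = vector_length_encoding(this);
//   __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);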
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
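
// Intent of the predicate above: with AVX512CD, CountLeadingZeros over int/long lanes can be
// emitted directly with vplzcntd/vplzcntq; vectors narrower than 512 bits additionally need
// AVX512VL for the 128/256-bit encodings of those instructions.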
 1189 
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
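
// These platform-dependent flags extend the generic Node flag space. Flag_intel_jcc_erratum is
// set when IntelJccErratum::tag_affected_machnodes() runs (see pd_perform_mach_node_analysis()
// below) and is consumed by MachNode::compute_padding(); the remaining flags record which EFLAGS
// condition bits a node is known to set or clear.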
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
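
// Background for the two hooks above: the Intel JCC erratum concerns jumps (including macro-fused
// cmp/jcc pairs) that cross or end on a 32-byte boundary. pd_alignment_required() reserves
// worst-case space for affected branches, and compute_padding() asks IntelJccErratum for the
// exact number of fill bytes needed at the current offset so the branch avoids that boundary.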
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // push a "the_pc" on the stack without destroying any registers
 1276   // as they all may be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
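
// Note on the handler above: the call/bind/subptr sequence materializes "the_pc" on the stack
// without clobbering any register. The call pushes the address of "next", and the subptr then
// subtracts the distance from "the_pc" to "next" from that stack slot, leaving "the_pc" itself
// there before the jump to the deopt blob's unpack entry.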
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
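
// Example (illustrative): widthForType(T_SHORT) yields Assembler::W; the Assembler::Width value
// is how the element size is passed to macro-assembler helpers that are parameterized on it.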
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
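
// Example of how the table above plays out (illustrative): on a CPU with AVX2 but no AVX-512,
// Op_RoundVF is accepted (it only needs UseAVX >= 2) while Op_RoundVD is rejected (it needs
// UseAVX > 2), and any opcode without a match rule at all is rejected up front by
// has_match_rule().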
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
 1586 // Identify extra cases that we might want to provide match rules for vector nodes and
 1587 // other intrinsics guarded with vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
      } // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
 1728       if(vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special handling
        // code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128) {
 1835          return false;
 1836        }
 1837        if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1838          return false;
 1839        }
 1840        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1841          return false;
 1842        }
 1843        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1844          return false;
 1845        }
 1846        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1847          return false;
 1848        }
 1849        break;
 1850     case Op_MaskAll:
 1851       if (!VM_Version::supports_evex()) {
 1852         return false;
 1853       }
 1854       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1855         return false;
 1856       }
 1857       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1858         return false;
 1859       }
 1860       break;
 1861     case Op_VectorMaskCmp:
 1862       if (vlen < 2 || size_in_bits < 32) {
 1863         return false;
 1864       }
 1865       break;
 1866     case Op_CompressM:
 1867       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_CompressV:
 1872     case Op_ExpandV:
 1873       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1874         return false;
 1875       }
      if (size_in_bits < 128) {
 1877         return false;
 1878       }
 1879     case Op_VectorLongToMask:
 1880       if (UseAVX < 1) {
 1881         return false;
 1882       }
 1883       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1884         return false;
 1885       }
 1886       break;
 1887     case Op_SignumVD:
 1888     case Op_SignumVF:
 1889       if (UseAVX < 1) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_PopCountVI:
 1894     case Op_PopCountVL: {
 1895         if (!is_pop_count_instr_target(bt) &&
 1896             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1897           return false;
 1898         }
 1899       }
 1900       break;
 1901     case Op_ReverseV:
 1902     case Op_ReverseBytesV:
 1903       if (UseAVX < 2) {
 1904         return false;
 1905       }
 1906       break;
 1907     case Op_CountTrailingZerosV:
 1908     case Op_CountLeadingZerosV:
 1909       if (UseAVX < 2) {
 1910         return false;
 1911       }
 1912       break;
 1913   }
  return true;  // By default, match rules are supported.
 1915 }
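
// Worked example (illustrative): a 256-bit PopCountVI over T_BYTE lanes (vlen = 32,
// size_in_bits = 256) needs AVX2 to pass match_rule_supported(), fits vector_size_supported(),
// and the PopCountVI/PopCountVL case above only rejects the 512-bit variant when the element
// type is not a popcount instruction target and AVX512BW is unavailable.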
 1916 
 1917 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern based
  // on the IR opcode. Most of the unary/binary/ternary masked operations share the IR nodes
  // of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine does a strict check on the existence of masked operation patterns
  // by returning false for all opcodes other than the ones whose masked instruction
  // patterns are defined in this file.
 1924   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1925     return false;
 1926   }
 1927 
 1928   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1929   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1930     return false;
 1931   }
 1932   switch(opcode) {
 1933     // Unary masked operations
 1934     case Op_AbsVB:
 1935     case Op_AbsVS:
 1936       if(!VM_Version::supports_avx512bw()) {
 1937         return false;  // Implementation limitation
 1938       }
 1939     case Op_AbsVI:
 1940     case Op_AbsVL:
 1941       return true;
 1942 
 1943     // Ternary masked operations
 1944     case Op_FmaVF:
 1945     case Op_FmaVD:
 1946       return true;
 1947 
 1948     case Op_MacroLogicV:
 1949       if(bt != T_INT && bt != T_LONG) {
 1950         return false;
 1951       }
 1952       return true;
 1953 
 1954     // Binary masked operations
 1955     case Op_AddVB:
 1956     case Op_AddVS:
 1957     case Op_SubVB:
 1958     case Op_SubVS:
 1959     case Op_MulVS:
 1960     case Op_LShiftVS:
 1961     case Op_RShiftVS:
 1962     case Op_URShiftVS:
 1963       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1964       if (!VM_Version::supports_avx512bw()) {
 1965         return false;  // Implementation limitation
 1966       }
 1967       return true;
 1968 
 1969     case Op_MulVL:
 1970       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1971       if (!VM_Version::supports_avx512dq()) {
 1972         return false;  // Implementation limitation
 1973       }
 1974       return true;
 1975 
 1976     case Op_AndV:
 1977     case Op_OrV:
 1978     case Op_XorV:
 1979     case Op_RotateRightV:
 1980     case Op_RotateLeftV:
 1981       if (bt != T_INT && bt != T_LONG) {
 1982         return false; // Implementation limitation
 1983       }
 1984       return true;
 1985 
 1986     case Op_VectorLoadMask:
 1987       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1988       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1989         return false;
 1990       }
 1991       return true;
 1992 
 1993     case Op_AddVI:
 1994     case Op_AddVL:
 1995     case Op_AddVF:
 1996     case Op_AddVD:
 1997     case Op_SubVI:
 1998     case Op_SubVL:
 1999     case Op_SubVF:
 2000     case Op_SubVD:
 2001     case Op_MulVI:
 2002     case Op_MulVF:
 2003     case Op_MulVD:
 2004     case Op_DivVF:
 2005     case Op_DivVD:
 2006     case Op_SqrtVF:
 2007     case Op_SqrtVD:
 2008     case Op_LShiftVI:
 2009     case Op_LShiftVL:
 2010     case Op_RShiftVI:
 2011     case Op_RShiftVL:
 2012     case Op_URShiftVI:
 2013     case Op_URShiftVL:
 2014     case Op_LoadVectorMasked:
 2015     case Op_StoreVectorMasked:
 2016     case Op_LoadVectorGatherMasked:
 2017     case Op_StoreVectorScatterMasked:
 2018       return true;
 2019 
 2020     case Op_UMinV:
 2021     case Op_UMaxV:
 2022       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2023         return false;
 2024       } // fallthrough
 2025     case Op_MaxV:
 2026     case Op_MinV:
 2027       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2031         return false; // Implementation limitation
 2032       }
 2033       return true;
 2034     case Op_SaturatingAddV:
 2035     case Op_SaturatingSubV:
 2036       if (!is_subword_type(bt)) {
 2037         return false;
 2038       }
 2039       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2040         return false; // Implementation limitation
 2041       }
 2042       return true;
 2043 
 2044     case Op_VectorMaskCmp:
 2045       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2046         return false; // Implementation limitation
 2047       }
 2048       return true;
 2049 
 2050     case Op_VectorRearrange:
 2051       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2052         return false; // Implementation limitation
 2053       }
 2054       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2055         return false; // Implementation limitation
 2056       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2057         return false; // Implementation limitation
 2058       }
 2059       return true;
 2060 
 2061     // Binary Logical operations
 2062     case Op_AndVMask:
 2063     case Op_OrVMask:
 2064     case Op_XorVMask:
 2065       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_PopCountVI:
 2071     case Op_PopCountVL:
 2072       if (!is_pop_count_instr_target(bt)) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_MaskAll:
 2078       return true;
 2079 
 2080     case Op_CountLeadingZerosV:
 2081       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2082         return true;
 2083       }
 2084     default:
 2085       return false;
 2086   }
 2087 }
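
// For example (illustrative), a masked 512-bit AddVI is accepted above via the generic binary
// group, while a masked AddVB additionally requires AVX512BW because byte/short lanes only get
// masked arithmetic with the BW extension.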
 2088 
 2089 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2090   return false;
 2091 }
 2092 
 2093 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2094 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2095   switch (elem_bt) {
 2096     case T_BYTE:  return false;
 2097     case T_SHORT: return !VM_Version::supports_avx512bw();
 2098     case T_INT:   return !VM_Version::supports_avx();
 2099     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2100     default:
 2101       ShouldNotReachHere();
 2102       return false;
 2103   }
 2104 }
 2105 
 2106 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2107   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2108   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2109   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2110       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2111     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2112     return new legVecZOper();
 2113   }
 2114   if (legacy) {
 2115     switch (ideal_reg) {
 2116       case Op_VecS: return new legVecSOper();
 2117       case Op_VecD: return new legVecDOper();
 2118       case Op_VecX: return new legVecXOper();
 2119       case Op_VecY: return new legVecYOper();
 2120       case Op_VecZ: return new legVecZOper();
 2121     }
 2122   } else {
 2123     switch (ideal_reg) {
 2124       case Op_VecS: return new vecSOper();
 2125       case Op_VecD: return new vecDOper();
 2126       case Op_VecX: return new vecXOper();
 2127       case Op_VecY: return new vecYOper();
 2128       case Op_VecZ: return new vecZOper();
 2129     }
 2130   }
 2131   ShouldNotReachHere();
 2132   return nullptr;
 2133 }
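
// The legVec* operands used above restrict allocation to XMM0-XMM15, i.e. the registers that
// legacy (non-EVEX) encodings can address. The special case keeps 512-bit TEMP operands in that
// range on KNL-class hardware (AVX-512 without the VL/BW/DQ extensions), where parts of an
// operation may have to fall back to non-EVEX instructions.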
 2134 
 2135 bool Matcher::is_reg2reg_move(MachNode* m) {
 2136   switch (m->rule()) {
 2137     case MoveVec2Leg_rule:
 2138     case MoveLeg2Vec_rule:
 2139     case MoveF2VL_rule:
 2140     case MoveF2LEG_rule:
 2141     case MoveVL2F_rule:
 2142     case MoveLEG2F_rule:
 2143     case MoveD2VL_rule:
 2144     case MoveD2LEG_rule:
 2145     case MoveVL2D_rule:
 2146     case MoveLEG2D_rule:
 2147       return true;
 2148     default:
 2149       return false;
 2150   }
 2151 }
 2152 
 2153 bool Matcher::is_generic_vector(MachOper* opnd) {
 2154   switch (opnd->opcode()) {
 2155     case VEC:
 2156     case LEGVEC:
 2157       return true;
 2158     default:
 2159       return false;
 2160   }
 2161 }
 2162 
 2163 //------------------------------------------------------------------------
 2164 
 2165 const RegMask* Matcher::predicate_reg_mask(void) {
 2166   return &_VECTMASK_REG_mask;
 2167 }
 2168 
 2169 // Max vector size in bytes. 0 if not supported.
 2170 int Matcher::vector_width_in_bytes(BasicType bt) {
 2171   assert(is_java_primitive(bt), "only primitive type vectors");
  // SSE2 supports 128bit vectors for all types.
  // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
 2175   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2176   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2177   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2178     size = (UseAVX > 2) ? 64 : 32;
 2179   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2180     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2181   // Use flag to limit vector size.
 2182   size = MIN2(size,(int)MaxVectorSize);
 2183   // Minimum 2 values in vector (or 4 for bytes).
 2184   switch (bt) {
 2185   case T_DOUBLE:
 2186   case T_LONG:
 2187     if (size < 16) return 0;
 2188     break;
 2189   case T_FLOAT:
 2190   case T_INT:
 2191     if (size < 8) return 0;
 2192     break;
 2193   case T_BOOLEAN:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_CHAR:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_BYTE:
 2200     if (size < 4) return 0;
 2201     break;
 2202   case T_SHORT:
 2203     if (size < 4) return 0;
 2204     break;
 2205   default:
 2206     ShouldNotReachHere();
 2207   }
 2208   return size;
 2209 }
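
// Worked example (illustrative): with UseAVX=3, AVX512BW available and MaxVectorSize=64, T_BYTE
// vectors get 64 bytes; with the same flags but without AVX512BW, subword types (BYTE/SHORT/CHAR)
// are capped at 32 bytes while INT/LONG/FLOAT/DOUBLE vectors still get 64.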
 2210 
 2211 // Limits on vector size (number of elements) loaded into vector.
 2212 int Matcher::max_vector_size(const BasicType bt) {
 2213   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2214 }
 2215 int Matcher::min_vector_size(const BasicType bt) {
 2216   int max_size = max_vector_size(bt);
 2217   // Min size which can be loaded into vector is 4 bytes.
 2218   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2219   // Support for calling svml double64 vectors
 2220   if (bt == T_DOUBLE) {
 2221     size = 1;
 2222   }
  return MIN2(size, max_size);
 2224 }
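
// For example, min_vector_size(T_BYTE) is 4 and min_vector_size(T_INT) is 2 (both
// capped by max_vector_size), while min_vector_size(T_DOUBLE) is 1 so that scalar
// doubles can still be handed to the SVML double64 routines mentioned above.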
 2225 
 2226 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2227   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2228   // by default on Cascade Lake
 2229   if (VM_Version::is_default_intel_cascade_lake()) {
 2230     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2231   }
 2232   return Matcher::max_vector_size(bt);
 2233 }
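
// For instance, on a default-configured Cascade Lake with 512-bit vectors enabled,
// auto-vectorization of T_INT is capped here at 32 / 4 = 8 elements even though
// max_vector_size(T_INT) typically reports 16.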
 2234 
 2235 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2236   return -1;
 2237 }
 2238 
 2239 // Vector ideal reg corresponding to specified size in bytes
 2240 uint Matcher::vector_ideal_reg(int size) {
 2241   assert(MaxVectorSize >= size, "");
 2242   switch(size) {
 2243     case  4: return Op_VecS;
 2244     case  8: return Op_VecD;
 2245     case 16: return Op_VecX;
 2246     case 32: return Op_VecY;
 2247     case 64: return Op_VecZ;
 2248   }
 2249   ShouldNotReachHere();
 2250   return 0;
 2251 }
 2252 
// Check for (and clone) a shift by a small constant as well; only shifts of <= 3
// can be folded into an x86 scaled addressing mode (*1/*2/*4/*8).
 2254 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2255   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2256       shift->in(2)->get_int() <= 3 &&
 2257       // Are there other uses besides address expressions?
 2258       !matcher->is_visited(shift)) {
 2259     address_visited.set(shift->_idx); // Flag as address_visited
 2260     mstack.push(shift->in(2), Matcher::Visit);
 2261     Node *conv = shift->in(1);
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
 2265     if (conv->Opcode() == Op_ConvI2L &&
 2266         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2267         // Are there other uses besides address expressions?
 2268         !matcher->is_visited(conv)) {
 2269       address_visited.set(conv->_idx); // Flag as address_visited
 2270       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2271     } else {
 2272       mstack.push(conv, Matcher::Pre_Visit);
 2273     }
 2274     return true;
 2275   }
 2276   return false;
 2277 }
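
// For example, a scaled array index (i << 2) is cloned here (the shift amount is
// limited to <= 3 because x86 addressing modes only scale by 1, 2, 4 or 8), so the
// match rules can fold it into a [base + index*scale + disp] operand instead of
// computing the scaled index into a separate register.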
 2278 
// This function identifies sub-graphs in which a 'load' node is an input
// to two different nodes, such that the whole sub-graph can be matched to
// BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2285 //
 2286 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2287 // This is a temporary solution until we make DAGs expressible in ADL.
 2288 template<typename ConType>
 2289 class FusedPatternMatcher {
 2290   Node* _op1_node;
 2291   Node* _mop_node;
 2292   int _con_op;
 2293 
 2294   static int match_next(Node* n, int next_op, int next_op_idx) {
 2295     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2296       return -1;
 2297     }
 2298 
    if (next_op_idx == -1) { // n is commutative, try both input orders
 2300       if (n->in(1)->Opcode() == next_op) {
 2301         return 1;
 2302       } else if (n->in(2)->Opcode() == next_op) {
 2303         return 2;
 2304       }
 2305     } else {
 2306       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2307       if (n->in(next_op_idx)->Opcode() == next_op) {
 2308         return next_op_idx;
 2309       }
 2310     }
 2311     return -1;
 2312   }
 2313 
 2314  public:
 2315   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2316     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2317 
 2318   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2319              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2320              typename ConType::NativeType con_value) {
 2321     if (_op1_node->Opcode() != op1) {
 2322       return false;
 2323     }
 2324     if (_mop_node->outcnt() > 2) {
 2325       return false;
 2326     }
 2327     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2328     if (op1_op2_idx == -1) {
 2329       return false;
 2330     }
 2331     // Memory operation must be the other edge
 2332     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2333 
 2334     // Check that the mop node is really what we want
 2335     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2336       Node* op2_node = _op1_node->in(op1_op2_idx);
 2337       if (op2_node->outcnt() > 1) {
 2338         return false;
 2339       }
 2340       assert(op2_node->Opcode() == op2, "Should be");
 2341       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2342       if (op2_con_idx == -1) {
 2343         return false;
 2344       }
 2345       // Memory operation must be the other edge
 2346       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2347       // Check that the memory operation is the same node
 2348       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2349         // Now check the constant
 2350         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2351         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2352           return true;
 2353         }
 2354       }
 2355     }
 2356     return false;
 2357   }
 2358 };
 2359 
 2360 static bool is_bmi_pattern(Node* n, Node* m) {
 2361   assert(UseBMI1Instructions, "sanity");
 2362   if (n != nullptr && m != nullptr) {
 2363     if (m->Opcode() == Op_LoadI) {
 2364       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2365       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2366              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2367              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2368     } else if (m->Opcode() == Op_LoadL) {
 2369       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2370       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2371              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2372              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2373     }
 2374   }
 2375   return false;
 2376 }
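
// The three patterns above correspond to the BMI1 idioms (long flavor shown, the
// int flavor is analogous), where x is the load feeding both inputs:
//   blsi:   x & (0 - x)     <->  bmil.match(Op_AndL, -1, Op_SubL,  1,  0)
//   blsr:   x & (x + (-1))  <->  bmil.match(Op_AndL, -1, Op_AddL, -1, -1)
//   blsmsk: x ^ (x + (-1))  <->  bmil.match(Op_XorL, -1, Op_AddL, -1, -1)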
 2377 
 2378 // Should the matcher clone input 'm' of node 'n'?
 2379 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2380   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2381   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2382     mstack.push(m, Visit);
 2383     return true;
 2384   }
 2385   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2386     mstack.push(m, Visit);           // m = ShiftCntV
 2387     return true;
 2388   }
 2389   if (is_encode_and_store_pattern(n, m)) {
 2390     mstack.push(m, Visit);
 2391     return true;
 2392   }
 2393   return false;
 2394 }
 2395 
 2396 // Should the Matcher clone shifts on addressing modes, expecting them
 2397 // to be subsumed into complex addressing expressions or compute them
 2398 // into registers?
 2399 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2400   Node *off = m->in(AddPNode::Offset);
 2401   if (off->is_Con()) {
 2402     address_visited.test_set(m->_idx); // Flag as address_visited
 2403     Node *adr = m->in(AddPNode::Address);
 2404 
 2405     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2406     // AtomicAdd is not an addressing expression.
 2407     // Cheap to find it by looking for screwy base.
 2408     if (adr->is_AddP() &&
 2409         !adr->in(AddPNode::Base)->is_top() &&
 2410         !adr->in(AddPNode::Offset)->is_Con() &&
 2411         off->get_long() == (int) (off->get_long()) && // immL32
 2412         // Are there other uses besides address expressions?
 2413         !is_visited(adr)) {
 2414       address_visited.set(adr->_idx); // Flag as address_visited
 2415       Node *shift = adr->in(AddPNode::Offset);
 2416       if (!clone_shift(shift, this, mstack, address_visited)) {
 2417         mstack.push(shift, Pre_Visit);
 2418       }
 2419       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2420       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2421     } else {
 2422       mstack.push(adr, Pre_Visit);
 2423     }
 2424 
 2425     // Clone X+offset as it also folds into most addressing expressions
 2426     mstack.push(off, Visit);
 2427     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2428     return true;
 2429   } else if (clone_shift(off, this, mstack, address_visited)) {
 2430     address_visited.test_set(m->_idx); // Flag as address_visited
 2431     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2432     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2433     return true;
 2434   }
 2435   return false;
 2436 }
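
// Illustrative shape of the sub-graph handled above (a sketch, names are not from
// real IR dumps): an LP64 array access a[i] typically reaches here as
//   (AddP base (AddP base array_adr (LShiftL (ConvI2L i) #2)) #disp)
// Cloning the constant offset, the small-constant shift and (for a non-negative
// index) the ConvI2L lets a single match rule subsume the whole expression into one
// [base + index*4 + disp] memory operand.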
 2437 
 2438 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2439   switch (bt) {
 2440     case BoolTest::eq:
 2441       return Assembler::eq;
 2442     case BoolTest::ne:
 2443       return Assembler::neq;
 2444     case BoolTest::le:
 2445     case BoolTest::ule:
 2446       return Assembler::le;
 2447     case BoolTest::ge:
 2448     case BoolTest::uge:
 2449       return Assembler::nlt;
 2450     case BoolTest::lt:
 2451     case BoolTest::ult:
 2452       return Assembler::lt;
 2453     case BoolTest::gt:
 2454     case BoolTest::ugt:
 2455       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2457   }
 2458 }
 2459 
 2460 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2461   switch (bt) {
 2462   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2463   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2464   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2465   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2466   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2467   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2468   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2469   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2470   }
 2471 }
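
// For example, when either operand is NaN the Java expression (x != y) must be true,
// which NEQ_UQ delivers (unordered compares evaluate to true), while the ordered OQ
// predicates used for ==, <, <=, > and >= evaluate to false on NaN inputs, as required.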
 2472 
 2473 // Helper methods for MachSpillCopyNode::implementation().
 2474 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2475                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2476   assert(ireg == Op_VecS || // 32bit vector
 2477          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2478           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2479          "no non-adjacent vector moves" );
 2480   if (masm) {
 2481     switch (ireg) {
 2482     case Op_VecS: // copy whole register
 2483     case Op_VecD:
 2484     case Op_VecX:
 2485       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2486         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2487       } else {
 2488         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2489      }
 2490       break;
 2491     case Op_VecY:
 2492       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2493         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2494       } else {
 2495         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2496      }
 2497       break;
 2498     case Op_VecZ:
 2499       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2500       break;
 2501     default:
 2502       ShouldNotReachHere();
 2503     }
 2504 #ifndef PRODUCT
 2505   } else {
 2506     switch (ireg) {
 2507     case Op_VecS:
 2508     case Op_VecD:
 2509     case Op_VecX:
 2510       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2511       break;
 2512     case Op_VecY:
 2513     case Op_VecZ:
 2514       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2515       break;
 2516     default:
 2517       ShouldNotReachHere();
 2518     }
 2519 #endif
 2520   }
 2521 }
 2522 
 2523 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2524                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2525   if (masm) {
 2526     if (is_load) {
 2527       switch (ireg) {
 2528       case Op_VecS:
 2529         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecD:
 2532         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2533         break;
 2534       case Op_VecX:
 2535         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2536           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2537         } else {
 2538           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2540         }
 2541         break;
 2542       case Op_VecY:
 2543         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2544           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2545         } else {
 2546           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2548         }
 2549         break;
 2550       case Op_VecZ:
 2551         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2552         break;
 2553       default:
 2554         ShouldNotReachHere();
 2555       }
 2556     } else { // store
 2557       switch (ireg) {
 2558       case Op_VecS:
 2559         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecD:
 2562         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2563         break;
 2564       case Op_VecX:
 2565         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2566           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2567         }
 2568         else {
 2569           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2570         }
 2571         break;
 2572       case Op_VecY:
 2573         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2574           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2575         }
 2576         else {
 2577           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2578         }
 2579         break;
 2580       case Op_VecZ:
 2581         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2582         break;
 2583       default:
 2584         ShouldNotReachHere();
 2585       }
 2586     }
 2587 #ifndef PRODUCT
 2588   } else {
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594       case Op_VecD:
 2595         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597        case Op_VecX:
 2598         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2599         break;
 2600       case Op_VecY:
 2601       case Op_VecZ:
 2602         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2603         break;
 2604       default:
 2605         ShouldNotReachHere();
 2606       }
 2607     } else { // store
 2608       switch (ireg) {
 2609       case Op_VecS:
 2610         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612       case Op_VecD:
 2613         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615        case Op_VecX:
 2616         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2617         break;
 2618       case Op_VecY:
 2619       case Op_VecZ:
 2620         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2621         break;
 2622       default:
 2623         ShouldNotReachHere();
 2624       }
 2625     }
 2626 #endif
 2627   }
 2628 }
 2629 
 2630 template <class T>
 2631 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2632   int size = type2aelembytes(bt) * len;
 2633   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2634   for (int i = 0; i < len; i++) {
 2635     int offset = i * type2aelembytes(bt);
 2636     switch (bt) {
 2637       case T_BYTE: val->at(i) = con; break;
 2638       case T_SHORT: {
 2639         jshort c = con;
 2640         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2641         break;
 2642       }
 2643       case T_INT: {
 2644         jint c = con;
 2645         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2646         break;
 2647       }
 2648       case T_LONG: {
 2649         jlong c = con;
 2650         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2651         break;
 2652       }
 2653       case T_FLOAT: {
 2654         jfloat c = con;
 2655         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2656         break;
 2657       }
 2658       case T_DOUBLE: {
 2659         jdouble c = con;
 2660         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2661         break;
 2662       }
 2663       default: assert(false, "%s", type2name(bt));
 2664     }
 2665   }
 2666   return val;
 2667 }
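
// Example (x86 is little-endian): vreplicate_imm(T_SHORT, (jshort)0x1234, 4) returns
// the 8-byte sequence 34 12 34 12 34 12 34 12, i.e. the 16-bit constant broadcast
// across a 64-bit lane.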
 2668 
 2669 static inline jlong high_bit_set(BasicType bt) {
 2670   switch (bt) {
 2671     case T_BYTE:  return 0x8080808080808080;
 2672     case T_SHORT: return 0x8000800080008000;
 2673     case T_INT:   return 0x8000000080000000;
 2674     case T_LONG:  return 0x8000000000000000;
 2675     default:
 2676       ShouldNotReachHere();
 2677       return 0;
 2678   }
 2679 }
 2680 
 2681 #ifndef PRODUCT
 2682   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2683     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2684   }
 2685 #endif
 2686 
 2687   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2688     __ nop(_count);
 2689   }
 2690 
 2691   uint MachNopNode::size(PhaseRegAlloc*) const {
 2692     return _count;
 2693   }
 2694 
 2695 #ifndef PRODUCT
 2696   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2697     st->print("# breakpoint");
 2698   }
 2699 #endif
 2700 
 2701   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2702     __ int3();
 2703   }
 2704 
 2705   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2706     return MachNode::size(ra_);
 2707   }
 2708 
 2709 %}
 2710 
 2711 encode %{
 2712 
 2713   enc_class call_epilog %{
 2714     if (VerifyStackAtCalls) {
 2715       // Check that stack depth is unchanged: find majik cookie on stack
 2716       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2717       Label L;
 2718       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2719       __ jccb(Assembler::equal, L);
 2720       // Die if stack mismatch
 2721       __ int3();
 2722       __ bind(L);
 2723     }
 2724   %}
 2725 
 2726 %}
 2727 
// Operands for bound floating point register arguments
 2729 operand rxmm0() %{
 2730   constraint(ALLOC_IN_RC(xmm0_reg));
 2731   match(VecX);
  format %{ %}
 2733   interface(REG_INTER);
 2734 %}
 2735 
 2736 //----------OPERANDS-----------------------------------------------------------
 2737 // Operand definitions must precede instruction definitions for correct parsing
 2738 // in the ADLC because operands constitute user defined types which are used in
 2739 // instruction definitions.
 2740 
 2741 // Vectors
 2742 
 2743 // Dummy generic vector class. Should be used for all vector operands.
 2744 // Replaced with vec[SDXYZ] during post-selection pass.
 2745 operand vec() %{
 2746   constraint(ALLOC_IN_RC(dynamic));
 2747   match(VecX);
 2748   match(VecY);
 2749   match(VecZ);
 2750   match(VecS);
 2751   match(VecD);
 2752 
 2753   format %{ %}
 2754   interface(REG_INTER);
 2755 %}
 2756 
 2757 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2758 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2759 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2760 // runtime code generation via reg_class_dynamic.
 2761 operand legVec() %{
 2762   constraint(ALLOC_IN_RC(dynamic));
 2763   match(VecX);
 2764   match(VecY);
 2765   match(VecZ);
 2766   match(VecS);
 2767   match(VecD);
 2768 
 2769   format %{ %}
 2770   interface(REG_INTER);
 2771 %}
 2772 
 2773 // Replaces vec during post-selection cleanup. See above.
 2774 operand vecS() %{
 2775   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2776   match(VecS);
 2777 
 2778   format %{ %}
 2779   interface(REG_INTER);
 2780 %}
 2781 
 2782 // Replaces legVec during post-selection cleanup. See above.
 2783 operand legVecS() %{
 2784   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2785   match(VecS);
 2786 
 2787   format %{ %}
 2788   interface(REG_INTER);
 2789 %}
 2790 
 2791 // Replaces vec during post-selection cleanup. See above.
 2792 operand vecD() %{
 2793   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2794   match(VecD);
 2795 
 2796   format %{ %}
 2797   interface(REG_INTER);
 2798 %}
 2799 
 2800 // Replaces legVec during post-selection cleanup. See above.
 2801 operand legVecD() %{
 2802   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2803   match(VecD);
 2804 
 2805   format %{ %}
 2806   interface(REG_INTER);
 2807 %}
 2808 
 2809 // Replaces vec during post-selection cleanup. See above.
 2810 operand vecX() %{
 2811   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2812   match(VecX);
 2813 
 2814   format %{ %}
 2815   interface(REG_INTER);
 2816 %}
 2817 
 2818 // Replaces legVec during post-selection cleanup. See above.
 2819 operand legVecX() %{
 2820   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2821   match(VecX);
 2822 
 2823   format %{ %}
 2824   interface(REG_INTER);
 2825 %}
 2826 
 2827 // Replaces vec during post-selection cleanup. See above.
 2828 operand vecY() %{
 2829   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2830   match(VecY);
 2831 
 2832   format %{ %}
 2833   interface(REG_INTER);
 2834 %}
 2835 
 2836 // Replaces legVec during post-selection cleanup. See above.
 2837 operand legVecY() %{
 2838   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2839   match(VecY);
 2840 
 2841   format %{ %}
 2842   interface(REG_INTER);
 2843 %}
 2844 
 2845 // Replaces vec during post-selection cleanup. See above.
 2846 operand vecZ() %{
 2847   constraint(ALLOC_IN_RC(vectorz_reg));
 2848   match(VecZ);
 2849 
 2850   format %{ %}
 2851   interface(REG_INTER);
 2852 %}
 2853 
 2854 // Replaces legVec during post-selection cleanup. See above.
 2855 operand legVecZ() %{
 2856   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2857   match(VecZ);
 2858 
 2859   format %{ %}
 2860   interface(REG_INTER);
 2861 %}
 2862 
 2863 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2864 
 2865 // ============================================================================
 2866 
 2867 instruct ShouldNotReachHere() %{
 2868   match(Halt);
 2869   format %{ "stop\t# ShouldNotReachHere" %}
 2870   ins_encode %{
 2871     if (is_reachable()) {
 2872       const char* str = __ code_string(_halt_reason);
 2873       __ stop(str);
 2874     }
 2875   %}
 2876   ins_pipe(pipe_slow);
 2877 %}
 2878 
 2879 // ============================================================================
 2880 
 2881 instruct addF_reg(regF dst, regF src) %{
 2882   predicate(UseAVX == 0);
 2883   match(Set dst (AddF dst src));
 2884 
 2885   format %{ "addss   $dst, $src" %}
 2886   ins_cost(150);
 2887   ins_encode %{
 2888     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2889   %}
 2890   ins_pipe(pipe_slow);
 2891 %}
 2892 
 2893 instruct addF_mem(regF dst, memory src) %{
 2894   predicate(UseAVX == 0);
 2895   match(Set dst (AddF dst (LoadF src)));
 2896 
 2897   format %{ "addss   $dst, $src" %}
 2898   ins_cost(150);
 2899   ins_encode %{
 2900     __ addss($dst$$XMMRegister, $src$$Address);
 2901   %}
 2902   ins_pipe(pipe_slow);
 2903 %}
 2904 
 2905 instruct addF_imm(regF dst, immF con) %{
 2906   predicate(UseAVX == 0);
 2907   match(Set dst (AddF dst con));
 2908   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2909   ins_cost(150);
 2910   ins_encode %{
 2911     __ addss($dst$$XMMRegister, $constantaddress($con));
 2912   %}
 2913   ins_pipe(pipe_slow);
 2914 %}
 2915 
 2916 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2917   predicate(UseAVX > 0);
 2918   match(Set dst (AddF src1 src2));
 2919 
 2920   format %{ "vaddss  $dst, $src1, $src2" %}
 2921   ins_cost(150);
 2922   ins_encode %{
 2923     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2924   %}
 2925   ins_pipe(pipe_slow);
 2926 %}
 2927 
 2928 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2929   predicate(UseAVX > 0);
 2930   match(Set dst (AddF src1 (LoadF src2)));
 2931 
 2932   format %{ "vaddss  $dst, $src1, $src2" %}
 2933   ins_cost(150);
 2934   ins_encode %{
 2935     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2936   %}
 2937   ins_pipe(pipe_slow);
 2938 %}
 2939 
 2940 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2941   predicate(UseAVX > 0);
 2942   match(Set dst (AddF src con));
 2943 
 2944   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2945   ins_cost(150);
 2946   ins_encode %{
 2947     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2948   %}
 2949   ins_pipe(pipe_slow);
 2950 %}
 2951 
 2952 instruct addD_reg(regD dst, regD src) %{
 2953   predicate(UseAVX == 0);
 2954   match(Set dst (AddD dst src));
 2955 
 2956   format %{ "addsd   $dst, $src" %}
 2957   ins_cost(150);
 2958   ins_encode %{
 2959     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2960   %}
 2961   ins_pipe(pipe_slow);
 2962 %}
 2963 
 2964 instruct addD_mem(regD dst, memory src) %{
 2965   predicate(UseAVX == 0);
 2966   match(Set dst (AddD dst (LoadD src)));
 2967 
 2968   format %{ "addsd   $dst, $src" %}
 2969   ins_cost(150);
 2970   ins_encode %{
 2971     __ addsd($dst$$XMMRegister, $src$$Address);
 2972   %}
 2973   ins_pipe(pipe_slow);
 2974 %}
 2975 
 2976 instruct addD_imm(regD dst, immD con) %{
 2977   predicate(UseAVX == 0);
 2978   match(Set dst (AddD dst con));
 2979   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 2980   ins_cost(150);
 2981   ins_encode %{
 2982     __ addsd($dst$$XMMRegister, $constantaddress($con));
 2983   %}
 2984   ins_pipe(pipe_slow);
 2985 %}
 2986 
 2987 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 2988   predicate(UseAVX > 0);
 2989   match(Set dst (AddD src1 src2));
 2990 
 2991   format %{ "vaddsd  $dst, $src1, $src2" %}
 2992   ins_cost(150);
 2993   ins_encode %{
 2994     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2995   %}
 2996   ins_pipe(pipe_slow);
 2997 %}
 2998 
 2999 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3000   predicate(UseAVX > 0);
 3001   match(Set dst (AddD src1 (LoadD src2)));
 3002 
 3003   format %{ "vaddsd  $dst, $src1, $src2" %}
 3004   ins_cost(150);
 3005   ins_encode %{
 3006     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3007   %}
 3008   ins_pipe(pipe_slow);
 3009 %}
 3010 
 3011 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3012   predicate(UseAVX > 0);
 3013   match(Set dst (AddD src con));
 3014 
 3015   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3016   ins_cost(150);
 3017   ins_encode %{
 3018     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3019   %}
 3020   ins_pipe(pipe_slow);
 3021 %}
 3022 
 3023 instruct subF_reg(regF dst, regF src) %{
 3024   predicate(UseAVX == 0);
 3025   match(Set dst (SubF dst src));
 3026 
 3027   format %{ "subss   $dst, $src" %}
 3028   ins_cost(150);
 3029   ins_encode %{
 3030     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3031   %}
 3032   ins_pipe(pipe_slow);
 3033 %}
 3034 
 3035 instruct subF_mem(regF dst, memory src) %{
 3036   predicate(UseAVX == 0);
 3037   match(Set dst (SubF dst (LoadF src)));
 3038 
 3039   format %{ "subss   $dst, $src" %}
 3040   ins_cost(150);
 3041   ins_encode %{
 3042     __ subss($dst$$XMMRegister, $src$$Address);
 3043   %}
 3044   ins_pipe(pipe_slow);
 3045 %}
 3046 
 3047 instruct subF_imm(regF dst, immF con) %{
 3048   predicate(UseAVX == 0);
 3049   match(Set dst (SubF dst con));
 3050   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3051   ins_cost(150);
 3052   ins_encode %{
 3053     __ subss($dst$$XMMRegister, $constantaddress($con));
 3054   %}
 3055   ins_pipe(pipe_slow);
 3056 %}
 3057 
 3058 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3059   predicate(UseAVX > 0);
 3060   match(Set dst (SubF src1 src2));
 3061 
 3062   format %{ "vsubss  $dst, $src1, $src2" %}
 3063   ins_cost(150);
 3064   ins_encode %{
 3065     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3066   %}
 3067   ins_pipe(pipe_slow);
 3068 %}
 3069 
 3070 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3071   predicate(UseAVX > 0);
 3072   match(Set dst (SubF src1 (LoadF src2)));
 3073 
 3074   format %{ "vsubss  $dst, $src1, $src2" %}
 3075   ins_cost(150);
 3076   ins_encode %{
 3077     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3078   %}
 3079   ins_pipe(pipe_slow);
 3080 %}
 3081 
 3082 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3083   predicate(UseAVX > 0);
 3084   match(Set dst (SubF src con));
 3085 
 3086   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3087   ins_cost(150);
 3088   ins_encode %{
 3089     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3090   %}
 3091   ins_pipe(pipe_slow);
 3092 %}
 3093 
 3094 instruct subD_reg(regD dst, regD src) %{
 3095   predicate(UseAVX == 0);
 3096   match(Set dst (SubD dst src));
 3097 
 3098   format %{ "subsd   $dst, $src" %}
 3099   ins_cost(150);
 3100   ins_encode %{
 3101     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3102   %}
 3103   ins_pipe(pipe_slow);
 3104 %}
 3105 
 3106 instruct subD_mem(regD dst, memory src) %{
 3107   predicate(UseAVX == 0);
 3108   match(Set dst (SubD dst (LoadD src)));
 3109 
 3110   format %{ "subsd   $dst, $src" %}
 3111   ins_cost(150);
 3112   ins_encode %{
 3113     __ subsd($dst$$XMMRegister, $src$$Address);
 3114   %}
 3115   ins_pipe(pipe_slow);
 3116 %}
 3117 
 3118 instruct subD_imm(regD dst, immD con) %{
 3119   predicate(UseAVX == 0);
 3120   match(Set dst (SubD dst con));
 3121   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3122   ins_cost(150);
 3123   ins_encode %{
 3124     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3125   %}
 3126   ins_pipe(pipe_slow);
 3127 %}
 3128 
 3129 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3130   predicate(UseAVX > 0);
 3131   match(Set dst (SubD src1 src2));
 3132 
 3133   format %{ "vsubsd  $dst, $src1, $src2" %}
 3134   ins_cost(150);
 3135   ins_encode %{
 3136     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3137   %}
 3138   ins_pipe(pipe_slow);
 3139 %}
 3140 
 3141 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3142   predicate(UseAVX > 0);
 3143   match(Set dst (SubD src1 (LoadD src2)));
 3144 
 3145   format %{ "vsubsd  $dst, $src1, $src2" %}
 3146   ins_cost(150);
 3147   ins_encode %{
 3148     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3149   %}
 3150   ins_pipe(pipe_slow);
 3151 %}
 3152 
 3153 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3154   predicate(UseAVX > 0);
 3155   match(Set dst (SubD src con));
 3156 
 3157   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3158   ins_cost(150);
 3159   ins_encode %{
 3160     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3161   %}
 3162   ins_pipe(pipe_slow);
 3163 %}
 3164 
 3165 instruct mulF_reg(regF dst, regF src) %{
 3166   predicate(UseAVX == 0);
 3167   match(Set dst (MulF dst src));
 3168 
 3169   format %{ "mulss   $dst, $src" %}
 3170   ins_cost(150);
 3171   ins_encode %{
 3172     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3173   %}
 3174   ins_pipe(pipe_slow);
 3175 %}
 3176 
 3177 instruct mulF_mem(regF dst, memory src) %{
 3178   predicate(UseAVX == 0);
 3179   match(Set dst (MulF dst (LoadF src)));
 3180 
 3181   format %{ "mulss   $dst, $src" %}
 3182   ins_cost(150);
 3183   ins_encode %{
 3184     __ mulss($dst$$XMMRegister, $src$$Address);
 3185   %}
 3186   ins_pipe(pipe_slow);
 3187 %}
 3188 
 3189 instruct mulF_imm(regF dst, immF con) %{
 3190   predicate(UseAVX == 0);
 3191   match(Set dst (MulF dst con));
 3192   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3193   ins_cost(150);
 3194   ins_encode %{
 3195     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3196   %}
 3197   ins_pipe(pipe_slow);
 3198 %}
 3199 
 3200 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3201   predicate(UseAVX > 0);
 3202   match(Set dst (MulF src1 src2));
 3203 
 3204   format %{ "vmulss  $dst, $src1, $src2" %}
 3205   ins_cost(150);
 3206   ins_encode %{
 3207     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3208   %}
 3209   ins_pipe(pipe_slow);
 3210 %}
 3211 
 3212 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3213   predicate(UseAVX > 0);
 3214   match(Set dst (MulF src1 (LoadF src2)));
 3215 
 3216   format %{ "vmulss  $dst, $src1, $src2" %}
 3217   ins_cost(150);
 3218   ins_encode %{
 3219     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3220   %}
 3221   ins_pipe(pipe_slow);
 3222 %}
 3223 
 3224 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3225   predicate(UseAVX > 0);
 3226   match(Set dst (MulF src con));
 3227 
 3228   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3229   ins_cost(150);
 3230   ins_encode %{
 3231     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3232   %}
 3233   ins_pipe(pipe_slow);
 3234 %}
 3235 
 3236 instruct mulD_reg(regD dst, regD src) %{
 3237   predicate(UseAVX == 0);
 3238   match(Set dst (MulD dst src));
 3239 
 3240   format %{ "mulsd   $dst, $src" %}
 3241   ins_cost(150);
 3242   ins_encode %{
 3243     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3244   %}
 3245   ins_pipe(pipe_slow);
 3246 %}
 3247 
 3248 instruct mulD_mem(regD dst, memory src) %{
 3249   predicate(UseAVX == 0);
 3250   match(Set dst (MulD dst (LoadD src)));
 3251 
 3252   format %{ "mulsd   $dst, $src" %}
 3253   ins_cost(150);
 3254   ins_encode %{
 3255     __ mulsd($dst$$XMMRegister, $src$$Address);
 3256   %}
 3257   ins_pipe(pipe_slow);
 3258 %}
 3259 
 3260 instruct mulD_imm(regD dst, immD con) %{
 3261   predicate(UseAVX == 0);
 3262   match(Set dst (MulD dst con));
 3263   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3264   ins_cost(150);
 3265   ins_encode %{
 3266     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3267   %}
 3268   ins_pipe(pipe_slow);
 3269 %}
 3270 
 3271 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3272   predicate(UseAVX > 0);
 3273   match(Set dst (MulD src1 src2));
 3274 
 3275   format %{ "vmulsd  $dst, $src1, $src2" %}
 3276   ins_cost(150);
 3277   ins_encode %{
 3278     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3279   %}
 3280   ins_pipe(pipe_slow);
 3281 %}
 3282 
 3283 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3284   predicate(UseAVX > 0);
 3285   match(Set dst (MulD src1 (LoadD src2)));
 3286 
 3287   format %{ "vmulsd  $dst, $src1, $src2" %}
 3288   ins_cost(150);
 3289   ins_encode %{
 3290     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3291   %}
 3292   ins_pipe(pipe_slow);
 3293 %}
 3294 
 3295 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3296   predicate(UseAVX > 0);
 3297   match(Set dst (MulD src con));
 3298 
 3299   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3300   ins_cost(150);
 3301   ins_encode %{
 3302     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3303   %}
 3304   ins_pipe(pipe_slow);
 3305 %}
 3306 
 3307 instruct divF_reg(regF dst, regF src) %{
 3308   predicate(UseAVX == 0);
 3309   match(Set dst (DivF dst src));
 3310 
 3311   format %{ "divss   $dst, $src" %}
 3312   ins_cost(150);
 3313   ins_encode %{
 3314     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3315   %}
 3316   ins_pipe(pipe_slow);
 3317 %}
 3318 
 3319 instruct divF_mem(regF dst, memory src) %{
 3320   predicate(UseAVX == 0);
 3321   match(Set dst (DivF dst (LoadF src)));
 3322 
 3323   format %{ "divss   $dst, $src" %}
 3324   ins_cost(150);
 3325   ins_encode %{
 3326     __ divss($dst$$XMMRegister, $src$$Address);
 3327   %}
 3328   ins_pipe(pipe_slow);
 3329 %}
 3330 
 3331 instruct divF_imm(regF dst, immF con) %{
 3332   predicate(UseAVX == 0);
 3333   match(Set dst (DivF dst con));
 3334   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3335   ins_cost(150);
 3336   ins_encode %{
 3337     __ divss($dst$$XMMRegister, $constantaddress($con));
 3338   %}
 3339   ins_pipe(pipe_slow);
 3340 %}
 3341 
 3342 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3343   predicate(UseAVX > 0);
 3344   match(Set dst (DivF src1 src2));
 3345 
 3346   format %{ "vdivss  $dst, $src1, $src2" %}
 3347   ins_cost(150);
 3348   ins_encode %{
 3349     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3350   %}
 3351   ins_pipe(pipe_slow);
 3352 %}
 3353 
 3354 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3355   predicate(UseAVX > 0);
 3356   match(Set dst (DivF src1 (LoadF src2)));
 3357 
 3358   format %{ "vdivss  $dst, $src1, $src2" %}
 3359   ins_cost(150);
 3360   ins_encode %{
 3361     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3362   %}
 3363   ins_pipe(pipe_slow);
 3364 %}
 3365 
 3366 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3367   predicate(UseAVX > 0);
 3368   match(Set dst (DivF src con));
 3369 
 3370   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3371   ins_cost(150);
 3372   ins_encode %{
 3373     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3374   %}
 3375   ins_pipe(pipe_slow);
 3376 %}
 3377 
 3378 instruct divD_reg(regD dst, regD src) %{
 3379   predicate(UseAVX == 0);
 3380   match(Set dst (DivD dst src));
 3381 
 3382   format %{ "divsd   $dst, $src" %}
 3383   ins_cost(150);
 3384   ins_encode %{
 3385     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3386   %}
 3387   ins_pipe(pipe_slow);
 3388 %}
 3389 
 3390 instruct divD_mem(regD dst, memory src) %{
 3391   predicate(UseAVX == 0);
 3392   match(Set dst (DivD dst (LoadD src)));
 3393 
 3394   format %{ "divsd   $dst, $src" %}
 3395   ins_cost(150);
 3396   ins_encode %{
 3397     __ divsd($dst$$XMMRegister, $src$$Address);
 3398   %}
 3399   ins_pipe(pipe_slow);
 3400 %}
 3401 
 3402 instruct divD_imm(regD dst, immD con) %{
 3403   predicate(UseAVX == 0);
 3404   match(Set dst (DivD dst con));
 3405   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3406   ins_cost(150);
 3407   ins_encode %{
 3408     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3409   %}
 3410   ins_pipe(pipe_slow);
 3411 %}
 3412 
 3413 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3414   predicate(UseAVX > 0);
 3415   match(Set dst (DivD src1 src2));
 3416 
 3417   format %{ "vdivsd  $dst, $src1, $src2" %}
 3418   ins_cost(150);
 3419   ins_encode %{
 3420     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3421   %}
 3422   ins_pipe(pipe_slow);
 3423 %}
 3424 
 3425 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3426   predicate(UseAVX > 0);
 3427   match(Set dst (DivD src1 (LoadD src2)));
 3428 
 3429   format %{ "vdivsd  $dst, $src1, $src2" %}
 3430   ins_cost(150);
 3431   ins_encode %{
 3432     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3433   %}
 3434   ins_pipe(pipe_slow);
 3435 %}
 3436 
 3437 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3438   predicate(UseAVX > 0);
 3439   match(Set dst (DivD src con));
 3440 
 3441   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3442   ins_cost(150);
 3443   ins_encode %{
 3444     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3445   %}
 3446   ins_pipe(pipe_slow);
 3447 %}
 3448 
 3449 instruct absF_reg(regF dst) %{
 3450   predicate(UseAVX == 0);
 3451   match(Set dst (AbsF dst));
 3452   ins_cost(150);
 3453   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3454   ins_encode %{
 3455     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3456   %}
 3457   ins_pipe(pipe_slow);
 3458 %}
 3459 
 3460 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3461   predicate(UseAVX > 0);
 3462   match(Set dst (AbsF src));
 3463   ins_cost(150);
 3464   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3465   ins_encode %{
 3466     int vlen_enc = Assembler::AVX_128bit;
 3467     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3468               ExternalAddress(float_signmask()), vlen_enc);
 3469   %}
 3470   ins_pipe(pipe_slow);
 3471 %}
 3472 
 3473 instruct absD_reg(regD dst) %{
 3474   predicate(UseAVX == 0);
 3475   match(Set dst (AbsD dst));
 3476   ins_cost(150);
 3477   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3478             "# abs double by sign masking" %}
 3479   ins_encode %{
 3480     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3481   %}
 3482   ins_pipe(pipe_slow);
 3483 %}
 3484 
 3485 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3486   predicate(UseAVX > 0);
 3487   match(Set dst (AbsD src));
 3488   ins_cost(150);
 3489   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3490             "# abs double by sign masking" %}
 3491   ins_encode %{
 3492     int vlen_enc = Assembler::AVX_128bit;
 3493     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3494               ExternalAddress(double_signmask()), vlen_enc);
 3495   %}
 3496   ins_pipe(pipe_slow);
 3497 %}
 3498 
 3499 instruct negF_reg(regF dst) %{
 3500   predicate(UseAVX == 0);
 3501   match(Set dst (NegF dst));
 3502   ins_cost(150);
 3503   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3504   ins_encode %{
 3505     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3506   %}
 3507   ins_pipe(pipe_slow);
 3508 %}
 3509 
 3510 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3511   predicate(UseAVX > 0);
 3512   match(Set dst (NegF src));
 3513   ins_cost(150);
 3514   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3515   ins_encode %{
 3516     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3517                  ExternalAddress(float_signflip()));
 3518   %}
 3519   ins_pipe(pipe_slow);
 3520 %}
 3521 
 3522 instruct negD_reg(regD dst) %{
 3523   predicate(UseAVX == 0);
 3524   match(Set dst (NegD dst));
 3525   ins_cost(150);
 3526   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3527             "# neg double by sign flipping" %}
 3528   ins_encode %{
 3529     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3530   %}
 3531   ins_pipe(pipe_slow);
 3532 %}
 3533 
 3534 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3535   predicate(UseAVX > 0);
 3536   match(Set dst (NegD src));
 3537   ins_cost(150);
 3538   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3539             "# neg double by sign flipping" %}
 3540   ins_encode %{
 3541     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3542                  ExternalAddress(double_signflip()));
 3543   %}
 3544   ins_pipe(pipe_slow);
 3545 %}
 3546 
// The sqrtss instruction writes only the low 32 bits of dst and leaves the upper bits
// unchanged, so for best performance dst should be pre-initialized (this avoids a false
// dependency on its previous contents). Therefore only the instruct rule where the
// input is pre-loaded into the dst register is defined below.
 3549 instruct sqrtF_reg(regF dst) %{
 3550   match(Set dst (SqrtF dst));
 3551   format %{ "sqrtss  $dst, $dst" %}
 3552   ins_encode %{
 3553     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3554   %}
 3555   ins_pipe(pipe_slow);
 3556 %}
 3557 
// Likewise, sqrtsd writes only the low 64 bits of dst, so for best performance dst
// should be pre-initialized. Therefore only the instruct rule where the input is
// pre-loaded into the dst register is defined below.
 3560 instruct sqrtD_reg(regD dst) %{
 3561   match(Set dst (SqrtD dst));
 3562   format %{ "sqrtsd  $dst, $dst" %}
 3563   ins_encode %{
 3564     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3565   %}
 3566   ins_pipe(pipe_slow);
 3567 %}
 3568 
 3569 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3570   effect(TEMP tmp);
 3571   match(Set dst (ConvF2HF src));
 3572   ins_cost(125);
 3573   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3574   ins_encode %{
 3575     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3576   %}
 3577   ins_pipe( pipe_slow );
 3578 %}
 3579 
 3580 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3581   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3582   effect(TEMP ktmp, TEMP rtmp);
 3583   match(Set mem (StoreC mem (ConvF2HF src)));
 3584   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3585   ins_encode %{
 3586     __ movl($rtmp$$Register, 0x1);
 3587     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3588     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3589   %}
 3590   ins_pipe( pipe_slow );
 3591 %}
 3592 
 3593 instruct vconvF2HF(vec dst, vec src) %{
 3594   match(Set dst (VectorCastF2HF src));
 3595   format %{ "vector_conv_F2HF $dst $src" %}
 3596   ins_encode %{
 3597     int vlen_enc = vector_length_encoding(this, $src);
 3598     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3599   %}
 3600   ins_pipe( pipe_slow );
 3601 %}
 3602 
 3603 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3604   predicate(n->as_StoreVector()->memory_size() >= 16);
 3605   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3606   format %{ "vcvtps2ph $mem,$src" %}
 3607   ins_encode %{
 3608     int vlen_enc = vector_length_encoding(this, $src);
 3609     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3610   %}
 3611   ins_pipe( pipe_slow );
 3612 %}
 3613 
 3614 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3615   match(Set dst (ConvHF2F src));
 3616   format %{ "vcvtph2ps $dst,$src" %}
 3617   ins_encode %{
 3618     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3619   %}
 3620   ins_pipe( pipe_slow );
 3621 %}
 3622 
 3623 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3624   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3625   format %{ "vcvtph2ps $dst,$mem" %}
 3626   ins_encode %{
 3627     int vlen_enc = vector_length_encoding(this);
 3628     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3629   %}
 3630   ins_pipe( pipe_slow );
 3631 %}
 3632 
 3633 instruct vconvHF2F(vec dst, vec src) %{
 3634   match(Set dst (VectorCastHF2F src));
 3635   ins_cost(125);
 3636   format %{ "vector_conv_HF2F $dst,$src" %}
 3637   ins_encode %{
 3638     int vlen_enc = vector_length_encoding(this);
 3639     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3640   %}
 3641   ins_pipe( pipe_slow );
 3642 %}
 3643 
 3644 // ---------------------------------------- VectorReinterpret ------------------------------------
 3645 instruct reinterpret_mask(kReg dst) %{
 3646   predicate(n->bottom_type()->isa_vectmask() &&
 3647             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3648   match(Set dst (VectorReinterpret dst));
 3649   ins_cost(125);
 3650   format %{ "vector_reinterpret $dst\t!" %}
 3651   ins_encode %{
 3652     // empty
 3653   %}
 3654   ins_pipe( pipe_slow );
 3655 %}
 3656 
 3657 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3658   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3659             n->bottom_type()->isa_vectmask() &&
 3660             n->in(1)->bottom_type()->isa_vectmask() &&
 3661             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3662             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3663   match(Set dst (VectorReinterpret src));
 3664   effect(TEMP xtmp);
 3665   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3666   ins_encode %{
 3667      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3668      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
 3670      int vlen_enc = vector_length_encoding(src_sz);
 3671      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3672      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3673   %}
 3674   ins_pipe( pipe_slow );
 3675 %}
 3676 
 3677 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3678   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3679             n->bottom_type()->isa_vectmask() &&
 3680             n->in(1)->bottom_type()->isa_vectmask() &&
 3681             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3682              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3683             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3684   match(Set dst (VectorReinterpret src));
 3685   effect(TEMP xtmp);
 3686   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3687   ins_encode %{
 3688      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3689      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
 3691      int vlen_enc = vector_length_encoding(src_sz);
 3692      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3693      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3694   %}
 3695   ins_pipe( pipe_slow );
 3696 %}
 3697 
 3698 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3699   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3700             n->bottom_type()->isa_vectmask() &&
 3701             n->in(1)->bottom_type()->isa_vectmask() &&
 3702             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3703              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3704             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3705   match(Set dst (VectorReinterpret src));
 3706   effect(TEMP xtmp);
 3707   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3708   ins_encode %{
 3709      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3710      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
 3712      int vlen_enc = vector_length_encoding(src_sz);
 3713      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3714      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3715   %}
 3716   ins_pipe( pipe_slow );
 3717 %}
 3718 
 3719 instruct reinterpret(vec dst) %{
 3720   predicate(!n->bottom_type()->isa_vectmask() &&
 3721             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3722   match(Set dst (VectorReinterpret dst));
 3723   ins_cost(125);
 3724   format %{ "vector_reinterpret $dst\t!" %}
 3725   ins_encode %{
 3726     // empty
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct reinterpret_expand(vec dst, vec src) %{
 3732   predicate(UseAVX == 0 &&
 3733             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3734   match(Set dst (VectorReinterpret src));
 3735   ins_cost(125);
 3736   effect(TEMP dst);
 3737   format %{ "vector_reinterpret_expand $dst,$src" %}
 3738   ins_encode %{
 3739     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3740     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3741 
 3742     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3743     if (src_vlen_in_bytes == 4) {
 3744       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3745     } else {
 3746       assert(src_vlen_in_bytes == 8, "");
 3747       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3748     }
 3749     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3750   %}
 3751   ins_pipe( pipe_slow );
 3752 %}
 3753 
 3754 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3755   predicate(UseAVX > 0 &&
 3756             !n->bottom_type()->isa_vectmask() &&
 3757             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3758             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3759   match(Set dst (VectorReinterpret src));
 3760   ins_cost(125);
 3761   format %{ "vector_reinterpret_expand $dst,$src" %}
 3762   ins_encode %{
 3763     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3764   %}
 3765   ins_pipe( pipe_slow );
 3766 %}
 3767 
 3768 
 3769 instruct vreinterpret_expand(legVec dst, vec src) %{
 3770   predicate(UseAVX > 0 &&
 3771             !n->bottom_type()->isa_vectmask() &&
 3772             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3773             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3774   match(Set dst (VectorReinterpret src));
 3775   ins_cost(125);
 3776   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3777   ins_encode %{
 3778     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3779       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3780       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3781       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3782       default: ShouldNotReachHere();
 3783     }
 3784   %}
 3785   ins_pipe( pipe_slow );
 3786 %}
 3787 
 3788 instruct reinterpret_shrink(vec dst, legVec src) %{
 3789   predicate(!n->bottom_type()->isa_vectmask() &&
 3790             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3791   match(Set dst (VectorReinterpret src));
 3792   ins_cost(125);
 3793   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3794   ins_encode %{
 3795     switch (Matcher::vector_length_in_bytes(this)) {
 3796       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3797       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3798       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3799       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3800       default: ShouldNotReachHere();
 3801     }
 3802   %}
 3803   ins_pipe( pipe_slow );
 3804 %}
 3805 
 3806 // ----------------------------------------------------------------------------------------------------
 3807 
 3808 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3809   match(Set dst (RoundDoubleMode src rmode));
 3810   format %{ "roundsd $dst,$src" %}
 3811   ins_cost(150);
 3812   ins_encode %{
 3813     assert(UseSSE >= 4, "required");
 3814     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3815       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3816     }
 3817     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3818   %}
 3819   ins_pipe(pipe_slow);
 3820 %}
 3821 
 3822 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3823   match(Set dst (RoundDoubleMode con rmode));
 3824   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3825   ins_cost(150);
 3826   ins_encode %{
 3827     assert(UseSSE >= 4, "required");
 3828     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3829   %}
 3830   ins_pipe(pipe_slow);
 3831 %}
 3832 
 3833 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3834   predicate(Matcher::vector_length(n) < 8);
 3835   match(Set dst (RoundDoubleModeV src rmode));
 3836   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3837   ins_encode %{
 3838     assert(UseAVX > 0, "required");
 3839     int vlen_enc = vector_length_encoding(this);
 3840     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3841   %}
 3842   ins_pipe( pipe_slow );
 3843 %}
 3844 
 3845 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3846   predicate(Matcher::vector_length(n) == 8);
 3847   match(Set dst (RoundDoubleModeV src rmode));
 3848   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3849   ins_encode %{
 3850     assert(UseAVX > 2, "required");
 3851     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3852   %}
 3853   ins_pipe( pipe_slow );
 3854 %}
 3855 
 3856 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3857   predicate(Matcher::vector_length(n) < 8);
 3858   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3859   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3860   ins_encode %{
 3861     assert(UseAVX > 0, "required");
 3862     int vlen_enc = vector_length_encoding(this);
 3863     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3864   %}
 3865   ins_pipe( pipe_slow );
 3866 %}
 3867 
 3868 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3869   predicate(Matcher::vector_length(n) == 8);
 3870   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3871   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3872   ins_encode %{
 3873     assert(UseAVX > 2, "required");
 3874     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3875   %}
 3876   ins_pipe( pipe_slow );
 3877 %}
 3878 
 3879 instruct onspinwait() %{
 3880   match(OnSpinWait);
 3881   ins_cost(200);
 3882 
 3883   format %{
 3884     $$template
 3885     $$emit$$"pause\t! membar_onspinwait"
 3886   %}
 3887   ins_encode %{
 3888     __ pause();
 3889   %}
 3890   ins_pipe(pipe_slow);
 3891 %}
 3892 
 3893 // a * b + c
 3894 instruct fmaD_reg(regD a, regD b, regD c) %{
 3895   match(Set c (FmaD  c (Binary a b)));
 3896   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3897   ins_cost(150);
 3898   ins_encode %{
 3899     assert(UseFMA, "Needs FMA instructions support.");
 3900     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3901   %}
 3902   ins_pipe( pipe_slow );
 3903 %}
 3904 
 3905 // a * b + c
 3906 instruct fmaF_reg(regF a, regF b, regF c) %{
 3907   match(Set c (FmaF  c (Binary a b)));
 3908   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3909   ins_cost(150);
 3910   ins_encode %{
 3911     assert(UseFMA, "Needs FMA instructions support.");
 3912     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3913   %}
 3914   ins_pipe( pipe_slow );
 3915 %}
 3916 
 3917 // ====================VECTOR INSTRUCTIONS=====================================
 3918 
 3919 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3920 instruct MoveVec2Leg(legVec dst, vec src) %{
 3921   match(Set dst src);
 3922   format %{ "" %}
 3923   ins_encode %{
 3924     ShouldNotReachHere();
 3925   %}
 3926   ins_pipe( fpu_reg_reg );
 3927 %}
 3928 
 3929 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3930   match(Set dst src);
 3931   format %{ "" %}
 3932   ins_encode %{
 3933     ShouldNotReachHere();
 3934   %}
 3935   ins_pipe( fpu_reg_reg );
 3936 %}
 3937 
 3938 // ============================================================================
 3939 
 3940 // Load vectors generic operand pattern
 3941 instruct loadV(vec dst, memory mem) %{
 3942   match(Set dst (LoadVector mem));
 3943   ins_cost(125);
 3944   format %{ "load_vector $dst,$mem" %}
 3945   ins_encode %{
 3946     BasicType bt = Matcher::vector_element_basic_type(this);
 3947     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3948   %}
 3949   ins_pipe( pipe_slow );
 3950 %}
 3951 
 3952 // Store vectors generic operand pattern.
 3953 instruct storeV(memory mem, vec src) %{
 3954   match(Set mem (StoreVector mem src));
 3955   ins_cost(145);
 3956   format %{ "store_vector $mem,$src\n\t" %}
 3957   ins_encode %{
 3958     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3959       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3960       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3961       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3962       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3963       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3964       default: ShouldNotReachHere();
 3965     }
 3966   %}
 3967   ins_pipe( pipe_slow );
 3968 %}
 3969 
 3970 // ---------------------------------------- Gather ------------------------------------
 3971 
 3972 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 3973 
 3974 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 3975   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 3976             Matcher::vector_length_in_bytes(n) <= 32);
 3977   match(Set dst (LoadVectorGather mem idx));
 3978   effect(TEMP dst, TEMP tmp, TEMP mask);
 3979   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 3980   ins_encode %{
 3981     int vlen_enc = vector_length_encoding(this);
 3982     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3983     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
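           // Comparing a register with itself yields all-ones, i.e. an all-lanes-active gather mask.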
 3984     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3985     __ lea($tmp$$Register, $mem$$Address);
 3986     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3987   %}
 3988   ins_pipe( pipe_slow );
 3989 %}
 3990 
 3991 
 3992 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 3993   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 3994             !is_subword_type(Matcher::vector_element_basic_type(n)));
 3995   match(Set dst (LoadVectorGather mem idx));
 3996   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  3997   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 3998   ins_encode %{
 3999     int vlen_enc = vector_length_encoding(this);
 4000     BasicType elem_bt = Matcher::vector_element_basic_type(this);
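           // kxnor of an opmask register with itself sets every mask bit, enabling all lanes for the gather.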
 4001     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4002     __ lea($tmp$$Register, $mem$$Address);
 4003     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4004   %}
 4005   ins_pipe( pipe_slow );
 4006 %}
 4007 
 4008 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4009   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4010             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4011   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4012   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  4013   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4014   ins_encode %{
 4015     assert(UseAVX > 2, "sanity");
 4016     int vlen_enc = vector_length_encoding(this);
 4017     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4018     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
  4019     // Note: Since the gather instruction partially updates the opmask register used
  4020     // for predication, the mask operand is first copied to a temporary.
 4021     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4022     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4023     __ lea($tmp$$Register, $mem$$Address);
 4024     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4025   %}
 4026   ins_pipe( pipe_slow );
 4027 %}
 4028 
 4029 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{
 4030   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4031   match(Set dst (LoadVectorGather mem idx_base));
 4032   effect(TEMP tmp, TEMP rtmp);
 4033   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4034   ins_encode %{
 4035     int vlen_enc = vector_length_encoding(this);
 4036     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4037     __ lea($tmp$$Register, $mem$$Address);
 4038     __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc);
 4039   %}
 4040   ins_pipe( pipe_slow );
 4041 %}
 4042 
 4043 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp,
 4044                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4045   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4046   match(Set dst (LoadVectorGather mem idx_base));
 4047   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4048   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4049   ins_encode %{
 4050     int vlen_enc = vector_length_encoding(this);
 4051     int vector_len = Matcher::vector_length(this);
 4052     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4053     __ lea($tmp$$Register, $mem$$Address);
 4054     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4055     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister,
 4056                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4057   %}
 4058   ins_pipe( pipe_slow );
 4059 %}
 4060 
 4061 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4062   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4063   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4064   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4065   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4066   ins_encode %{
 4067     int vlen_enc = vector_length_encoding(this);
 4068     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4069     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4072     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4073   %}
 4074   ins_pipe( pipe_slow );
 4075 %}
 4076 
 4077 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4078                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4079   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4080   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4081   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4082   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4083   ins_encode %{
 4084     int vlen_enc = vector_length_encoding(this);
 4085     int vector_len = Matcher::vector_length(this);
 4086     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4087     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4088     __ lea($tmp$$Register, $mem$$Address);
 4089     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4090     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4091     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4092                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4093   %}
 4094   ins_pipe( pipe_slow );
 4095 %}
 4096 
 4097 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4098   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4099   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4100   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4101   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4102   ins_encode %{
 4103     int vlen_enc = vector_length_encoding(this);
 4104     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4105     __ lea($tmp$$Register, $mem$$Address);
 4106     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
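           // vpmovmskb yields one mask bit per byte; for shorts, compress it to one bit per element
           // by extracting every other bit (PEXT with pattern 0x55555555).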
 4107     if (elem_bt == T_SHORT) {
 4108       __ movl($mask_idx$$Register, 0x55555555);
 4109       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4110     }
 4111     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4112     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4113   %}
 4114   ins_pipe( pipe_slow );
 4115 %}
 4116 
 4117 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp,
 4118                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4119   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4120   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4121   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4122   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4123   ins_encode %{
 4124     int vlen_enc = vector_length_encoding(this);
 4125     int vector_len = Matcher::vector_length(this);
 4126     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4127     __ lea($tmp$$Register, $mem$$Address);
 4128     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4129     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
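           // vpmovmskb yields one mask bit per byte; for shorts, compress it to one bit per element
           // by extracting every other bit (PEXT with pattern 0x55555555).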
 4130     if (elem_bt == T_SHORT) {
 4131       __ movl($mask_idx$$Register, 0x55555555);
 4132       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4133     }
 4134     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4135     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4136                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4137   %}
 4138   ins_pipe( pipe_slow );
 4139 %}
 4140 
 4141 // ====================Scatter=======================================
 4142 
 4143 // Scatter INT, LONG, FLOAT, DOUBLE
 4144 
 4145 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4146   predicate(UseAVX > 2);
 4147   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4148   effect(TEMP tmp, TEMP ktmp);
  4149   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4150   ins_encode %{
 4151     int vlen_enc = vector_length_encoding(this, $src);
 4152     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4153 
 4154     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4155     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4156 
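           // Load an all-ones opmask so every lane participates in the scatter.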
 4157     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4158     __ lea($tmp$$Register, $mem$$Address);
 4159     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4160   %}
 4161   ins_pipe( pipe_slow );
 4162 %}
 4163 
 4164 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4165   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4166   effect(TEMP tmp, TEMP ktmp);
 4167   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4168   ins_encode %{
 4169     int vlen_enc = vector_length_encoding(this, $src);
 4170     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4171     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4172     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
  4173     // Note: Since the scatter instruction partially updates the opmask register used
  4174     // for predication, the mask operand is first copied to a temporary.
 4175     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4176     __ lea($tmp$$Register, $mem$$Address);
 4177     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4178   %}
 4179   ins_pipe( pipe_slow );
 4180 %}
 4181 
 4182 // ====================REPLICATE=======================================
 4183 
 4184 // Replicate byte scalar to be vector
 4185 instruct vReplB_reg(vec dst, rRegI src) %{
 4186   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4187   match(Set dst (Replicate src));
 4188   format %{ "replicateB $dst,$src" %}
 4189   ins_encode %{
 4190     uint vlen = Matcher::vector_length(this);
 4191     if (UseAVX >= 2) {
 4192       int vlen_enc = vector_length_encoding(this);
 4193       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4194         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4195         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4196       } else {
 4197         __ movdl($dst$$XMMRegister, $src$$Register);
 4198         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4199       }
 4200     } else {
  4201       assert(UseAVX < 2, "");
 4202       __ movdl($dst$$XMMRegister, $src$$Register);
 4203       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4204       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4205       if (vlen >= 16) {
 4206         assert(vlen == 16, "");
 4207         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4208       }
 4209     }
 4210   %}
 4211   ins_pipe( pipe_slow );
 4212 %}
 4213 
 4214 instruct ReplB_mem(vec dst, memory mem) %{
 4215   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4216   match(Set dst (Replicate (LoadB mem)));
 4217   format %{ "replicateB $dst,$mem" %}
 4218   ins_encode %{
 4219     int vlen_enc = vector_length_encoding(this);
 4220     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4221   %}
 4222   ins_pipe( pipe_slow );
 4223 %}
 4224 
 4225 // ====================ReplicateS=======================================
 4226 
 4227 instruct vReplS_reg(vec dst, rRegI src) %{
 4228   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4229   match(Set dst (Replicate src));
 4230   format %{ "replicateS $dst,$src" %}
 4231   ins_encode %{
 4232     uint vlen = Matcher::vector_length(this);
 4233     int vlen_enc = vector_length_encoding(this);
 4234     if (UseAVX >= 2) {
 4235       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4236         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4237         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4238       } else {
 4239         __ movdl($dst$$XMMRegister, $src$$Register);
 4240         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4241       }
 4242     } else {
 4243       assert(UseAVX < 2, "");
 4244       __ movdl($dst$$XMMRegister, $src$$Register);
 4245       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4246       if (vlen >= 8) {
 4247         assert(vlen == 8, "");
 4248         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4249       }
 4250     }
 4251   %}
 4252   ins_pipe( pipe_slow );
 4253 %}
 4254 
 4255 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4256   match(Set dst (Replicate con));
 4257   effect(TEMP rtmp);
 4258   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4259   ins_encode %{
 4260     int vlen_enc = vector_length_encoding(this);
 4261     BasicType bt = Matcher::vector_element_basic_type(this);
 4262     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4263     __ movl($rtmp$$Register, $con$$constant);
 4264     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4265   %}
 4266   ins_pipe( pipe_slow );
 4267 %}
 4268 
 4269 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4270   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4271   match(Set dst (Replicate src));
 4272   effect(TEMP rtmp);
 4273   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4274   ins_encode %{
 4275     int vlen_enc = vector_length_encoding(this);
 4276     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4277     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4278   %}
 4279   ins_pipe( pipe_slow );
 4280 %}
 4281 
 4282 instruct ReplS_mem(vec dst, memory mem) %{
 4283   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4284   match(Set dst (Replicate (LoadS mem)));
 4285   format %{ "replicateS $dst,$mem" %}
 4286   ins_encode %{
 4287     int vlen_enc = vector_length_encoding(this);
 4288     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4289   %}
 4290   ins_pipe( pipe_slow );
 4291 %}
 4292 
 4293 // ====================ReplicateI=======================================
 4294 
 4295 instruct ReplI_reg(vec dst, rRegI src) %{
 4296   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4297   match(Set dst (Replicate src));
 4298   format %{ "replicateI $dst,$src" %}
 4299   ins_encode %{
 4300     uint vlen = Matcher::vector_length(this);
 4301     int vlen_enc = vector_length_encoding(this);
 4302     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4303       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4304     } else if (VM_Version::supports_avx2()) {
 4305       __ movdl($dst$$XMMRegister, $src$$Register);
 4306       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4307     } else {
 4308       __ movdl($dst$$XMMRegister, $src$$Register);
 4309       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4310     }
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct ReplI_mem(vec dst, memory mem) %{
 4316   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4317   match(Set dst (Replicate (LoadI mem)));
 4318   format %{ "replicateI $dst,$mem" %}
 4319   ins_encode %{
 4320     int vlen_enc = vector_length_encoding(this);
 4321     if (VM_Version::supports_avx2()) {
 4322       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4323     } else if (VM_Version::supports_avx()) {
 4324       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4325     } else {
 4326       __ movdl($dst$$XMMRegister, $mem$$Address);
 4327       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4328     }
 4329   %}
 4330   ins_pipe( pipe_slow );
 4331 %}
 4332 
 4333 instruct ReplI_imm(vec dst, immI con) %{
 4334   predicate(Matcher::is_non_long_integral_vector(n));
 4335   match(Set dst (Replicate con));
 4336   format %{ "replicateI $dst,$con" %}
 4337   ins_encode %{
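           // The constant table holds just enough copies of the immediate to fill the chunk that
           // load_constant_vector broadcasts from: 4 bytes with AVX, 8 with SSE3, 16 otherwise.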
 4338     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4339                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4340                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4341     BasicType bt = Matcher::vector_element_basic_type(this);
 4342     int vlen = Matcher::vector_length_in_bytes(this);
 4343     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4344   %}
 4345   ins_pipe( pipe_slow );
 4346 %}
 4347 
 4348 // Replicate scalar zero to be vector
 4349 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4350   predicate(Matcher::is_non_long_integral_vector(n));
 4351   match(Set dst (Replicate zero));
 4352   format %{ "replicateI $dst,$zero" %}
 4353   ins_encode %{
 4354     int vlen_enc = vector_length_encoding(this);
 4355     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4356       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4357     } else {
 4358       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4359     }
 4360   %}
 4361   ins_pipe( fpu_reg_reg );
 4362 %}
 4363 
 4364 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4365   predicate(Matcher::is_non_long_integral_vector(n));
 4366   match(Set dst (Replicate con));
 4367   format %{ "vallones $dst" %}
 4368   ins_encode %{
  4369     int vlen_enc = vector_length_encoding(this);
  4370     __ vallones($dst$$XMMRegister, vlen_enc);
 4371   %}
 4372   ins_pipe( pipe_slow );
 4373 %}
 4374 
 4375 // ====================ReplicateL=======================================
 4376 
 4377 // Replicate long (8 byte) scalar to be vector
 4378 instruct ReplL_reg(vec dst, rRegL src) %{
 4379   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4380   match(Set dst (Replicate src));
 4381   format %{ "replicateL $dst,$src" %}
 4382   ins_encode %{
 4383     int vlen = Matcher::vector_length(this);
 4384     int vlen_enc = vector_length_encoding(this);
 4385     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4386       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4387     } else if (VM_Version::supports_avx2()) {
 4388       __ movdq($dst$$XMMRegister, $src$$Register);
 4389       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4390     } else {
 4391       __ movdq($dst$$XMMRegister, $src$$Register);
 4392       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4393     }
 4394   %}
 4395   ins_pipe( pipe_slow );
 4396 %}
 4397 
 4398 instruct ReplL_mem(vec dst, memory mem) %{
 4399   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4400   match(Set dst (Replicate (LoadL mem)));
 4401   format %{ "replicateL $dst,$mem" %}
 4402   ins_encode %{
 4403     int vlen_enc = vector_length_encoding(this);
 4404     if (VM_Version::supports_avx2()) {
 4405       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4406     } else if (VM_Version::supports_sse3()) {
 4407       __ movddup($dst$$XMMRegister, $mem$$Address);
 4408     } else {
 4409       __ movq($dst$$XMMRegister, $mem$$Address);
 4410       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4411     }
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 
 4416 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4417 instruct ReplL_imm(vec dst, immL con) %{
 4418   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4419   match(Set dst (Replicate con));
 4420   format %{ "replicateL $dst,$con" %}
 4421   ins_encode %{
 4422     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4423     int vlen = Matcher::vector_length_in_bytes(this);
 4424     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplL_zero(vec dst, immL0 zero) %{
 4430   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4431   match(Set dst (Replicate zero));
 4432   format %{ "replicateL $dst,$zero" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4436       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4437     } else {
 4438       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4439     }
 4440   %}
 4441   ins_pipe( fpu_reg_reg );
 4442 %}
 4443 
 4444 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4445   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4446   match(Set dst (Replicate con));
 4447   format %{ "vallones $dst" %}
 4448   ins_encode %{
  4449     int vlen_enc = vector_length_encoding(this);
  4450     __ vallones($dst$$XMMRegister, vlen_enc);
 4451   %}
 4452   ins_pipe( pipe_slow );
 4453 %}
 4454 
 4455 // ====================ReplicateF=======================================
 4456 
 4457 instruct vReplF_reg(vec dst, vlRegF src) %{
 4458   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4459   match(Set dst (Replicate src));
 4460   format %{ "replicateF $dst,$src" %}
 4461   ins_encode %{
 4462     uint vlen = Matcher::vector_length(this);
 4463     int vlen_enc = vector_length_encoding(this);
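           // vpermilps with imm 0x00 broadcasts element 0 within a 128-bit lane; without AVX2 the
           // result is duplicated into the upper half for 256-bit vectors.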
 4464     if (vlen <= 4) {
 4465       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4466     } else if (VM_Version::supports_avx2()) {
 4467       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4468     } else {
 4469       assert(vlen == 8, "sanity");
 4470       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4471       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4472     }
 4473   %}
 4474   ins_pipe( pipe_slow );
 4475 %}
 4476 
 4477 instruct ReplF_reg(vec dst, vlRegF src) %{
 4478   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4479   match(Set dst (Replicate src));
 4480   format %{ "replicateF $dst,$src" %}
 4481   ins_encode %{
 4482     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4483   %}
 4484   ins_pipe( pipe_slow );
 4485 %}
 4486 
 4487 instruct ReplF_mem(vec dst, memory mem) %{
 4488   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4489   match(Set dst (Replicate (LoadF mem)));
 4490   format %{ "replicateF $dst,$mem" %}
 4491   ins_encode %{
 4492     int vlen_enc = vector_length_encoding(this);
 4493     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4494   %}
 4495   ins_pipe( pipe_slow );
 4496 %}
 4497 
 4498 // Replicate float scalar immediate to be vector by loading from const table.
 4499 instruct ReplF_imm(vec dst, immF con) %{
 4500   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4501   match(Set dst (Replicate con));
 4502   format %{ "replicateF $dst,$con" %}
 4503   ins_encode %{
 4504     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4505                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4506     int vlen = Matcher::vector_length_in_bytes(this);
 4507     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4508   %}
 4509   ins_pipe( pipe_slow );
 4510 %}
 4511 
 4512 instruct ReplF_zero(vec dst, immF0 zero) %{
 4513   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4514   match(Set dst (Replicate zero));
 4515   format %{ "replicateF $dst,$zero" %}
 4516   ins_encode %{
 4517     int vlen_enc = vector_length_encoding(this);
 4518     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4519       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4520     } else {
 4521       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4522     }
 4523   %}
 4524   ins_pipe( fpu_reg_reg );
 4525 %}
 4526 
 4527 // ====================ReplicateD=======================================
 4528 
 4529 // Replicate double (8 bytes) scalar to be vector
 4530 instruct vReplD_reg(vec dst, vlRegD src) %{
 4531   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4532   match(Set dst (Replicate src));
 4533   format %{ "replicateD $dst,$src" %}
 4534   ins_encode %{
 4535     uint vlen = Matcher::vector_length(this);
 4536     int vlen_enc = vector_length_encoding(this);
 4537     if (vlen <= 2) {
 4538       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4539     } else if (VM_Version::supports_avx2()) {
 4540       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4541     } else {
 4542       assert(vlen == 4, "sanity");
 4543       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4544       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4545     }
 4546   %}
 4547   ins_pipe( pipe_slow );
 4548 %}
 4549 
 4550 instruct ReplD_reg(vec dst, vlRegD src) %{
 4551   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4552   match(Set dst (Replicate src));
 4553   format %{ "replicateD $dst,$src" %}
 4554   ins_encode %{
 4555     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4556   %}
 4557   ins_pipe( pipe_slow );
 4558 %}
 4559 
 4560 instruct ReplD_mem(vec dst, memory mem) %{
 4561   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4562   match(Set dst (Replicate (LoadD mem)));
 4563   format %{ "replicateD $dst,$mem" %}
 4564   ins_encode %{
 4565     if (Matcher::vector_length(this) >= 4) {
 4566       int vlen_enc = vector_length_encoding(this);
 4567       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4568     } else {
 4569       __ movddup($dst$$XMMRegister, $mem$$Address);
 4570     }
 4571   %}
 4572   ins_pipe( pipe_slow );
 4573 %}
 4574 
 4575 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4576 instruct ReplD_imm(vec dst, immD con) %{
 4577   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4578   match(Set dst (Replicate con));
 4579   format %{ "replicateD $dst,$con" %}
 4580   ins_encode %{
 4581     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4582     int vlen = Matcher::vector_length_in_bytes(this);
 4583     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4584   %}
 4585   ins_pipe( pipe_slow );
 4586 %}
 4587 
 4588 instruct ReplD_zero(vec dst, immD0 zero) %{
 4589   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4590   match(Set dst (Replicate zero));
 4591   format %{ "replicateD $dst,$zero" %}
 4592   ins_encode %{
 4593     int vlen_enc = vector_length_encoding(this);
 4594     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4595       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4596     } else {
 4597       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4598     }
 4599   %}
 4600   ins_pipe( fpu_reg_reg );
 4601 %}
 4602 
 4603 // ====================VECTOR INSERT=======================================
 4604 
 4605 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4606   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4607   match(Set dst (VectorInsert (Binary dst val) idx));
 4608   format %{ "vector_insert $dst,$val,$idx" %}
 4609   ins_encode %{
 4610     assert(UseSSE >= 4, "required");
 4611     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4612 
 4613     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4614 
 4615     assert(is_integral_type(elem_bt), "");
 4616     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4617 
 4618     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4619   %}
 4620   ins_pipe( pipe_slow );
 4621 %}
 4622 
 4623 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4624   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4625   match(Set dst (VectorInsert (Binary src val) idx));
 4626   effect(TEMP vtmp);
 4627   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4628   ins_encode %{
 4629     int vlen_enc = Assembler::AVX_256bit;
 4630     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4631     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4632     int log2epr = log2(elem_per_lane);
 4633 
 4634     assert(is_integral_type(elem_bt), "sanity");
 4635     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4636 
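           // Split the element index into a 128-bit lane selector (y_idx) and the position within
           // that lane (x_idx).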
 4637     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4638     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4639     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4640     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4641     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4642   %}
 4643   ins_pipe( pipe_slow );
 4644 %}
 4645 
 4646 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4647   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4648   match(Set dst (VectorInsert (Binary src val) idx));
 4649   effect(TEMP vtmp);
 4650   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4651   ins_encode %{
 4652     assert(UseAVX > 2, "sanity");
 4653 
 4654     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4655     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4656     int log2epr = log2(elem_per_lane);
 4657 
 4658     assert(is_integral_type(elem_bt), "");
 4659     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4660 
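           // As above: y_idx selects one of the four 128-bit lanes, x_idx the position within it.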
 4661     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4662     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4663     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4664     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4665     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4666   %}
 4667   ins_pipe( pipe_slow );
 4668 %}
 4669 
 4670 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4671   predicate(Matcher::vector_length(n) == 2);
 4672   match(Set dst (VectorInsert (Binary dst val) idx));
 4673   format %{ "vector_insert $dst,$val,$idx" %}
 4674   ins_encode %{
 4675     assert(UseSSE >= 4, "required");
 4676     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4677     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4678 
 4679     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4680   %}
 4681   ins_pipe( pipe_slow );
 4682 %}
 4683 
 4684 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4685   predicate(Matcher::vector_length(n) == 4);
 4686   match(Set dst (VectorInsert (Binary src val) idx));
 4687   effect(TEMP vtmp);
 4688   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4689   ins_encode %{
 4690     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4691     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4692 
 4693     uint x_idx = $idx$$constant & right_n_bits(1);
 4694     uint y_idx = ($idx$$constant >> 1) & 1;
 4695     int vlen_enc = Assembler::AVX_256bit;
 4696     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4697     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4698     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4699   %}
 4700   ins_pipe( pipe_slow );
 4701 %}
 4702 
 4703 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4704   predicate(Matcher::vector_length(n) == 8);
 4705   match(Set dst (VectorInsert (Binary src val) idx));
 4706   effect(TEMP vtmp);
 4707   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4708   ins_encode %{
 4709     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4710     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4711 
 4712     uint x_idx = $idx$$constant & right_n_bits(1);
 4713     uint y_idx = ($idx$$constant >> 1) & 3;
 4714     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4715     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4716     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4717   %}
 4718   ins_pipe( pipe_slow );
 4719 %}
 4720 
 4721 instruct insertF(vec dst, regF val, immU8 idx) %{
 4722   predicate(Matcher::vector_length(n) < 8);
 4723   match(Set dst (VectorInsert (Binary dst val) idx));
 4724   format %{ "vector_insert $dst,$val,$idx" %}
 4725   ins_encode %{
 4726     assert(UseSSE >= 4, "sanity");
 4727 
 4728     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4729     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4730 
 4731     uint x_idx = $idx$$constant & right_n_bits(2);
 4732     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4733   %}
 4734   ins_pipe( pipe_slow );
 4735 %}
 4736 
 4737 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4738   predicate(Matcher::vector_length(n) >= 8);
 4739   match(Set dst (VectorInsert (Binary src val) idx));
 4740   effect(TEMP vtmp);
 4741   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4742   ins_encode %{
 4743     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4744     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4745 
 4746     int vlen = Matcher::vector_length(this);
 4747     uint x_idx = $idx$$constant & right_n_bits(2);
 4748     if (vlen == 8) {
 4749       uint y_idx = ($idx$$constant >> 2) & 1;
 4750       int vlen_enc = Assembler::AVX_256bit;
 4751       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4752       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4753       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4754     } else {
 4755       assert(vlen == 16, "sanity");
 4756       uint y_idx = ($idx$$constant >> 2) & 3;
 4757       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4758       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4759       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4760     }
 4761   %}
 4762   ins_pipe( pipe_slow );
 4763 %}
 4764 
 4765 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4766   predicate(Matcher::vector_length(n) == 2);
 4767   match(Set dst (VectorInsert (Binary dst val) idx));
 4768   effect(TEMP tmp);
 4769   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4770   ins_encode %{
 4771     assert(UseSSE >= 4, "sanity");
 4772     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4773     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4774 
 4775     __ movq($tmp$$Register, $val$$XMMRegister);
 4776     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4777   %}
 4778   ins_pipe( pipe_slow );
 4779 %}
 4780 
 4781 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4782   predicate(Matcher::vector_length(n) == 4);
 4783   match(Set dst (VectorInsert (Binary src val) idx));
 4784   effect(TEMP vtmp, TEMP tmp);
 4785   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4786   ins_encode %{
 4787     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4788     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4789 
 4790     uint x_idx = $idx$$constant & right_n_bits(1);
 4791     uint y_idx = ($idx$$constant >> 1) & 1;
 4792     int vlen_enc = Assembler::AVX_256bit;
 4793     __ movq($tmp$$Register, $val$$XMMRegister);
 4794     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4795     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4796     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4797   %}
 4798   ins_pipe( pipe_slow );
 4799 %}
 4800 
 4801 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4802   predicate(Matcher::vector_length(n) == 8);
 4803   match(Set dst (VectorInsert (Binary src val) idx));
 4804   effect(TEMP tmp, TEMP vtmp);
  4805   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4806   ins_encode %{
 4807     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4808     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4809 
 4810     uint x_idx = $idx$$constant & right_n_bits(1);
 4811     uint y_idx = ($idx$$constant >> 1) & 3;
 4812     __ movq($tmp$$Register, $val$$XMMRegister);
 4813     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4814     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4815     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4816   %}
 4817   ins_pipe( pipe_slow );
 4818 %}
 4819 
 4820 // ====================REDUCTION ARITHMETIC=======================================
 4821 
 4822 // =======================Int Reduction==========================================
 4823 
 4824 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4825   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4826   match(Set dst (AddReductionVI src1 src2));
 4827   match(Set dst (MulReductionVI src1 src2));
 4828   match(Set dst (AndReductionV  src1 src2));
 4829   match(Set dst ( OrReductionV  src1 src2));
 4830   match(Set dst (XorReductionV  src1 src2));
 4831   match(Set dst (MinReductionV  src1 src2));
 4832   match(Set dst (MaxReductionV  src1 src2));
 4833   effect(TEMP vtmp1, TEMP vtmp2);
 4834   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4835   ins_encode %{
 4836     int opcode = this->ideal_Opcode();
 4837     int vlen = Matcher::vector_length(this, $src2);
 4838     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4839   %}
 4840   ins_pipe( pipe_slow );
 4841 %}
 4842 
 4843 // =======================Long Reduction==========================================
 4844 
 4845 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4846   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4847   match(Set dst (AddReductionVL src1 src2));
 4848   match(Set dst (MulReductionVL src1 src2));
 4849   match(Set dst (AndReductionV  src1 src2));
 4850   match(Set dst ( OrReductionV  src1 src2));
 4851   match(Set dst (XorReductionV  src1 src2));
 4852   match(Set dst (MinReductionV  src1 src2));
 4853   match(Set dst (MaxReductionV  src1 src2));
 4854   effect(TEMP vtmp1, TEMP vtmp2);
 4855   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4856   ins_encode %{
 4857     int opcode = this->ideal_Opcode();
 4858     int vlen = Matcher::vector_length(this, $src2);
 4859     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4860   %}
 4861   ins_pipe( pipe_slow );
 4862 %}
 4863 
 4864 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4865   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4866   match(Set dst (AddReductionVL src1 src2));
 4867   match(Set dst (MulReductionVL src1 src2));
 4868   match(Set dst (AndReductionV  src1 src2));
 4869   match(Set dst ( OrReductionV  src1 src2));
 4870   match(Set dst (XorReductionV  src1 src2));
 4871   match(Set dst (MinReductionV  src1 src2));
 4872   match(Set dst (MaxReductionV  src1 src2));
 4873   effect(TEMP vtmp1, TEMP vtmp2);
 4874   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4875   ins_encode %{
 4876     int opcode = this->ideal_Opcode();
 4877     int vlen = Matcher::vector_length(this, $src2);
 4878     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4879   %}
 4880   ins_pipe( pipe_slow );
 4881 %}
 4882 
 4883 // =======================Float Reduction==========================================
 4884 
 4885 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4886   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 4887   match(Set dst (AddReductionVF dst src));
 4888   match(Set dst (MulReductionVF dst src));
 4889   effect(TEMP dst, TEMP vtmp);
 4890   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4891   ins_encode %{
 4892     int opcode = this->ideal_Opcode();
 4893     int vlen = Matcher::vector_length(this, $src);
 4894     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4895   %}
 4896   ins_pipe( pipe_slow );
 4897 %}
 4898 
 4899 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4900   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 4901   match(Set dst (AddReductionVF dst src));
 4902   match(Set dst (MulReductionVF dst src));
 4903   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4904   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4905   ins_encode %{
 4906     int opcode = this->ideal_Opcode();
 4907     int vlen = Matcher::vector_length(this, $src);
 4908     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4909   %}
 4910   ins_pipe( pipe_slow );
 4911 %}
 4912 
 4913 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4914   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 4915   match(Set dst (AddReductionVF dst src));
 4916   match(Set dst (MulReductionVF dst src));
 4917   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4918   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4919   ins_encode %{
 4920     int opcode = this->ideal_Opcode();
 4921     int vlen = Matcher::vector_length(this, $src);
 4922     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4923   %}
 4924   ins_pipe( pipe_slow );
 4925 %}
 4926 
 4927 
 4928 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 4929   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4930   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4931   // src1 contains reduction identity
 4932   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 4933   match(Set dst (AddReductionVF src1 src2));
 4934   match(Set dst (MulReductionVF src1 src2));
 4935   effect(TEMP dst);
 4936   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 4937   ins_encode %{
 4938     int opcode = this->ideal_Opcode();
 4939     int vlen = Matcher::vector_length(this, $src2);
 4940     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 4941   %}
 4942   ins_pipe( pipe_slow );
 4943 %}
 4944 
 4945 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 4946   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4947   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4948   // src1 contains reduction identity
 4949   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 4950   match(Set dst (AddReductionVF src1 src2));
 4951   match(Set dst (MulReductionVF src1 src2));
 4952   effect(TEMP dst, TEMP vtmp);
 4953   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 4954   ins_encode %{
 4955     int opcode = this->ideal_Opcode();
 4956     int vlen = Matcher::vector_length(this, $src2);
 4957     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 4958   %}
 4959   ins_pipe( pipe_slow );
 4960 %}
 4961 
 4962 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 4963   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4964   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4965   // src1 contains reduction identity
 4966   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 4967   match(Set dst (AddReductionVF src1 src2));
 4968   match(Set dst (MulReductionVF src1 src2));
 4969   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4970   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4971   ins_encode %{
 4972     int opcode = this->ideal_Opcode();
 4973     int vlen = Matcher::vector_length(this, $src2);
 4974     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4975   %}
 4976   ins_pipe( pipe_slow );
 4977 %}
 4978 
 4979 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4980   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4981   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4982   // src1 contains reduction identity
 4983   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 4984   match(Set dst (AddReductionVF src1 src2));
 4985   match(Set dst (MulReductionVF src1 src2));
 4986   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4987   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4988   ins_encode %{
 4989     int opcode = this->ideal_Opcode();
 4990     int vlen = Matcher::vector_length(this, $src2);
 4991     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4992   %}
 4993   ins_pipe( pipe_slow );
 4994 %}
 4995 
 4996 // =======================Double Reduction==========================================
 4997 
 4998 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4999   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5000   match(Set dst (AddReductionVD dst src));
 5001   match(Set dst (MulReductionVD dst src));
 5002   effect(TEMP dst, TEMP vtmp);
 5003   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5004   ins_encode %{
 5005     int opcode = this->ideal_Opcode();
 5006     int vlen = Matcher::vector_length(this, $src);
 5007     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5008 %}
 5009   ins_pipe( pipe_slow );
 5010 %}
 5011 
 5012 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5013   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5014   match(Set dst (AddReductionVD dst src));
 5015   match(Set dst (MulReductionVD dst src));
 5016   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5017   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5018   ins_encode %{
 5019     int opcode = this->ideal_Opcode();
 5020     int vlen = Matcher::vector_length(this, $src);
 5021     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5022   %}
 5023   ins_pipe( pipe_slow );
 5024 %}
 5025 
 5026 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5027   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5028   match(Set dst (AddReductionVD dst src));
 5029   match(Set dst (MulReductionVD dst src));
 5030   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5031   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5032   ins_encode %{
 5033     int opcode = this->ideal_Opcode();
 5034     int vlen = Matcher::vector_length(this, $src);
 5035     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5036   %}
 5037   ins_pipe( pipe_slow );
 5038 %}
 5039 
 5040 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5041   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5042   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5043   // src1 contains reduction identity
 5044   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5045   match(Set dst (AddReductionVD src1 src2));
 5046   match(Set dst (MulReductionVD src1 src2));
 5047   effect(TEMP dst);
 5048   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5049   ins_encode %{
 5050     int opcode = this->ideal_Opcode();
 5051     int vlen = Matcher::vector_length(this, $src2);
 5052     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5053 %}
 5054   ins_pipe( pipe_slow );
 5055 %}
 5056 
 5057 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5058   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5059   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5060   // src1 contains reduction identity
 5061   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5062   match(Set dst (AddReductionVD src1 src2));
 5063   match(Set dst (MulReductionVD src1 src2));
 5064   effect(TEMP dst, TEMP vtmp);
 5065   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5066   ins_encode %{
 5067     int opcode = this->ideal_Opcode();
 5068     int vlen = Matcher::vector_length(this, $src2);
 5069     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5070   %}
 5071   ins_pipe( pipe_slow );
 5072 %}
 5073 
 5074 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5075   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5076   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5077   // src1 contains reduction identity
 5078   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5079   match(Set dst (AddReductionVD src1 src2));
 5080   match(Set dst (MulReductionVD src1 src2));
 5081   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5082   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5083   ins_encode %{
 5084     int opcode = this->ideal_Opcode();
 5085     int vlen = Matcher::vector_length(this, $src2);
 5086     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5087   %}
 5088   ins_pipe( pipe_slow );
 5089 %}
 5090 
 5091 // =======================Byte Reduction==========================================
 5092 
 5093 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5095   match(Set dst (AddReductionVI src1 src2));
 5096   match(Set dst (AndReductionV  src1 src2));
 5097   match(Set dst ( OrReductionV  src1 src2));
 5098   match(Set dst (XorReductionV  src1 src2));
 5099   match(Set dst (MinReductionV  src1 src2));
 5100   match(Set dst (MaxReductionV  src1 src2));
 5101   effect(TEMP vtmp1, TEMP vtmp2);
 5102   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5103   ins_encode %{
 5104     int opcode = this->ideal_Opcode();
 5105     int vlen = Matcher::vector_length(this, $src2);
 5106     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5107   %}
 5108   ins_pipe( pipe_slow );
 5109 %}
 5110 
 5111 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5112   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5113   match(Set dst (AddReductionVI src1 src2));
 5114   match(Set dst (AndReductionV  src1 src2));
 5115   match(Set dst ( OrReductionV  src1 src2));
 5116   match(Set dst (XorReductionV  src1 src2));
 5117   match(Set dst (MinReductionV  src1 src2));
 5118   match(Set dst (MaxReductionV  src1 src2));
 5119   effect(TEMP vtmp1, TEMP vtmp2);
 5120   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5121   ins_encode %{
 5122     int opcode = this->ideal_Opcode();
 5123     int vlen = Matcher::vector_length(this, $src2);
 5124     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5125   %}
 5126   ins_pipe( pipe_slow );
 5127 %}
 5128 
 5129 // =======================Short Reduction==========================================
 5130 
 5131 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5132   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5133   match(Set dst (AddReductionVI src1 src2));
 5134   match(Set dst (MulReductionVI src1 src2));
 5135   match(Set dst (AndReductionV  src1 src2));
 5136   match(Set dst ( OrReductionV  src1 src2));
 5137   match(Set dst (XorReductionV  src1 src2));
 5138   match(Set dst (MinReductionV  src1 src2));
 5139   match(Set dst (MaxReductionV  src1 src2));
 5140   effect(TEMP vtmp1, TEMP vtmp2);
 5141   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5142   ins_encode %{
 5143     int opcode = this->ideal_Opcode();
 5144     int vlen = Matcher::vector_length(this, $src2);
 5145     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5146   %}
 5147   ins_pipe( pipe_slow );
 5148 %}
 5149 
 5150 // =======================Mul Reduction==========================================
 5151 
 5152 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5153   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5154             Matcher::vector_length(n->in(2)) <= 32); // src2
 5155   match(Set dst (MulReductionVI src1 src2));
 5156   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5157   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5158   ins_encode %{
 5159     int opcode = this->ideal_Opcode();
 5160     int vlen = Matcher::vector_length(this, $src2);
 5161     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5162   %}
 5163   ins_pipe( pipe_slow );
 5164 %}
 5165 
 5166 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5167   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5168             Matcher::vector_length(n->in(2)) == 64); // src2
 5169   match(Set dst (MulReductionVI src1 src2));
 5170   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5171   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5172   ins_encode %{
 5173     int opcode = this->ideal_Opcode();
 5174     int vlen = Matcher::vector_length(this, $src2);
 5175     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5176   %}
 5177   ins_pipe( pipe_slow );
 5178 %}
 5179 
 5180 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
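// The immF rules below only fire when the scalar input (src1) is the identity
// of the reduction (+Inf for MinReductionV, -Inf for MaxReductionV, as the
// predicates check), so src1 can be ignored and the reduction is computed over
// the vector lanes alone. The *_av rules instead accumulate into dst.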
 5182 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5183                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5184   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5185             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5186              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5187             Matcher::vector_length(n->in(2)) == 2);
 5188   match(Set dst (MinReductionV src1 src2));
 5189   match(Set dst (MaxReductionV src1 src2));
 5190   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5191   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5192   ins_encode %{
 5193     assert(UseAVX > 0, "sanity");
 5194 
 5195     int opcode = this->ideal_Opcode();
 5196     int vlen = Matcher::vector_length(this, $src2);
 5197     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5198                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5199   %}
 5200   ins_pipe( pipe_slow );
 5201 %}
 5202 
 5203 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5204                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5205   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5206             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5207              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5208             Matcher::vector_length(n->in(2)) >= 4);
 5209   match(Set dst (MinReductionV src1 src2));
 5210   match(Set dst (MaxReductionV src1 src2));
 5211   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5212   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5213   ins_encode %{
 5214     assert(UseAVX > 0, "sanity");
 5215 
 5216     int opcode = this->ideal_Opcode();
 5217     int vlen = Matcher::vector_length(this, $src2);
 5218     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5219                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5220   %}
 5221   ins_pipe( pipe_slow );
 5222 %}
 5223 
 5224 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5225                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5226   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5227             Matcher::vector_length(n->in(2)) == 2);
 5228   match(Set dst (MinReductionV dst src));
 5229   match(Set dst (MaxReductionV dst src));
 5230   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5231   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5232   ins_encode %{
 5233     assert(UseAVX > 0, "sanity");
 5234 
 5235     int opcode = this->ideal_Opcode();
 5236     int vlen = Matcher::vector_length(this, $src);
 5237     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5238                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 
 5244 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5245                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5246   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5247             Matcher::vector_length(n->in(2)) >= 4);
 5248   match(Set dst (MinReductionV dst src));
 5249   match(Set dst (MaxReductionV dst src));
 5250   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5251   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5252   ins_encode %{
 5253     assert(UseAVX > 0, "sanity");
 5254 
 5255     int opcode = this->ideal_Opcode();
 5256     int vlen = Matcher::vector_length(this, $src);
 5257     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5258                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5259   %}
 5260   ins_pipe( pipe_slow );
 5261 %}
 5262 
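// AVX10.2 variants: the compare/blend temporaries and the flags register used
// by the rules above are not needed (xnoreg is passed in their place), as the
// AVX10.2 minmax instructions are expected to handle the required NaN and -0.0
// semantics directly.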
 5263 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5264   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5265             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5266              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5267             Matcher::vector_length(n->in(2)) == 2);
 5268   match(Set dst (MinReductionV src1 src2));
 5269   match(Set dst (MaxReductionV src1 src2));
 5270   effect(TEMP dst, TEMP xtmp1);
 5271   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5272   ins_encode %{
 5273     int opcode = this->ideal_Opcode();
 5274     int vlen = Matcher::vector_length(this, $src2);
 5275     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5276                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5277   %}
 5278   ins_pipe( pipe_slow );
 5279 %}
 5280 
 5281 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5282   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5283             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5284              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5285             Matcher::vector_length(n->in(2)) >= 4);
 5286   match(Set dst (MinReductionV src1 src2));
 5287   match(Set dst (MaxReductionV src1 src2));
 5288   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5289   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5290   ins_encode %{
 5291     int opcode = this->ideal_Opcode();
 5292     int vlen = Matcher::vector_length(this, $src2);
 5293     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5294                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5295   %}
 5296   ins_pipe( pipe_slow );
 5297 %}
 5298 
 5299 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5300   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5301             Matcher::vector_length(n->in(2)) == 2);
 5302   match(Set dst (MinReductionV dst src));
 5303   match(Set dst (MaxReductionV dst src));
 5304   effect(TEMP dst, TEMP xtmp1);
 5305   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5306   ins_encode %{
 5307     int opcode = this->ideal_Opcode();
 5308     int vlen = Matcher::vector_length(this, $src);
 5309     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5310                          $xtmp1$$XMMRegister);
 5311   %}
 5312   ins_pipe( pipe_slow );
 5313 %}
 5314 
 5315 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5316   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5317             Matcher::vector_length(n->in(2)) >= 4);
 5318   match(Set dst (MinReductionV dst src));
 5319   match(Set dst (MaxReductionV dst src));
 5320   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5321   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5322   ins_encode %{
 5323     int opcode = this->ideal_Opcode();
 5324     int vlen = Matcher::vector_length(this, $src);
 5325     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5326                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5327   %}
 5328   ins_pipe( pipe_slow );
 5329 %}
 5330 
//--------------------Min/Max Double Reduction --------------------
 5332 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5333                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5334   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5335             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5336              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5337             Matcher::vector_length(n->in(2)) == 2);
 5338   match(Set dst (MinReductionV src1 src2));
 5339   match(Set dst (MaxReductionV src1 src2));
 5340   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5341   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5342   ins_encode %{
 5343     assert(UseAVX > 0, "sanity");
 5344 
 5345     int opcode = this->ideal_Opcode();
 5346     int vlen = Matcher::vector_length(this, $src2);
 5347     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5348                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5349   %}
 5350   ins_pipe( pipe_slow );
 5351 %}
 5352 
 5353 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5354                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5355   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5356             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5357              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5358             Matcher::vector_length(n->in(2)) >= 4);
 5359   match(Set dst (MinReductionV src1 src2));
 5360   match(Set dst (MaxReductionV src1 src2));
 5361   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5362   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5363   ins_encode %{
 5364     assert(UseAVX > 0, "sanity");
 5365 
 5366     int opcode = this->ideal_Opcode();
 5367     int vlen = Matcher::vector_length(this, $src2);
 5368     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5369                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5370   %}
 5371   ins_pipe( pipe_slow );
 5372 %}
 5373 
 5374 
 5375 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5376                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5377   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5378             Matcher::vector_length(n->in(2)) == 2);
 5379   match(Set dst (MinReductionV dst src));
 5380   match(Set dst (MaxReductionV dst src));
 5381   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5382   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5383   ins_encode %{
 5384     assert(UseAVX > 0, "sanity");
 5385 
 5386     int opcode = this->ideal_Opcode();
 5387     int vlen = Matcher::vector_length(this, $src);
 5388     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5389                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5390   %}
 5391   ins_pipe( pipe_slow );
 5392 %}
 5393 
 5394 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5395                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5396   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5397             Matcher::vector_length(n->in(2)) >= 4);
 5398   match(Set dst (MinReductionV dst src));
 5399   match(Set dst (MaxReductionV dst src));
 5400   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5401   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5402   ins_encode %{
 5403     assert(UseAVX > 0, "sanity");
 5404 
 5405     int opcode = this->ideal_Opcode();
 5406     int vlen = Matcher::vector_length(this, $src);
 5407     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5408                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5409   %}
 5410   ins_pipe( pipe_slow );
 5411 %}
 5412 
 5413 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5414   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5415             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5416              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5417             Matcher::vector_length(n->in(2)) == 2);
 5418   match(Set dst (MinReductionV src1 src2));
 5419   match(Set dst (MaxReductionV src1 src2));
 5420   effect(TEMP dst, TEMP xtmp1);
 5421   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5422   ins_encode %{
 5423     int opcode = this->ideal_Opcode();
 5424     int vlen = Matcher::vector_length(this, $src2);
 5425     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5426                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5427   %}
 5428   ins_pipe( pipe_slow );
 5429 %}
 5430 
 5431 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5432   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5433             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5434              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5435             Matcher::vector_length(n->in(2)) >= 4);
 5436   match(Set dst (MinReductionV src1 src2));
 5437   match(Set dst (MaxReductionV src1 src2));
 5438   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5439   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5440   ins_encode %{
 5441     int opcode = this->ideal_Opcode();
 5442     int vlen = Matcher::vector_length(this, $src2);
 5443     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5444                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 
 5450 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5451   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5452             Matcher::vector_length(n->in(2)) == 2);
 5453   match(Set dst (MinReductionV dst src));
 5454   match(Set dst (MaxReductionV dst src));
 5455   effect(TEMP dst, TEMP xtmp1);
 5456   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5457   ins_encode %{
 5458     int opcode = this->ideal_Opcode();
 5459     int vlen = Matcher::vector_length(this, $src);
 5460     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5461                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5462   %}
 5463   ins_pipe( pipe_slow );
 5464 %}
 5465 
 5466 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5467   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5468             Matcher::vector_length(n->in(2)) >= 4);
 5469   match(Set dst (MinReductionV dst src));
 5470   match(Set dst (MaxReductionV dst src));
 5471   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5472   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5473   ins_encode %{
 5474     int opcode = this->ideal_Opcode();
 5475     int vlen = Matcher::vector_length(this, $src);
 5476     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5477                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5478   %}
 5479   ins_pipe( pipe_slow );
 5480 %}
 5481 
 5482 // ====================VECTOR ARITHMETIC=======================================
 5483 
 5484 // --------------------------------- ADD --------------------------------------
 5485 
 5486 // Bytes vector add
 5487 instruct vaddB(vec dst, vec src) %{
 5488   predicate(UseAVX == 0);
 5489   match(Set dst (AddVB dst src));
 5490   format %{ "paddb   $dst,$src\t! add packedB" %}
 5491   ins_encode %{
 5492     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5493   %}
 5494   ins_pipe( pipe_slow );
 5495 %}
 5496 
 5497 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5498   predicate(UseAVX > 0);
 5499   match(Set dst (AddVB src1 src2));
 5500   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5501   ins_encode %{
 5502     int vlen_enc = vector_length_encoding(this);
 5503     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5509   predicate((UseAVX > 0) &&
 5510             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5511   match(Set dst (AddVB src (LoadVector mem)));
 5512   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5513   ins_encode %{
 5514     int vlen_enc = vector_length_encoding(this);
 5515     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5516   %}
 5517   ins_pipe( pipe_slow );
 5518 %}
 5519 
 5520 // Shorts/Chars vector add
 5521 instruct vaddS(vec dst, vec src) %{
 5522   predicate(UseAVX == 0);
 5523   match(Set dst (AddVS dst src));
 5524   format %{ "paddw   $dst,$src\t! add packedS" %}
 5525   ins_encode %{
 5526     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5527   %}
 5528   ins_pipe( pipe_slow );
 5529 %}
 5530 
 5531 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5532   predicate(UseAVX > 0);
 5533   match(Set dst (AddVS src1 src2));
 5534   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5535   ins_encode %{
 5536     int vlen_enc = vector_length_encoding(this);
 5537     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5543   predicate((UseAVX > 0) &&
 5544             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5545   match(Set dst (AddVS src (LoadVector mem)));
 5546   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5547   ins_encode %{
 5548     int vlen_enc = vector_length_encoding(this);
 5549     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5550   %}
 5551   ins_pipe( pipe_slow );
 5552 %}
 5553 
 5554 // Integers vector add
 5555 instruct vaddI(vec dst, vec src) %{
 5556   predicate(UseAVX == 0);
 5557   match(Set dst (AddVI dst src));
 5558   format %{ "paddd   $dst,$src\t! add packedI" %}
 5559   ins_encode %{
 5560     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5561   %}
 5562   ins_pipe( pipe_slow );
 5563 %}
 5564 
 5565 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5566   predicate(UseAVX > 0);
 5567   match(Set dst (AddVI src1 src2));
 5568   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5569   ins_encode %{
 5570     int vlen_enc = vector_length_encoding(this);
 5571     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5572   %}
 5573   ins_pipe( pipe_slow );
 5574 %}
 5575 
 5576 
 5577 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5578   predicate((UseAVX > 0) &&
 5579             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5580   match(Set dst (AddVI src (LoadVector mem)));
 5581   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5582   ins_encode %{
 5583     int vlen_enc = vector_length_encoding(this);
 5584     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5585   %}
 5586   ins_pipe( pipe_slow );
 5587 %}
 5588 
 5589 // Longs vector add
 5590 instruct vaddL(vec dst, vec src) %{
 5591   predicate(UseAVX == 0);
 5592   match(Set dst (AddVL dst src));
 5593   format %{ "paddq   $dst,$src\t! add packedL" %}
 5594   ins_encode %{
 5595     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5596   %}
 5597   ins_pipe( pipe_slow );
 5598 %}
 5599 
 5600 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5601   predicate(UseAVX > 0);
 5602   match(Set dst (AddVL src1 src2));
 5603   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5604   ins_encode %{
 5605     int vlen_enc = vector_length_encoding(this);
 5606     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5612   predicate((UseAVX > 0) &&
 5613             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5614   match(Set dst (AddVL src (LoadVector mem)));
 5615   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5616   ins_encode %{
 5617     int vlen_enc = vector_length_encoding(this);
 5618     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5619   %}
 5620   ins_pipe( pipe_slow );
 5621 %}
 5622 
 5623 // Floats vector add
 5624 instruct vaddF(vec dst, vec src) %{
 5625   predicate(UseAVX == 0);
 5626   match(Set dst (AddVF dst src));
 5627   format %{ "addps   $dst,$src\t! add packedF" %}
 5628   ins_encode %{
 5629     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5630   %}
 5631   ins_pipe( pipe_slow );
 5632 %}
 5633 
 5634 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5635   predicate(UseAVX > 0);
 5636   match(Set dst (AddVF src1 src2));
 5637   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5638   ins_encode %{
 5639     int vlen_enc = vector_length_encoding(this);
 5640     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5646   predicate((UseAVX > 0) &&
 5647             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5648   match(Set dst (AddVF src (LoadVector mem)));
 5649   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5650   ins_encode %{
 5651     int vlen_enc = vector_length_encoding(this);
 5652     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5653   %}
 5654   ins_pipe( pipe_slow );
 5655 %}
 5656 
 5657 // Doubles vector add
 5658 instruct vaddD(vec dst, vec src) %{
 5659   predicate(UseAVX == 0);
 5660   match(Set dst (AddVD dst src));
 5661   format %{ "addpd   $dst,$src\t! add packedD" %}
 5662   ins_encode %{
 5663     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5664   %}
 5665   ins_pipe( pipe_slow );
 5666 %}
 5667 
 5668 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5669   predicate(UseAVX > 0);
 5670   match(Set dst (AddVD src1 src2));
 5671   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5672   ins_encode %{
 5673     int vlen_enc = vector_length_encoding(this);
 5674     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5675   %}
 5676   ins_pipe( pipe_slow );
 5677 %}
 5678 
 5679 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5680   predicate((UseAVX > 0) &&
 5681             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5682   match(Set dst (AddVD src (LoadVector mem)));
 5683   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5684   ins_encode %{
 5685     int vlen_enc = vector_length_encoding(this);
 5686     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5687   %}
 5688   ins_pipe( pipe_slow );
 5689 %}
 5690 
 5691 // --------------------------------- SUB --------------------------------------
 5692 
 5693 // Bytes vector sub
 5694 instruct vsubB(vec dst, vec src) %{
 5695   predicate(UseAVX == 0);
 5696   match(Set dst (SubVB dst src));
 5697   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5698   ins_encode %{
 5699     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5700   %}
 5701   ins_pipe( pipe_slow );
 5702 %}
 5703 
 5704 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5705   predicate(UseAVX > 0);
 5706   match(Set dst (SubVB src1 src2));
 5707   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5708   ins_encode %{
 5709     int vlen_enc = vector_length_encoding(this);
 5710     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5711   %}
 5712   ins_pipe( pipe_slow );
 5713 %}
 5714 
 5715 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5716   predicate((UseAVX > 0) &&
 5717             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5718   match(Set dst (SubVB src (LoadVector mem)));
 5719   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5720   ins_encode %{
 5721     int vlen_enc = vector_length_encoding(this);
 5722     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5723   %}
 5724   ins_pipe( pipe_slow );
 5725 %}
 5726 
 5727 // Shorts/Chars vector sub
 5728 instruct vsubS(vec dst, vec src) %{
 5729   predicate(UseAVX == 0);
 5730   match(Set dst (SubVS dst src));
 5731   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5732   ins_encode %{
 5733     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5734   %}
 5735   ins_pipe( pipe_slow );
 5736 %}
 5737 
 5738 
 5739 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5740   predicate(UseAVX > 0);
 5741   match(Set dst (SubVS src1 src2));
 5742   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5743   ins_encode %{
 5744     int vlen_enc = vector_length_encoding(this);
 5745     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5746   %}
 5747   ins_pipe( pipe_slow );
 5748 %}
 5749 
 5750 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5751   predicate((UseAVX > 0) &&
 5752             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5753   match(Set dst (SubVS src (LoadVector mem)));
 5754   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5755   ins_encode %{
 5756     int vlen_enc = vector_length_encoding(this);
 5757     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5758   %}
 5759   ins_pipe( pipe_slow );
 5760 %}
 5761 
 5762 // Integers vector sub
 5763 instruct vsubI(vec dst, vec src) %{
 5764   predicate(UseAVX == 0);
 5765   match(Set dst (SubVI dst src));
 5766   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5767   ins_encode %{
 5768     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5769   %}
 5770   ins_pipe( pipe_slow );
 5771 %}
 5772 
 5773 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5774   predicate(UseAVX > 0);
 5775   match(Set dst (SubVI src1 src2));
 5776   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5777   ins_encode %{
 5778     int vlen_enc = vector_length_encoding(this);
 5779     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5780   %}
 5781   ins_pipe( pipe_slow );
 5782 %}
 5783 
 5784 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5785   predicate((UseAVX > 0) &&
 5786             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5787   match(Set dst (SubVI src (LoadVector mem)));
 5788   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5789   ins_encode %{
 5790     int vlen_enc = vector_length_encoding(this);
 5791     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5792   %}
 5793   ins_pipe( pipe_slow );
 5794 %}
 5795 
 5796 // Longs vector sub
 5797 instruct vsubL(vec dst, vec src) %{
 5798   predicate(UseAVX == 0);
 5799   match(Set dst (SubVL dst src));
 5800   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5801   ins_encode %{
 5802     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5803   %}
 5804   ins_pipe( pipe_slow );
 5805 %}
 5806 
 5807 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5808   predicate(UseAVX > 0);
 5809   match(Set dst (SubVL src1 src2));
 5810   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5811   ins_encode %{
 5812     int vlen_enc = vector_length_encoding(this);
 5813     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5814   %}
 5815   ins_pipe( pipe_slow );
 5816 %}
 5817 
 5818 
 5819 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5820   predicate((UseAVX > 0) &&
 5821             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5822   match(Set dst (SubVL src (LoadVector mem)));
 5823   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5824   ins_encode %{
 5825     int vlen_enc = vector_length_encoding(this);
 5826     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5827   %}
 5828   ins_pipe( pipe_slow );
 5829 %}
 5830 
 5831 // Floats vector sub
 5832 instruct vsubF(vec dst, vec src) %{
 5833   predicate(UseAVX == 0);
 5834   match(Set dst (SubVF dst src));
 5835   format %{ "subps   $dst,$src\t! sub packedF" %}
 5836   ins_encode %{
 5837     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5843   predicate(UseAVX > 0);
 5844   match(Set dst (SubVF src1 src2));
 5845   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5846   ins_encode %{
 5847     int vlen_enc = vector_length_encoding(this);
 5848     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5849   %}
 5850   ins_pipe( pipe_slow );
 5851 %}
 5852 
 5853 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5854   predicate((UseAVX > 0) &&
 5855             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5856   match(Set dst (SubVF src (LoadVector mem)));
 5857   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5858   ins_encode %{
 5859     int vlen_enc = vector_length_encoding(this);
 5860     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5861   %}
 5862   ins_pipe( pipe_slow );
 5863 %}
 5864 
 5865 // Doubles vector sub
 5866 instruct vsubD(vec dst, vec src) %{
 5867   predicate(UseAVX == 0);
 5868   match(Set dst (SubVD dst src));
 5869   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5870   ins_encode %{
 5871     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5872   %}
 5873   ins_pipe( pipe_slow );
 5874 %}
 5875 
 5876 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5877   predicate(UseAVX > 0);
 5878   match(Set dst (SubVD src1 src2));
 5879   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5880   ins_encode %{
 5881     int vlen_enc = vector_length_encoding(this);
 5882     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5883   %}
 5884   ins_pipe( pipe_slow );
 5885 %}
 5886 
 5887 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5888   predicate((UseAVX > 0) &&
 5889             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5890   match(Set dst (SubVD src (LoadVector mem)));
 5891   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5892   ins_encode %{
 5893     int vlen_enc = vector_length_encoding(this);
 5894     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5895   %}
 5896   ins_pipe( pipe_slow );
 5897 %}
 5898 
 5899 // --------------------------------- MUL --------------------------------------
 5900 
 5901 // Byte vector mul
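// x86 has no byte-wise multiply, so the product is assembled from 16-bit lane
// multiplies. Sketch, with each 16-bit lane holding bytes a_hi:a_lo and b_hi:b_lo:
//   even result byte = low 8 bits of (a_lo * b_lo)
//   odd  result byte = low 8 bits of (a_hi * b_hi), shifted into the high byte
//   lane = odd | even
// vmul8B below instead sign-extends the bytes to words, multiplies, masks each
// word to its low byte and re-packs.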
 5902 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5903   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5904   match(Set dst (MulVB src1 src2));
 5905   effect(TEMP dst, TEMP xtmp);
 5906   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5907   ins_encode %{
 5908     assert(UseSSE > 3, "required");
 5909     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5910     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5911     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5912     __ psllw($dst$$XMMRegister, 8);
 5913     __ psrlw($dst$$XMMRegister, 8);
 5914     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5915   %}
 5916   ins_pipe( pipe_slow );
 5917 %}
 5918 
 5919 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5920   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5921   match(Set dst (MulVB src1 src2));
 5922   effect(TEMP dst, TEMP xtmp);
 5923   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5924   ins_encode %{
 5925     assert(UseSSE > 3, "required");
 5926     // Odd-index elements
 5927     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5928     __ psrlw($dst$$XMMRegister, 8);
 5929     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5930     __ psrlw($xtmp$$XMMRegister, 8);
 5931     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5932     __ psllw($dst$$XMMRegister, 8);
 5933     // Even-index elements
 5934     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5935     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5936     __ psllw($xtmp$$XMMRegister, 8);
 5937     __ psrlw($xtmp$$XMMRegister, 8);
 5938     // Combine
 5939     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5940   %}
 5941   ins_pipe( pipe_slow );
 5942 %}
 5943 
 5944 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5945   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5946   match(Set dst (MulVB src1 src2));
 5947   effect(TEMP xtmp1, TEMP xtmp2);
 5948   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5949   ins_encode %{
 5950     int vlen_enc = vector_length_encoding(this);
 5951     // Odd-index elements
 5952     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5953     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5954     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5955     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5956     // Even-index elements
 5957     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5958     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5959     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5960     // Combine
 5961     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5962   %}
 5963   ins_pipe( pipe_slow );
 5964 %}
 5965 
 5966 // Shorts/Chars vector mul
 5967 instruct vmulS(vec dst, vec src) %{
 5968   predicate(UseAVX == 0);
 5969   match(Set dst (MulVS dst src));
 5970   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5971   ins_encode %{
 5972     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5973   %}
 5974   ins_pipe( pipe_slow );
 5975 %}
 5976 
 5977 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5978   predicate(UseAVX > 0);
 5979   match(Set dst (MulVS src1 src2));
 5980   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5981   ins_encode %{
 5982     int vlen_enc = vector_length_encoding(this);
 5983     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5984   %}
 5985   ins_pipe( pipe_slow );
 5986 %}
 5987 
 5988 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5989   predicate((UseAVX > 0) &&
 5990             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5991   match(Set dst (MulVS src (LoadVector mem)));
 5992   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5993   ins_encode %{
 5994     int vlen_enc = vector_length_encoding(this);
 5995     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5996   %}
 5997   ins_pipe( pipe_slow );
 5998 %}
 5999 
 6000 // Integers vector mul
 6001 instruct vmulI(vec dst, vec src) %{
 6002   predicate(UseAVX == 0);
 6003   match(Set dst (MulVI dst src));
 6004   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6005   ins_encode %{
 6006     assert(UseSSE > 3, "required");
 6007     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6008   %}
 6009   ins_pipe( pipe_slow );
 6010 %}
 6011 
 6012 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6013   predicate(UseAVX > 0);
 6014   match(Set dst (MulVI src1 src2));
 6015   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6016   ins_encode %{
 6017     int vlen_enc = vector_length_encoding(this);
 6018     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6019   %}
 6020   ins_pipe( pipe_slow );
 6021 %}
 6022 
 6023 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6024   predicate((UseAVX > 0) &&
 6025             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6026   match(Set dst (MulVI src (LoadVector mem)));
 6027   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6028   ins_encode %{
 6029     int vlen_enc = vector_length_encoding(this);
 6030     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6031   %}
 6032   ins_pipe( pipe_slow );
 6033 %}
 6034 
 6035 // Longs vector mul
 6036 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6037   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6038              VM_Version::supports_avx512dq()) ||
 6039             VM_Version::supports_avx512vldq());
 6040   match(Set dst (MulVL src1 src2));
 6041   ins_cost(500);
 6042   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6043   ins_encode %{
 6044     assert(UseAVX > 2, "required");
 6045     int vlen_enc = vector_length_encoding(this);
 6046     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6047   %}
 6048   ins_pipe( pipe_slow );
 6049 %}
 6050 
 6051 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6052   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6053              VM_Version::supports_avx512dq()) ||
 6054             (Matcher::vector_length_in_bytes(n) > 8 &&
 6055              VM_Version::supports_avx512vldq()));
 6056   match(Set dst (MulVL src (LoadVector mem)));
 6057   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6058   ins_cost(500);
 6059   ins_encode %{
 6060     assert(UseAVX > 2, "required");
 6061     int vlen_enc = vector_length_encoding(this);
 6062     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6063   %}
 6064   ins_pipe( pipe_slow );
 6065 %}
 6066 
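// Without a packed 64x64->64 multiply (evpmullq above needs AVX512DQ/VLDQ) the
// product is assembled from 32-bit pieces. Writing each 64-bit lane as
// a = a_hi:a_lo and b = b_hi:b_lo:
//   a * b = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)   (mod 2^64)
// The cross products are formed with a 32-bit multiply after swapping the
// halves, summed, shifted up by 32, and the unsigned lo*lo product (pmuludq)
// is added in.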
 6067 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6068   predicate(UseAVX == 0);
 6069   match(Set dst (MulVL src1 src2));
 6070   ins_cost(500);
 6071   effect(TEMP dst, TEMP xtmp);
 6072   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6073   ins_encode %{
 6074     assert(VM_Version::supports_sse4_1(), "required");
    // Get the lo-hi products; only the lower 32 bits of each are needed
 6076     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6077     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6078     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6079     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6080     __ psllq($dst$$XMMRegister, 32);
 6081     // Get the lo-lo products
 6082     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6083     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6084     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6085   %}
 6086   ins_pipe( pipe_slow );
 6087 %}
 6088 
 6089 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6090   predicate(UseAVX > 0 &&
 6091             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6092               !VM_Version::supports_avx512dq()) ||
 6093              (Matcher::vector_length_in_bytes(n) < 64 &&
 6094               !VM_Version::supports_avx512vldq())));
 6095   match(Set dst (MulVL src1 src2));
 6096   effect(TEMP xtmp1, TEMP xtmp2);
 6097   ins_cost(500);
 6098   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6099   ins_encode %{
 6100     int vlen_enc = vector_length_encoding(this);
    // Get the lo-hi products; only the lower 32 bits of each are needed
 6102     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6103     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6104     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6105     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6106     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6107     // Get the lo-lo products
 6108     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6109     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6110   %}
 6111   ins_pipe( pipe_slow );
 6112 %}
 6113 
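// When C2 has proved that both inputs of the 64-bit lanes fit in 32 bits, a
// single widening multiply suffices: vpmuludq for unsigned inputs, vpmuldq for
// signed ones; hence the much lower ins_cost.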
 6114 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6115   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6116   match(Set dst (MulVL src1 src2));
 6117   ins_cost(100);
 6118   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6119   ins_encode %{
 6120     int vlen_enc = vector_length_encoding(this);
 6121     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6122   %}
 6123   ins_pipe( pipe_slow );
 6124 %}
 6125 
 6126 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6127   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6128   match(Set dst (MulVL src1 src2));
 6129   ins_cost(100);
 6130   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6131   ins_encode %{
 6132     int vlen_enc = vector_length_encoding(this);
 6133     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6134   %}
 6135   ins_pipe( pipe_slow );
 6136 %}
 6137 
 6138 // Floats vector mul
 6139 instruct vmulF(vec dst, vec src) %{
 6140   predicate(UseAVX == 0);
 6141   match(Set dst (MulVF dst src));
 6142   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6143   ins_encode %{
 6144     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6145   %}
 6146   ins_pipe( pipe_slow );
 6147 %}
 6148 
 6149 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6150   predicate(UseAVX > 0);
 6151   match(Set dst (MulVF src1 src2));
 6152   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6153   ins_encode %{
 6154     int vlen_enc = vector_length_encoding(this);
 6155     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6156   %}
 6157   ins_pipe( pipe_slow );
 6158 %}
 6159 
 6160 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6161   predicate((UseAVX > 0) &&
 6162             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6163   match(Set dst (MulVF src (LoadVector mem)));
 6164   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6165   ins_encode %{
 6166     int vlen_enc = vector_length_encoding(this);
 6167     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6168   %}
 6169   ins_pipe( pipe_slow );
 6170 %}
 6171 
 6172 // Doubles vector mul
 6173 instruct vmulD(vec dst, vec src) %{
 6174   predicate(UseAVX == 0);
 6175   match(Set dst (MulVD dst src));
 6176   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6177   ins_encode %{
 6178     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6179   %}
 6180   ins_pipe( pipe_slow );
 6181 %}
 6182 
 6183 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6184   predicate(UseAVX > 0);
 6185   match(Set dst (MulVD src1 src2));
 6186   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6187   ins_encode %{
 6188     int vlen_enc = vector_length_encoding(this);
 6189     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6190   %}
 6191   ins_pipe( pipe_slow );
 6192 %}
 6193 
 6194 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6195   predicate((UseAVX > 0) &&
 6196             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6197   match(Set dst (MulVD src (LoadVector mem)));
 6198   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6199   ins_encode %{
 6200     int vlen_enc = vector_length_encoding(this);
 6201     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6202   %}
 6203   ins_pipe( pipe_slow );
 6204 %}
 6205 
 6206 // --------------------------------- DIV --------------------------------------
 6207 
 6208 // Floats vector div
 6209 instruct vdivF(vec dst, vec src) %{
 6210   predicate(UseAVX == 0);
 6211   match(Set dst (DivVF dst src));
 6212   format %{ "divps   $dst,$src\t! div packedF" %}
 6213   ins_encode %{
 6214     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6215   %}
 6216   ins_pipe( pipe_slow );
 6217 %}
 6218 
 6219 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6220   predicate(UseAVX > 0);
 6221   match(Set dst (DivVF src1 src2));
 6222   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6223   ins_encode %{
 6224     int vlen_enc = vector_length_encoding(this);
 6225     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6226   %}
 6227   ins_pipe( pipe_slow );
 6228 %}
 6229 
 6230 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6231   predicate((UseAVX > 0) &&
 6232             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6233   match(Set dst (DivVF src (LoadVector mem)));
 6234   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6235   ins_encode %{
 6236     int vlen_enc = vector_length_encoding(this);
 6237     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 // Doubles vector div
 6243 instruct vdivD(vec dst, vec src) %{
 6244   predicate(UseAVX == 0);
 6245   match(Set dst (DivVD dst src));
 6246   format %{ "divpd   $dst,$src\t! div packedD" %}
 6247   ins_encode %{
 6248     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6249   %}
 6250   ins_pipe( pipe_slow );
 6251 %}
 6252 
 6253 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6254   predicate(UseAVX > 0);
 6255   match(Set dst (DivVD src1 src2));
 6256   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6257   ins_encode %{
 6258     int vlen_enc = vector_length_encoding(this);
 6259     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6260   %}
 6261   ins_pipe( pipe_slow );
 6262 %}
 6263 
 6264 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6265   predicate((UseAVX > 0) &&
 6266             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6267   match(Set dst (DivVD src (LoadVector mem)));
 6268   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6269   ins_encode %{
 6270     int vlen_enc = vector_length_encoding(this);
 6271     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6272   %}
 6273   ins_pipe( pipe_slow );
 6274 %}
 6275 
 6276 // ------------------------------ MinMax ---------------------------------------
 6277 
 6278 // Byte, Short, Int vector Min/Max
 6279 instruct minmax_reg_sse(vec dst, vec src) %{
 6280   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6281             UseAVX == 0);
 6282   match(Set dst (MinV dst src));
 6283   match(Set dst (MaxV dst src));
 6284   format %{ "vector_minmax  $dst,$src\t!  " %}
 6285   ins_encode %{
 6286     assert(UseSSE >= 4, "required");
 6287 
 6288     int opcode = this->ideal_Opcode();
 6289     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6290     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6291   %}
 6292   ins_pipe( pipe_slow );
 6293 %}
 6294 
 6295 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6296   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6297             UseAVX > 0);
 6298   match(Set dst (MinV src1 src2));
 6299   match(Set dst (MaxV src1 src2));
 6300   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6301   ins_encode %{
 6302     int opcode = this->ideal_Opcode();
 6303     int vlen_enc = vector_length_encoding(this);
 6304     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6305 
 6306     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6307   %}
 6308   ins_pipe( pipe_slow );
 6309 %}
 6310 
 6311 // Long vector Min/Max
 6312 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6313   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6314             UseAVX == 0);
 6315   match(Set dst (MinV dst src));
 6316   match(Set dst (MaxV src dst));
 6317   effect(TEMP dst, TEMP tmp);
 6318   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6319   ins_encode %{
 6320     assert(UseSSE >= 4, "required");
 6321 
 6322     int opcode = this->ideal_Opcode();
 6323     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6324     assert(elem_bt == T_LONG, "sanity");
 6325 
 6326     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6327   %}
 6328   ins_pipe( pipe_slow );
 6329 %}
 6330 
 6331 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6332   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6333             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6334   match(Set dst (MinV src1 src2));
 6335   match(Set dst (MaxV src1 src2));
 6336   effect(TEMP dst);
 6337   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6338   ins_encode %{
 6339     int vlen_enc = vector_length_encoding(this);
 6340     int opcode = this->ideal_Opcode();
 6341     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6342     assert(elem_bt == T_LONG, "sanity");
 6343 
 6344     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6345   %}
 6346   ins_pipe( pipe_slow );
 6347 %}
 6348 
 6349 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6350   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6351             Matcher::vector_element_basic_type(n) == T_LONG);
 6352   match(Set dst (MinV src1 src2));
 6353   match(Set dst (MaxV src1 src2));
 6354   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6355   ins_encode %{
 6356     assert(UseAVX > 2, "required");
 6357 
 6358     int vlen_enc = vector_length_encoding(this);
 6359     int opcode = this->ideal_Opcode();
 6360     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6361     assert(elem_bt == T_LONG, "sanity");
 6362 
 6363     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6364   %}
 6365   ins_pipe( pipe_slow );
 6366 %}
 6367 
 6368 // Float/Double vector Min/Max
 6369 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6370   predicate(VM_Version::supports_avx10_2() &&
 6371             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6372   match(Set dst (MinV a b));
 6373   match(Set dst (MaxV a b));
 6374   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6375   ins_encode %{
 6376     int vlen_enc = vector_length_encoding(this);
 6377     int opcode = this->ideal_Opcode();
 6378     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6379     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6380   %}
 6381   ins_pipe( pipe_slow );
 6382 %}
 6383 
 6384 // Float/Double vector Min/Max
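      // Note: Java Math.min/max semantics do not map directly onto minps/maxps:
      // NaN must propagate (min(x, NaN) == NaN) and -0.0 compares smaller than
      // +0.0, while minps/maxps simply return the second operand when either
      // input is NaN or when both inputs are zeros. For instance,
      // Math.min(-0.0f, +0.0f) == -0.0f, but minps(-0.0, +0.0) yields +0.0.
      // Hence the extra temporaries (and, in the EVEX flavor, the mask
      // register) used by the fixup sequences below.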
 6385 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6386   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6387             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6388             UseAVX > 0);
 6389   match(Set dst (MinV a b));
 6390   match(Set dst (MaxV a b));
 6391   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6392   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6393   ins_encode %{
 6394     assert(UseAVX > 0, "required");
 6395 
 6396     int opcode = this->ideal_Opcode();
 6397     int vlen_enc = vector_length_encoding(this);
 6398     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6399 
 6400     __ vminmax_fp(opcode, elem_bt,
 6401                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6402                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6403   %}
 6404   ins_pipe( pipe_slow );
 6405 %}
 6406 
 6407 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6408   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6409             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6410   match(Set dst (MinV a b));
 6411   match(Set dst (MaxV a b));
 6412   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6413   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6414   ins_encode %{
 6415     assert(UseAVX > 2, "required");
 6416 
 6417     int opcode = this->ideal_Opcode();
 6418     int vlen_enc = vector_length_encoding(this);
 6419     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6420 
 6421     __ evminmax_fp(opcode, elem_bt,
 6422                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6423                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6424   %}
 6425   ins_pipe( pipe_slow );
 6426 %}
 6427 
 6428 // ------------------------------ Unsigned vector Min/Max ----------------------
 6429 
 6430 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6431   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6432   match(Set dst (UMinV a b));
 6433   match(Set dst (UMaxV a b));
 6434   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6435   ins_encode %{
 6436     int opcode = this->ideal_Opcode();
 6437     int vlen_enc = vector_length_encoding(this);
 6438     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6439     assert(is_integral_type(elem_bt), "");
 6440     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6441   %}
 6442   ins_pipe( pipe_slow );
 6443 %}
 6444 
 6445 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6446   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6447   match(Set dst (UMinV a (LoadVector b)));
 6448   match(Set dst (UMaxV a (LoadVector b)));
 6449   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6450   ins_encode %{
 6451     int opcode = this->ideal_Opcode();
 6452     int vlen_enc = vector_length_encoding(this);
 6453     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6454     assert(is_integral_type(elem_bt), "");
 6455     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6456   %}
 6457   ins_pipe( pipe_slow );
 6458 %}
 6459 
 6460 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6461   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6462   match(Set dst (UMinV a b));
 6463   match(Set dst (UMaxV a b));
 6464   effect(TEMP xtmp1, TEMP xtmp2);
 6465   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6466   ins_encode %{
 6467     int opcode = this->ideal_Opcode();
 6468     int vlen_enc = vector_length_encoding(this);
 6469     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6470   %}
 6471   ins_pipe( pipe_slow );
 6472 %}
 6473 
 6474 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6475   match(Set dst (UMinV (Binary dst src2) mask));
 6476   match(Set dst (UMaxV (Binary dst src2) mask));
 6477   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6478   ins_encode %{
 6479     int vlen_enc = vector_length_encoding(this);
 6480     BasicType bt = Matcher::vector_element_basic_type(this);
 6481     int opc = this->ideal_Opcode();
 6482     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6483                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6484   %}
 6485   ins_pipe( pipe_slow );
 6486 %}
 6487 
 6488 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6489   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6490   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6491   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6492   ins_encode %{
 6493     int vlen_enc = vector_length_encoding(this);
 6494     BasicType bt = Matcher::vector_element_basic_type(this);
 6495     int opc = this->ideal_Opcode();
 6496     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6497                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6498   %}
 6499   ins_pipe( pipe_slow );
 6500 %}
 6501 
 6502 // --------------------------------- Signum/CopySign ---------------------------
 6503 
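      // Math.signum returns +1.0/-1.0 for positive/negative inputs and passes
      // 0.0, -0.0 and NaN through unchanged, e.g. signum(-0.0f) == -0.0f and
      // signum(NaN) == NaN. The $zero and $one operands are the constants
      // consumed by the signum helpers when building that result.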
 6504 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6505   match(Set dst (SignumF dst (Binary zero one)));
 6506   effect(KILL cr);
 6507   format %{ "signumF $dst, $dst" %}
 6508   ins_encode %{
 6509     int opcode = this->ideal_Opcode();
 6510     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6511   %}
 6512   ins_pipe( pipe_slow );
 6513 %}
 6514 
 6515 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6516   match(Set dst (SignumD dst (Binary zero one)));
 6517   effect(KILL cr);
 6518   format %{ "signumD $dst, $dst" %}
 6519   ins_encode %{
 6520     int opcode = this->ideal_Opcode();
 6521     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6522   %}
 6523   ins_pipe( pipe_slow );
 6524 %}
 6525 
 6526 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6527   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6528   match(Set dst (SignumVF src (Binary zero one)));
 6529   match(Set dst (SignumVD src (Binary zero one)));
 6530   effect(TEMP dst, TEMP xtmp1);
 6531   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6532   ins_encode %{
 6533     int opcode = this->ideal_Opcode();
 6534     int vec_enc = vector_length_encoding(this);
 6535     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6536                          $xtmp1$$XMMRegister, vec_enc);
 6537   %}
 6538   ins_pipe( pipe_slow );
 6539 %}
 6540 
 6541 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6542   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6543   match(Set dst (SignumVF src (Binary zero one)));
 6544   match(Set dst (SignumVD src (Binary zero one)));
 6545   effect(TEMP dst, TEMP ktmp1);
 6546   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6547   ins_encode %{
 6548     int opcode = this->ideal_Opcode();
 6549     int vec_enc = vector_length_encoding(this);
 6550     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6551                           $ktmp1$$KRegister, vec_enc);
 6552   %}
 6553   ins_pipe( pipe_slow );
 6554 %}
 6555 
 6556 // ---------------------------------------
 6557 // For copySign use 0xE4 as the truth-table immediate (imm8) for vpternlog
 6558 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6559 // C (xmm2) is set to 0x7FFFFFFF
 6560 // Wherever xmm2 is 0, we want to pick from B (sign)
 6561 // Wherever xmm2 is 1, we want to pick from A (src)
 6562 //
 6563 // A B C Result
 6564 // 0 0 0 0
 6565 // 0 0 1 0
 6566 // 0 1 0 1
 6567 // 0 1 1 0
 6568 // 1 0 0 0
 6569 // 1 0 1 1
 6570 // 1 1 0 1
 6571 // 1 1 1 1
 6572 //
 6573 // Result going from high bit to low bit is 0b11100100 = 0xE4
 6574 // ---------------------------------------
 6575 
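      // Sanity check of the immediate: vpternlog computes, for every bit
      // position, result = (imm8 >> (A*4 + B*2 + C)) & 1, so 0xE4 realizes
      // "C ? A : B". With C = 0x7FFFFFFF the sign bit is taken from the sign
      // operand and all other bits from the magnitude operand. Equivalent
      // scalar sketch (illustrative only, not part of the generated code):
      //
      //   uint32_t copysign_bits(uint32_t magnitude, uint32_t sign) {
      //     const uint32_t mask = 0x7FFFFFFFu;           // the C operand
      //     return (magnitude & mask) | (sign & ~mask);  // vpternlog 0xE4
      //   }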
 6576 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6577   match(Set dst (CopySignF dst src));
 6578   effect(TEMP tmp1, TEMP tmp2);
 6579   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6580   ins_encode %{
 6581     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6582     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6583     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6584   %}
 6585   ins_pipe( pipe_slow );
 6586 %}
 6587 
 6588 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6589   match(Set dst (CopySignD dst (Binary src zero)));
 6590   ins_cost(100);
 6591   effect(TEMP tmp1, TEMP tmp2);
 6592   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6593   ins_encode %{
 6594     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6595     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6596     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6597   %}
 6598   ins_pipe( pipe_slow );
 6599 %}
 6600 
 6601 //----------------------------- CompressBits/ExpandBits ------------------------
 6602 
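      // CompressBits maps to BMI2 pext (gather the bits of src selected by
      // mask into the low-order bits of dst) and ExpandBits maps to BMI2 pdep
      // (scatter the low-order bits of src to the positions selected by mask).
      // Illustrative example:
      //   pext(src = 0b10110010, mask = 0b11001100) == 0b00001000
      //   pdep(src = 0b00001011, mask = 0b11001100) == 0b10001100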
 6603 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6604   predicate(n->bottom_type()->isa_int());
 6605   match(Set dst (CompressBits src mask));
 6606   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6607   ins_encode %{
 6608     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6609   %}
 6610   ins_pipe( pipe_slow );
 6611 %}
 6612 
 6613 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6614   predicate(n->bottom_type()->isa_int());
 6615   match(Set dst (ExpandBits src mask));
 6616   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6617   ins_encode %{
 6618     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6619   %}
 6620   ins_pipe( pipe_slow );
 6621 %}
 6622 
 6623 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6624   predicate(n->bottom_type()->isa_int());
 6625   match(Set dst (CompressBits src (LoadI mask)));
 6626   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6627   ins_encode %{
 6628     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6629   %}
 6630   ins_pipe( pipe_slow );
 6631 %}
 6632 
 6633 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6634   predicate(n->bottom_type()->isa_int());
 6635   match(Set dst (ExpandBits src (LoadI mask)));
 6636   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6637   ins_encode %{
 6638     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6639   %}
 6640   ins_pipe( pipe_slow );
 6641 %}
 6642 
 6643 // --------------------------------- Sqrt --------------------------------------
 6644 
 6645 instruct vsqrtF_reg(vec dst, vec src) %{
 6646   match(Set dst (SqrtVF src));
 6647   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6648   ins_encode %{
 6649     assert(UseAVX > 0, "required");
 6650     int vlen_enc = vector_length_encoding(this);
 6651     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6652   %}
 6653   ins_pipe( pipe_slow );
 6654 %}
 6655 
 6656 instruct vsqrtF_mem(vec dst, memory mem) %{
 6657   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6658   match(Set dst (SqrtVF (LoadVector mem)));
 6659   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6660   ins_encode %{
 6661     assert(UseAVX > 0, "required");
 6662     int vlen_enc = vector_length_encoding(this);
 6663     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6664   %}
 6665   ins_pipe( pipe_slow );
 6666 %}
 6667 
 6668 // Doubles vector sqrt
 6669 instruct vsqrtD_reg(vec dst, vec src) %{
 6670   match(Set dst (SqrtVD src));
 6671   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6672   ins_encode %{
 6673     assert(UseAVX > 0, "required");
 6674     int vlen_enc = vector_length_encoding(this);
 6675     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6676   %}
 6677   ins_pipe( pipe_slow );
 6678 %}
 6679 
 6680 instruct vsqrtD_mem(vec dst, memory mem) %{
 6681   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6682   match(Set dst (SqrtVD (LoadVector mem)));
 6683   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6684   ins_encode %{
 6685     assert(UseAVX > 0, "required");
 6686     int vlen_enc = vector_length_encoding(this);
 6687     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6688   %}
 6689   ins_pipe( pipe_slow );
 6690 %}
 6691 
 6692 // ------------------------------ Shift ---------------------------------------
 6693 
 6694 // Left and right shift count vectors are the same on x86
 6695 // (only lowest bits of xmm reg are used for count).
 6696 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6697   match(Set dst (LShiftCntV cnt));
 6698   match(Set dst (RShiftCntV cnt));
 6699   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6700   ins_encode %{
 6701     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6702   %}
 6703   ins_pipe( pipe_slow );
 6704 %}
 6705 
 6706 // Byte vector shift
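      // x86 has no packed byte shift instruction, so byte shifts widen to
      // 16-bit lanes (vextendbw), shift as words, mask each word back down to
      // a byte and re-pack (packuswb), as in the encodings below.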
 6707 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6708   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6709   match(Set dst ( LShiftVB src shift));
 6710   match(Set dst ( RShiftVB src shift));
 6711   match(Set dst (URShiftVB src shift));
 6712   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6713   format %{"vector_byte_shift $dst,$src,$shift" %}
 6714   ins_encode %{
 6715     assert(UseSSE > 3, "required");
 6716     int opcode = this->ideal_Opcode();
 6717     bool sign = (opcode != Op_URShiftVB);
 6718     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6719     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6720     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6721     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6722     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6723   %}
 6724   ins_pipe( pipe_slow );
 6725 %}
 6726 
 6727 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6728   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6729             UseAVX <= 1);
 6730   match(Set dst ( LShiftVB src shift));
 6731   match(Set dst ( RShiftVB src shift));
 6732   match(Set dst (URShiftVB src shift));
 6733   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6734   format %{"vector_byte_shift $dst,$src,$shift" %}
 6735   ins_encode %{
 6736     assert(UseSSE > 3, "required");
 6737     int opcode = this->ideal_Opcode();
 6738     bool sign = (opcode != Op_URShiftVB);
 6739     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6740     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6741     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6742     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6743     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6744     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6745     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6746     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6747     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6748   %}
 6749   ins_pipe( pipe_slow );
 6750 %}
 6751 
 6752 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6753   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6754             UseAVX > 1);
 6755   match(Set dst ( LShiftVB src shift));
 6756   match(Set dst ( RShiftVB src shift));
 6757   match(Set dst (URShiftVB src shift));
 6758   effect(TEMP dst, TEMP tmp);
 6759   format %{"vector_byte_shift $dst,$src,$shift" %}
 6760   ins_encode %{
 6761     int opcode = this->ideal_Opcode();
 6762     bool sign = (opcode != Op_URShiftVB);
 6763     int vlen_enc = Assembler::AVX_256bit;
 6764     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6765     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6766     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6767     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6768     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6769   %}
 6770   ins_pipe( pipe_slow );
 6771 %}
 6772 
 6773 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6774   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6775   match(Set dst ( LShiftVB src shift));
 6776   match(Set dst ( RShiftVB src shift));
 6777   match(Set dst (URShiftVB src shift));
 6778   effect(TEMP dst, TEMP tmp);
 6779   format %{"vector_byte_shift $dst,$src,$shift" %}
 6780   ins_encode %{
 6781     assert(UseAVX > 1, "required");
 6782     int opcode = this->ideal_Opcode();
 6783     bool sign = (opcode != Op_URShiftVB);
 6784     int vlen_enc = Assembler::AVX_256bit;
 6785     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6786     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6787     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6788     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6789     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6790     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6791     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6792     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6793     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6794   %}
 6795   ins_pipe( pipe_slow );
 6796 %}
 6797 
 6798 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6799   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6800   match(Set dst ( LShiftVB src shift));
 6801   match(Set dst ( RShiftVB src shift));
 6802   match(Set dst (URShiftVB src shift));
 6803   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6804   format %{"vector_byte_shift $dst,$src,$shift" %}
 6805   ins_encode %{
 6806     assert(UseAVX > 2, "required");
 6807     int opcode = this->ideal_Opcode();
 6808     bool sign = (opcode != Op_URShiftVB);
 6809     int vlen_enc = Assembler::AVX_512bit;
 6810     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6811     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6812     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6813     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6814     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6815     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6816     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6817     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6818     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6819     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6820     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6821     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6822   %}
 6823   ins_pipe( pipe_slow );
 6824 %}
 6825 
 6826 // Shorts vector logical right shift produces an incorrect Java result
 6827 // for negative data because Java code converts short values into int with
 6828 // sign extension before the shift. Char vectors are fine since chars are
 6829 // unsigned values.
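      // Worked example (illustrative): for short s = -1 and shift 3, Java
      // evaluates s >>> 3 on the sign-extended int 0xFFFFFFFF, giving
      // 0x1FFFFFFF, and narrowing back to short keeps 0xFFFF (-1). A packed
      // 16-bit logical shift of 0xFFFF by 3 would instead produce 0x1FFF
      // (8191), so the packed form is only correct for char data, where the
      // promotion is a zero extension and the two results agree.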
 6830 // Shorts/Chars vector left shift
 6831 instruct vshiftS(vec dst, vec src, vec shift) %{
 6832   predicate(!n->as_ShiftV()->is_var_shift());
 6833   match(Set dst ( LShiftVS src shift));
 6834   match(Set dst ( RShiftVS src shift));
 6835   match(Set dst (URShiftVS src shift));
 6836   effect(TEMP dst, USE src, USE shift);
 6837   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6838   ins_encode %{
 6839     int opcode = this->ideal_Opcode();
 6840     if (UseAVX > 0) {
 6841       int vlen_enc = vector_length_encoding(this);
 6842       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6843     } else {
 6844       int vlen = Matcher::vector_length(this);
 6845       if (vlen == 2) {
 6846         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6847         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6848       } else if (vlen == 4) {
 6849         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6850         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6851       } else {
 6852         assert (vlen == 8, "sanity");
 6853         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6854         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6855       }
 6856     }
 6857   %}
 6858   ins_pipe( pipe_slow );
 6859 %}
 6860 
 6861 // Integers vector left shift
 6862 instruct vshiftI(vec dst, vec src, vec shift) %{
 6863   predicate(!n->as_ShiftV()->is_var_shift());
 6864   match(Set dst ( LShiftVI src shift));
 6865   match(Set dst ( RShiftVI src shift));
 6866   match(Set dst (URShiftVI src shift));
 6867   effect(TEMP dst, USE src, USE shift);
 6868   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6869   ins_encode %{
 6870     int opcode = this->ideal_Opcode();
 6871     if (UseAVX > 0) {
 6872       int vlen_enc = vector_length_encoding(this);
 6873       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6874     } else {
 6875       int vlen = Matcher::vector_length(this);
 6876       if (vlen == 2) {
 6877         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6878         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6879       } else {
 6880         assert(vlen == 4, "sanity");
 6881         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6882         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6883       }
 6884     }
 6885   %}
 6886   ins_pipe( pipe_slow );
 6887 %}
 6888 
 6889 // Integers vector left constant shift
 6890 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6891   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6892   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6893   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6894   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6895   ins_encode %{
 6896     int opcode = this->ideal_Opcode();
 6897     if (UseAVX > 0) {
 6898       int vector_len = vector_length_encoding(this);
 6899       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6900     } else {
 6901       int vlen = Matcher::vector_length(this);
 6902       if (vlen == 2) {
 6903         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6904         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6905       } else {
 6906         assert(vlen == 4, "sanity");
 6907         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6908         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6909       }
 6910     }
 6911   %}
 6912   ins_pipe( pipe_slow );
 6913 %}
 6914 
 6915 // Longs vector shift
 6916 instruct vshiftL(vec dst, vec src, vec shift) %{
 6917   predicate(!n->as_ShiftV()->is_var_shift());
 6918   match(Set dst ( LShiftVL src shift));
 6919   match(Set dst (URShiftVL src shift));
 6920   effect(TEMP dst, USE src, USE shift);
 6921   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6922   ins_encode %{
 6923     int opcode = this->ideal_Opcode();
 6924     if (UseAVX > 0) {
 6925       int vlen_enc = vector_length_encoding(this);
 6926       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6927     } else {
 6928       assert(Matcher::vector_length(this) == 2, "");
 6929       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6930       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6931     }
 6932   %}
 6933   ins_pipe( pipe_slow );
 6934 %}
 6935 
 6936 // Longs vector constant shift
 6937 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6938   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6939   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6940   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6941   ins_encode %{
 6942     int opcode = this->ideal_Opcode();
 6943     if (UseAVX > 0) {
 6944       int vector_len = vector_length_encoding(this);
 6945       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6946     } else {
 6947       assert(Matcher::vector_length(this) == 2, "");
 6948       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6949       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6950     }
 6951   %}
 6952   ins_pipe( pipe_slow );
 6953 %}
 6954 
 6955 // -------------------ArithmeticRightShift -----------------------------------
 6956 // Long vector arithmetic right shift
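      // Before AVX-512 there is no packed arithmetic right shift for longs, so
      // it is synthesized from the logical shift using the identity
      //   sra(x, n) == (srl(x, n) ^ m) - m, where m = srl(sign_mask, n).
      // Worked example (illustrative, 8-bit for brevity): x = 0xF0 (-16),
      // n = 2: srl gives 0x3C, m = 0x20, (0x3C ^ 0x20) - 0x20 = 0xFC (-4).
      // The encodings below apply the same identity lane-wise with the 64-bit
      // sign mask.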
 6957 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6958   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6959   match(Set dst (RShiftVL src shift));
 6960   effect(TEMP dst, TEMP tmp);
 6961   format %{ "vshiftq $dst,$src,$shift" %}
 6962   ins_encode %{
 6963     uint vlen = Matcher::vector_length(this);
 6964     if (vlen == 2) {
 6965       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6966       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6967       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6968       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6969       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6970       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6971     } else {
 6972       assert(vlen == 4, "sanity");
 6973       assert(UseAVX > 1, "required");
 6974       int vlen_enc = Assembler::AVX_256bit;
 6975       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6976       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6977       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6978       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6979       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6980     }
 6981   %}
 6982   ins_pipe( pipe_slow );
 6983 %}
 6984 
 6985 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6986   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6987   match(Set dst (RShiftVL src shift));
 6988   format %{ "vshiftq $dst,$src,$shift" %}
 6989   ins_encode %{
 6990     int vlen_enc = vector_length_encoding(this);
 6991     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6992   %}
 6993   ins_pipe( pipe_slow );
 6994 %}
 6995 
 6996 // ------------------- Variable Shift -----------------------------
 6997 // Byte variable shift
 6998 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6999   predicate(Matcher::vector_length(n) <= 8 &&
 7000             n->as_ShiftV()->is_var_shift() &&
 7001             !VM_Version::supports_avx512bw());
 7002   match(Set dst ( LShiftVB src shift));
 7003   match(Set dst ( RShiftVB src shift));
 7004   match(Set dst (URShiftVB src shift));
 7005   effect(TEMP dst, TEMP vtmp);
 7006   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7007   ins_encode %{
 7008     assert(UseAVX >= 2, "required");
 7009 
 7010     int opcode = this->ideal_Opcode();
 7011     int vlen_enc = Assembler::AVX_128bit;
 7012     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7013     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7014   %}
 7015   ins_pipe( pipe_slow );
 7016 %}
 7017 
 7018 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7019   predicate(Matcher::vector_length(n) == 16 &&
 7020             n->as_ShiftV()->is_var_shift() &&
 7021             !VM_Version::supports_avx512bw());
 7022   match(Set dst ( LShiftVB src shift));
 7023   match(Set dst ( RShiftVB src shift));
 7024   match(Set dst (URShiftVB src shift));
 7025   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7026   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7027   ins_encode %{
 7028     assert(UseAVX >= 2, "required");
 7029 
 7030     int opcode = this->ideal_Opcode();
 7031     int vlen_enc = Assembler::AVX_128bit;
 7032     // Shift lower half and get word result in dst
 7033     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7034 
 7035     // Shift upper half and get word result in vtmp1
 7036     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7037     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7038     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7039 
 7040     // Merge and down convert the two word results to byte in dst
 7041     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7042   %}
 7043   ins_pipe( pipe_slow );
 7044 %}
 7045 
 7046 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7047   predicate(Matcher::vector_length(n) == 32 &&
 7048             n->as_ShiftV()->is_var_shift() &&
 7049             !VM_Version::supports_avx512bw());
 7050   match(Set dst ( LShiftVB src shift));
 7051   match(Set dst ( RShiftVB src shift));
 7052   match(Set dst (URShiftVB src shift));
 7053   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7054   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7055   ins_encode %{
 7056     assert(UseAVX >= 2, "required");
 7057 
 7058     int opcode = this->ideal_Opcode();
 7059     int vlen_enc = Assembler::AVX_128bit;
 7060     // Process lower 128 bits and get result in dst
 7061     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7062     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7063     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7064     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7065     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7066 
 7067     // Process higher 128 bits and get result in vtmp3
 7068     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7069     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7070     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7071     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7072     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7073     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7074     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7075 
 7076     // Merge the two results in dst
 7077     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7078   %}
 7079   ins_pipe( pipe_slow );
 7080 %}
 7081 
 7082 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7083   predicate(Matcher::vector_length(n) <= 32 &&
 7084             n->as_ShiftV()->is_var_shift() &&
 7085             VM_Version::supports_avx512bw());
 7086   match(Set dst ( LShiftVB src shift));
 7087   match(Set dst ( RShiftVB src shift));
 7088   match(Set dst (URShiftVB src shift));
 7089   effect(TEMP dst, TEMP vtmp);
 7090   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7091   ins_encode %{
 7092     assert(UseAVX > 2, "required");
 7093 
 7094     int opcode = this->ideal_Opcode();
 7095     int vlen_enc = vector_length_encoding(this);
 7096     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7097   %}
 7098   ins_pipe( pipe_slow );
 7099 %}
 7100 
 7101 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7102   predicate(Matcher::vector_length(n) == 64 &&
 7103             n->as_ShiftV()->is_var_shift() &&
 7104             VM_Version::supports_avx512bw());
 7105   match(Set dst ( LShiftVB src shift));
 7106   match(Set dst ( RShiftVB src shift));
 7107   match(Set dst (URShiftVB src shift));
 7108   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7109   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7110   ins_encode %{
 7111     assert(UseAVX > 2, "required");
 7112 
 7113     int opcode = this->ideal_Opcode();
 7114     int vlen_enc = Assembler::AVX_256bit;
 7115     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7116     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7117     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7118     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7119     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7120   %}
 7121   ins_pipe( pipe_slow );
 7122 %}
 7123 
 7124 // Short variable shift
 7125 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7126   predicate(Matcher::vector_length(n) <= 8 &&
 7127             n->as_ShiftV()->is_var_shift() &&
 7128             !VM_Version::supports_avx512bw());
 7129   match(Set dst ( LShiftVS src shift));
 7130   match(Set dst ( RShiftVS src shift));
 7131   match(Set dst (URShiftVS src shift));
 7132   effect(TEMP dst, TEMP vtmp);
 7133   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp as TEMP" %}
 7134   ins_encode %{
 7135     assert(UseAVX >= 2, "required");
 7136 
 7137     int opcode = this->ideal_Opcode();
 7138     bool sign = (opcode != Op_URShiftVS);
 7139     int vlen_enc = Assembler::AVX_256bit;
 7140     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7141     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7142     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7143     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7144     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7145     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7146   %}
 7147   ins_pipe( pipe_slow );
 7148 %}
 7149 
 7150 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7151   predicate(Matcher::vector_length(n) == 16 &&
 7152             n->as_ShiftV()->is_var_shift() &&
 7153             !VM_Version::supports_avx512bw());
 7154   match(Set dst ( LShiftVS src shift));
 7155   match(Set dst ( RShiftVS src shift));
 7156   match(Set dst (URShiftVS src shift));
 7157   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7158   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 as TEMP" %}
 7159   ins_encode %{
 7160     assert(UseAVX >= 2, "required");
 7161 
 7162     int opcode = this->ideal_Opcode();
 7163     bool sign = (opcode != Op_URShiftVS);
 7164     int vlen_enc = Assembler::AVX_256bit;
 7165     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7166     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7167     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7168     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7169     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7170 
 7171     // Shift upper half, with result in dst using vtmp1 as TEMP
 7172     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7173     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7174     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7175     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7176     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7177     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7178 
 7179     // Merge lower and upper half result into dst
 7180     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7181     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7182   %}
 7183   ins_pipe( pipe_slow );
 7184 %}
 7185 
 7186 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7187   predicate(n->as_ShiftV()->is_var_shift() &&
 7188             VM_Version::supports_avx512bw());
 7189   match(Set dst ( LShiftVS src shift));
 7190   match(Set dst ( RShiftVS src shift));
 7191   match(Set dst (URShiftVS src shift));
 7192   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7193   ins_encode %{
 7194     assert(UseAVX > 2, "required");
 7195 
 7196     int opcode = this->ideal_Opcode();
 7197     int vlen_enc = vector_length_encoding(this);
 7198     if (!VM_Version::supports_avx512vl()) {
 7199       vlen_enc = Assembler::AVX_512bit;
 7200     }
 7201     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7202   %}
 7203   ins_pipe( pipe_slow );
 7204 %}
 7205 
 7206 // Integer variable shift
 7207 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7208   predicate(n->as_ShiftV()->is_var_shift());
 7209   match(Set dst ( LShiftVI src shift));
 7210   match(Set dst ( RShiftVI src shift));
 7211   match(Set dst (URShiftVI src shift));
 7212   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7213   ins_encode %{
 7214     assert(UseAVX >= 2, "required");
 7215 
 7216     int opcode = this->ideal_Opcode();
 7217     int vlen_enc = vector_length_encoding(this);
 7218     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7219   %}
 7220   ins_pipe( pipe_slow );
 7221 %}
 7222 
 7223 // Long variable shift
 7224 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7225   predicate(n->as_ShiftV()->is_var_shift());
 7226   match(Set dst ( LShiftVL src shift));
 7227   match(Set dst (URShiftVL src shift));
 7228   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7229   ins_encode %{
 7230     assert(UseAVX >= 2, "required");
 7231 
 7232     int opcode = this->ideal_Opcode();
 7233     int vlen_enc = vector_length_encoding(this);
 7234     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7235   %}
 7236   ins_pipe( pipe_slow );
 7237 %}
 7238 
 7239 // Long variable arithmetic right shift
 7240 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7241   predicate(Matcher::vector_length(n) <= 4 &&
 7242             n->as_ShiftV()->is_var_shift() &&
 7243             UseAVX == 2);
 7244   match(Set dst (RShiftVL src shift));
 7245   effect(TEMP dst, TEMP vtmp);
 7246   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7247   ins_encode %{
 7248     int opcode = this->ideal_Opcode();
 7249     int vlen_enc = vector_length_encoding(this);
 7250     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7251                  $vtmp$$XMMRegister);
 7252   %}
 7253   ins_pipe( pipe_slow );
 7254 %}
 7255 
 7256 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7257   predicate(n->as_ShiftV()->is_var_shift() &&
 7258             UseAVX > 2);
 7259   match(Set dst (RShiftVL src shift));
 7260   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7261   ins_encode %{
 7262     int opcode = this->ideal_Opcode();
 7263     int vlen_enc = vector_length_encoding(this);
 7264     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7265   %}
 7266   ins_pipe( pipe_slow );
 7267 %}
 7268 
 7269 // --------------------------------- AND --------------------------------------
 7270 
 7271 instruct vand(vec dst, vec src) %{
 7272   predicate(UseAVX == 0);
 7273   match(Set dst (AndV dst src));
 7274   format %{ "pand    $dst,$src\t! and vectors" %}
 7275   ins_encode %{
 7276     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7277   %}
 7278   ins_pipe( pipe_slow );
 7279 %}
 7280 
 7281 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7282   predicate(UseAVX > 0);
 7283   match(Set dst (AndV src1 src2));
 7284   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7285   ins_encode %{
 7286     int vlen_enc = vector_length_encoding(this);
 7287     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7288   %}
 7289   ins_pipe( pipe_slow );
 7290 %}
 7291 
 7292 instruct vand_mem(vec dst, vec src, memory mem) %{
 7293   predicate((UseAVX > 0) &&
 7294             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7295   match(Set dst (AndV src (LoadVector mem)));
 7296   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7297   ins_encode %{
 7298     int vlen_enc = vector_length_encoding(this);
 7299     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7300   %}
 7301   ins_pipe( pipe_slow );
 7302 %}
 7303 
 7304 // --------------------------------- OR ---------------------------------------
 7305 
 7306 instruct vor(vec dst, vec src) %{
 7307   predicate(UseAVX == 0);
 7308   match(Set dst (OrV dst src));
 7309   format %{ "por     $dst,$src\t! or vectors" %}
 7310   ins_encode %{
 7311     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7312   %}
 7313   ins_pipe( pipe_slow );
 7314 %}
 7315 
 7316 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7317   predicate(UseAVX > 0);
 7318   match(Set dst (OrV src1 src2));
 7319   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7320   ins_encode %{
 7321     int vlen_enc = vector_length_encoding(this);
 7322     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7323   %}
 7324   ins_pipe( pipe_slow );
 7325 %}
 7326 
 7327 instruct vor_mem(vec dst, vec src, memory mem) %{
 7328   predicate((UseAVX > 0) &&
 7329             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7330   match(Set dst (OrV src (LoadVector mem)));
 7331   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7332   ins_encode %{
 7333     int vlen_enc = vector_length_encoding(this);
 7334     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7335   %}
 7336   ins_pipe( pipe_slow );
 7337 %}
 7338 
 7339 // --------------------------------- XOR --------------------------------------
 7340 
 7341 instruct vxor(vec dst, vec src) %{
 7342   predicate(UseAVX == 0);
 7343   match(Set dst (XorV dst src));
 7344   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7345   ins_encode %{
 7346     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7347   %}
 7348   ins_pipe( pipe_slow );
 7349 %}
 7350 
 7351 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7352   predicate(UseAVX > 0);
 7353   match(Set dst (XorV src1 src2));
 7354   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7355   ins_encode %{
 7356     int vlen_enc = vector_length_encoding(this);
 7357     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7358   %}
 7359   ins_pipe( pipe_slow );
 7360 %}
 7361 
 7362 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7363   predicate((UseAVX > 0) &&
 7364             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7365   match(Set dst (XorV src (LoadVector mem)));
 7366   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7367   ins_encode %{
 7368     int vlen_enc = vector_length_encoding(this);
 7369     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7370   %}
 7371   ins_pipe( pipe_slow );
 7372 %}
 7373 
 7374 // --------------------------------- VectorCast --------------------------------------
 7375 
 7376 instruct vcastBtoX(vec dst, vec src) %{
 7377   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7378   match(Set dst (VectorCastB2X src));
 7379   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7380   ins_encode %{
 7381     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7382     int vlen_enc = vector_length_encoding(this);
 7383     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7384   %}
 7385   ins_pipe( pipe_slow );
 7386 %}
 7387 
 7388 instruct vcastBtoD(legVec dst, legVec src) %{
 7389   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7390   match(Set dst (VectorCastB2X src));
 7391   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7392   ins_encode %{
 7393     int vlen_enc = vector_length_encoding(this);
 7394     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7395   %}
 7396   ins_pipe( pipe_slow );
 7397 %}
 7398 
 7399 instruct castStoX(vec dst, vec src) %{
 7400   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7401             Matcher::vector_length(n->in(1)) <= 8 && // src
 7402             Matcher::vector_element_basic_type(n) == T_BYTE);
 7403   match(Set dst (VectorCastS2X src));
 7404   format %{ "vector_cast_s2x $dst,$src" %}
 7405   ins_encode %{
 7406     assert(UseAVX > 0, "required");
 7407 
 7408     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7409     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7410   %}
 7411   ins_pipe( pipe_slow );
 7412 %}
 7413 
 7414 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7415   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7416             Matcher::vector_length(n->in(1)) == 16 && // src
 7417             Matcher::vector_element_basic_type(n) == T_BYTE);
 7418   effect(TEMP dst, TEMP vtmp);
 7419   match(Set dst (VectorCastS2X src));
 7420   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7421   ins_encode %{
 7422     assert(UseAVX > 0, "required");
 7423 
 7424     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7425     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7426     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7427     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7428   %}
 7429   ins_pipe( pipe_slow );
 7430 %}
 7431 
 7432 instruct vcastStoX_evex(vec dst, vec src) %{
 7433   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7434             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7435   match(Set dst (VectorCastS2X src));
 7436   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7437   ins_encode %{
 7438     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7439     int src_vlen_enc = vector_length_encoding(this, $src);
 7440     int vlen_enc = vector_length_encoding(this);
 7441     switch (to_elem_bt) {
 7442       case T_BYTE:
 7443         if (!VM_Version::supports_avx512vl()) {
 7444           vlen_enc = Assembler::AVX_512bit;
 7445         }
 7446         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7447         break;
 7448       case T_INT:
 7449         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7450         break;
 7451       case T_FLOAT:
 7452         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7453         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7454         break;
 7455       case T_LONG:
 7456         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7457         break;
 7458       case T_DOUBLE: {
 7459         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7460         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7461         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7462         break;
 7463       }
 7464       default:
 7465         ShouldNotReachHere();
 7466     }
 7467   %}
 7468   ins_pipe( pipe_slow );
 7469 %}
 7470 
 7471 instruct castItoX(vec dst, vec src) %{
 7472   predicate(UseAVX <= 2 &&
 7473             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7474             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7475   match(Set dst (VectorCastI2X src));
 7476   format %{ "vector_cast_i2x $dst,$src" %}
 7477   ins_encode %{
 7478     assert(UseAVX > 0, "required");
 7479 
 7480     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7481     int vlen_enc = vector_length_encoding(this, $src);
 7482 
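          // Truncate by masking each int to the target width, then pack down:
          // int->short needs one pack, int->byte packs to shorts and then to bytes.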
 7483     if (to_elem_bt == T_BYTE) {
 7484       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7485       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7486       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7487     } else {
 7488       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7489       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7490       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7491     }
 7492   %}
 7493   ins_pipe( pipe_slow );
 7494 %}
 7495 
 7496 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7497   predicate(UseAVX <= 2 &&
 7498             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7499             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7500   match(Set dst (VectorCastI2X src));
 7501   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7502   effect(TEMP dst, TEMP vtmp);
 7503   ins_encode %{
 7504     assert(UseAVX > 0, "required");
 7505 
 7506     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7507     int vlen_enc = vector_length_encoding(this, $src);
 7508 
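          // 256-bit source: mask the ints, fold the upper 128-bit lane into the low
          // half, then pack to the target width within 128 bits.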
 7509     if (to_elem_bt == T_BYTE) {
 7510       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7511       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7512       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7513       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7514     } else {
 7515       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7516       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7517       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7518       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7519     }
 7520   %}
 7521   ins_pipe( pipe_slow );
 7522 %}
 7523 
 7524 instruct vcastItoX_evex(vec dst, vec src) %{
 7525   predicate(UseAVX > 2 ||
 7526             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7527   match(Set dst (VectorCastI2X src));
 7528   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7529   ins_encode %{
 7530     assert(UseAVX > 0, "required");
 7531 
 7532     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7533     int src_vlen_enc = vector_length_encoding(this, $src);
 7534     int dst_vlen_enc = vector_length_encoding(this);
 7535     switch (dst_elem_bt) {
 7536       case T_BYTE:
 7537         if (!VM_Version::supports_avx512vl()) {
 7538           src_vlen_enc = Assembler::AVX_512bit;
 7539         }
 7540         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7541         break;
 7542       case T_SHORT:
 7543         if (!VM_Version::supports_avx512vl()) {
 7544           src_vlen_enc = Assembler::AVX_512bit;
 7545         }
 7546         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7547         break;
 7548       case T_FLOAT:
 7549         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7550         break;
 7551       case T_LONG:
 7552         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7553         break;
 7554       case T_DOUBLE:
 7555         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7556         break;
 7557       default:
 7558         ShouldNotReachHere();
 7559     }
 7560   %}
 7561   ins_pipe( pipe_slow );
 7562 %}
 7563 
 7564 instruct vcastLtoBS(vec dst, vec src) %{
 7565   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7566             UseAVX <= 2);
 7567   match(Set dst (VectorCastL2X src));
 7568   format %{ "vector_cast_l2x  $dst,$src" %}
 7569   ins_encode %{
 7570     assert(UseAVX > 0, "required");
 7571 
 7572     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7573     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7574     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7575                                                       : ExternalAddress(vector_int_to_short_mask());
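          // Collect the low 32 bits of each long into the low 128 bits, then narrow
          // exactly as for an int source.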
 7576     if (vlen <= 16) {
 7577       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7578       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7579       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7580     } else {
 7581       assert(vlen <= 32, "required");
 7582       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7583       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7584       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7585       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7586     }
 7587     if (to_elem_bt == T_BYTE) {
 7588       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7589     }
 7590   %}
 7591   ins_pipe( pipe_slow );
 7592 %}
 7593 
 7594 instruct vcastLtoX_evex(vec dst, vec src) %{
 7595   predicate(UseAVX > 2 ||
 7596             (Matcher::vector_element_basic_type(n) == T_INT ||
 7597              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7598              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7599   match(Set dst (VectorCastL2X src));
 7600   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7601   ins_encode %{
 7602     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7603     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7604     int vlen_enc = vector_length_encoding(this, $src);
 7605     switch (to_elem_bt) {
 7606       case T_BYTE:
 7607         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7608           vlen_enc = Assembler::AVX_512bit;
 7609         }
 7610         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7611         break;
 7612       case T_SHORT:
 7613         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7614           vlen_enc = Assembler::AVX_512bit;
 7615         }
 7616         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7617         break;
 7618       case T_INT:
 7619         if (vlen == 8) {
 7620           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7621             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7622           }
 7623         } else if (vlen == 16) {
 7624           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7625         } else if (vlen == 32) {
 7626           if (UseAVX > 2) {
 7627             if (!VM_Version::supports_avx512vl()) {
 7628               vlen_enc = Assembler::AVX_512bit;
 7629             }
 7630             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7631           } else {
 7632             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7633             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7634           }
 7635         } else { // vlen == 64
 7636           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7637         }
 7638         break;
 7639       case T_FLOAT:
 7640         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7641         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7642         break;
 7643       case T_DOUBLE:
 7644         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7645         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7646         break;
 7648       default: assert(false, "%s", type2name(to_elem_bt));
 7649     }
 7650   %}
 7651   ins_pipe( pipe_slow );
 7652 %}
 7653 
 7654 instruct vcastFtoD_reg(vec dst, vec src) %{
 7655   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7656   match(Set dst (VectorCastF2X src));
 7657   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7658   ins_encode %{
 7659     int vlen_enc = vector_length_encoding(this);
 7660     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7661   %}
 7662   ins_pipe( pipe_slow );
 7663 %}
 7664 
 7666 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7667   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7668             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7669   match(Set dst (VectorCastF2X src));
 7670   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7671   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7672   ins_encode %{
 7673     int vlen_enc = vector_length_encoding(this, $src);
 7674     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7675     // JDK-8292878 removed the need for an explicit scratch register when loading addresses
 7676     // wider than 32 bits for register-indirect addressing, since stub constants live in the
 7677     // code cache and ReservedCodeCacheSize is currently capped at 2G.
 7678     // Targets are free to raise this limit, but a code cache larger than 2G is unreasonable
 7679     // in practice; on the flip side, keeping the cap saves a temporary register allocation,
 7680     // which in the limiting case can prevent spilling in blocks with high register
 7681     // pressure.
 7682     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7683                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7684                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7685   %}
 7686   ins_pipe( pipe_slow );
 7687 %}
 7688 
 7689 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7690   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7691             is_integral_type(Matcher::vector_element_basic_type(n)));
 7692   match(Set dst (VectorCastF2X src));
 7693   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7694   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7695   ins_encode %{
 7696     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7697     if (to_elem_bt == T_LONG) {
 7698       int vlen_enc = vector_length_encoding(this);
 7699       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7700                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7701                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7702     } else {
 7703       int vlen_enc = vector_length_encoding(this, $src);
 7704       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7705                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7706                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7707     }
 7708   %}
 7709   ins_pipe( pipe_slow );
 7710 %}
 7711 
 7712 instruct vcastDtoF_reg(vec dst, vec src) %{
 7713   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7714   match(Set dst (VectorCastD2X src));
 7715   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7716   ins_encode %{
 7717     int vlen_enc = vector_length_encoding(this, $src);
 7718     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7719   %}
 7720   ins_pipe( pipe_slow );
 7721 %}
 7722 
 7723 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7724   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7725             is_integral_type(Matcher::vector_element_basic_type(n)));
 7726   match(Set dst (VectorCastD2X src));
 7727   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7728   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7729   ins_encode %{
 7730     int vlen_enc = vector_length_encoding(this, $src);
 7731     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7732     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7733                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7734                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7735   %}
 7736   ins_pipe( pipe_slow );
 7737 %}
 7738 
 7739 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7740   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7741             is_integral_type(Matcher::vector_element_basic_type(n)));
 7742   match(Set dst (VectorCastD2X src));
 7743   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7744   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7745   ins_encode %{
 7746     int vlen_enc = vector_length_encoding(this, $src);
 7747     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7748     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7749                               ExternalAddress(vector_float_signflip());
 7750     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7751                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7752   %}
 7753   ins_pipe( pipe_slow );
 7754 %}
 7755 
 7756 instruct vucast(vec dst, vec src) %{
 7757   match(Set dst (VectorUCastB2X src));
 7758   match(Set dst (VectorUCastS2X src));
 7759   match(Set dst (VectorUCastI2X src));
 7760   format %{ "vector_ucast $dst,$src\t!" %}
 7761   ins_encode %{
 7762     assert(UseAVX > 0, "required");
 7763 
 7764     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7765     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7766     int vlen_enc = vector_length_encoding(this);
 7767     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7768   %}
 7769   ins_pipe( pipe_slow );
 7770 %}
 7771 
 7772 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7773   predicate(!VM_Version::supports_avx512vl() &&
 7774             Matcher::vector_length_in_bytes(n) < 64 &&
 7775             Matcher::vector_element_basic_type(n) == T_INT);
 7776   match(Set dst (RoundVF src));
 7777   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7778   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7779   ins_encode %{
 7780     int vlen_enc = vector_length_encoding(this);
 7781     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7782     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7783                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7784                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7785   %}
 7786   ins_pipe( pipe_slow );
 7787 %}
 7788 
 7789 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7790   predicate((VM_Version::supports_avx512vl() ||
 7791              Matcher::vector_length_in_bytes(n) == 64) &&
 7792              Matcher::vector_element_basic_type(n) == T_INT);
 7793   match(Set dst (RoundVF src));
 7794   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7795   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7796   ins_encode %{
 7797     int vlen_enc = vector_length_encoding(this);
 7798     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7799     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7800                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7801                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7802   %}
 7803   ins_pipe( pipe_slow );
 7804 %}
 7805 
 7806 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7807   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7808   match(Set dst (RoundVD src));
 7809   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7810   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7811   ins_encode %{
 7812     int vlen_enc = vector_length_encoding(this);
 7813     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7814     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7815                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7816                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7817   %}
 7818   ins_pipe( pipe_slow );
 7819 %}
 7820 
 7821 // --------------------------------- VectorMaskCmp --------------------------------------
 7822 
 7823 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7824   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7825             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7826             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7827             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7828   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7829   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7830   ins_encode %{
 7831     int vlen_enc = vector_length_encoding(this, $src1);
 7832     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7833     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7834       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7835     } else {
 7836       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7837     }
 7838   %}
 7839   ins_pipe( pipe_slow );
 7840 %}
 7841 
 7842 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7843   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7844             n->bottom_type()->isa_vectmask() == nullptr &&
 7845             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7846   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7847   effect(TEMP ktmp);
 7848   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7849   ins_encode %{
 7850     int vlen_enc = Assembler::AVX_512bit;
 7851     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7852     KRegister mask = k0; // The comparison itself is not being masked.
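          // Compare into $ktmp, then materialize the boolean vector with a zero-masked
          // load of the all-ones constant (lanes where the compare failed become zero).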
 7853     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7854       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7855       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7856     } else {
 7857       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7858       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7859     }
 7860   %}
 7861   ins_pipe( pipe_slow );
 7862 %}
 7863 
 7864 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7865   predicate(n->bottom_type()->isa_vectmask() &&
 7866             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7867   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7868   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7869   ins_encode %{
 7870     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7871     int vlen_enc = vector_length_encoding(this, $src1);
 7872     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7873     KRegister mask = k0; // The comparison itself is not being masked.
 7874     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7875       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7876     } else {
 7877       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7878     }
 7879   %}
 7880   ins_pipe( pipe_slow );
 7881 %}
 7882 
 7883 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7884   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7885             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7886             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7887             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7888             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7889             (n->in(2)->get_int() == BoolTest::eq ||
 7890              n->in(2)->get_int() == BoolTest::lt ||
 7891              n->in(2)->get_int() == BoolTest::gt)); // cond
 7892   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7893   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7894   ins_encode %{
 7895     int vlen_enc = vector_length_encoding(this, $src1);
 7896     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7897     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7898     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7899   %}
 7900   ins_pipe( pipe_slow );
 7901 %}
 7902 
 7903 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7904   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7905             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7906             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7907             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7908             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7909             (n->in(2)->get_int() == BoolTest::ne ||
 7910              n->in(2)->get_int() == BoolTest::le ||
 7911              n->in(2)->get_int() == BoolTest::ge)); // cond
 7912   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7913   effect(TEMP dst, TEMP xtmp);
 7914   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7915   ins_encode %{
 7916     int vlen_enc = vector_length_encoding(this, $src1);
 7917     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7918     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7919     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7920   %}
 7921   ins_pipe( pipe_slow );
 7922 %}
 7923 
 7924 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7925   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7926             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7927             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7928             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7929             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7930   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7931   effect(TEMP dst, TEMP xtmp);
 7932   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7933   ins_encode %{
 7934     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7935     int vlen_enc = vector_length_encoding(this, $src1);
 7936     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7937     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7938 
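          // Flip the sign bit of both operands so that the signed compare below
          // produces the unsigned ordering.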
 7939     if (vlen_enc == Assembler::AVX_128bit) {
 7940       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7941     } else {
 7942       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7943     }
 7944     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7945     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7946     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7947   %}
 7948   ins_pipe( pipe_slow );
 7949 %}
 7950 
 7951 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7952   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7953              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7954              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7955   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7956   effect(TEMP ktmp);
 7957   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7958   ins_encode %{
 7959     assert(UseAVX > 2, "required");
 7960 
 7961     int vlen_enc = vector_length_encoding(this, $src1);
 7962     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7963     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7964     KRegister mask = k0; // The comparison itself is not being masked.
 7965     bool merge = false;
 7966     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7967 
 7968     switch (src1_elem_bt) {
 7969       case T_INT: {
 7970         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7971         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7972         break;
 7973       }
 7974       case T_LONG: {
 7975         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7976         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7977         break;
 7978       }
 7979       default: assert(false, "%s", type2name(src1_elem_bt));
 7980     }
 7981   %}
 7982   ins_pipe( pipe_slow );
 7983 %}
 7984 
 7986 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7987   predicate(n->bottom_type()->isa_vectmask() &&
 7988             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7989   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7990   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7991   ins_encode %{
 7992     assert(UseAVX > 2, "required");
 7993     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7994 
 7995     int vlen_enc = vector_length_encoding(this, $src1);
 7996     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7997     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7998     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7999 
 8000     // The comparison itself is not masked (k0); the result lands directly in the destination mask register.
 8001     switch (src1_elem_bt) {
 8002       case T_BYTE: {
 8003         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8004         break;
 8005       }
 8006       case T_SHORT: {
 8007         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8008         break;
 8009       }
 8010       case T_INT: {
 8011         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8012         break;
 8013       }
 8014       case T_LONG: {
 8015         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8016         break;
 8017       }
 8018       default: assert(false, "%s", type2name(src1_elem_bt));
 8019     }
 8020   %}
 8021   ins_pipe( pipe_slow );
 8022 %}
 8023 
 8024 // Extract
 8025 
 8026 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8027   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8028   match(Set dst (ExtractI src idx));
 8029   match(Set dst (ExtractS src idx));
 8030   match(Set dst (ExtractB src idx));
 8031   format %{ "extractI $dst,$src,$idx\t!" %}
 8032   ins_encode %{
 8033     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8034 
 8035     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8036     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8037   %}
 8038   ins_pipe( pipe_slow );
 8039 %}
 8040 
 8041 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8042   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8043             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8044   match(Set dst (ExtractI src idx));
 8045   match(Set dst (ExtractS src idx));
 8046   match(Set dst (ExtractB src idx));
 8047   effect(TEMP vtmp);
 8048   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8049   ins_encode %{
 8050     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8051 
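          // Narrow the problem to a single 128-bit lane first (staged through $vtmp),
          // then extract the element from that lane.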
 8052     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8053     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8054     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8055   %}
 8056   ins_pipe( pipe_slow );
 8057 %}
 8058 
 8059 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8060   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8061   match(Set dst (ExtractL src idx));
 8062   format %{ "extractL $dst,$src,$idx\t!" %}
 8063   ins_encode %{
 8064     assert(UseSSE >= 4, "required");
 8065     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8066 
 8067     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8068   %}
 8069   ins_pipe( pipe_slow );
 8070 %}
 8071 
 8072 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8073   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8074             Matcher::vector_length(n->in(1)) == 8);  // src
 8075   match(Set dst (ExtractL src idx));
 8076   effect(TEMP vtmp);
 8077   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8078   ins_encode %{
 8079     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8080 
 8081     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8082     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8083   %}
 8084   ins_pipe( pipe_slow );
 8085 %}
 8086 
 8087 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8088   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8089   match(Set dst (ExtractF src idx));
 8090   effect(TEMP dst, TEMP vtmp);
 8091   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8092   ins_encode %{
 8093     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8094 
 8095     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8096   %}
 8097   ins_pipe( pipe_slow );
 8098 %}
 8099 
 8100 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8101   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8102             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8103   match(Set dst (ExtractF src idx));
 8104   effect(TEMP vtmp);
 8105   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8106   ins_encode %{
 8107     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8108 
 8109     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8110     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8111   %}
 8112   ins_pipe( pipe_slow );
 8113 %}
 8114 
 8115 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8116   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8117   match(Set dst (ExtractD src idx));
 8118   format %{ "extractD $dst,$src,$idx\t!" %}
 8119   ins_encode %{
 8120     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8121 
 8122     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8123   %}
 8124   ins_pipe( pipe_slow );
 8125 %}
 8126 
 8127 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8128   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8129             Matcher::vector_length(n->in(1)) == 8);  // src
 8130   match(Set dst (ExtractD src idx));
 8131   effect(TEMP vtmp);
 8132   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8133   ins_encode %{
 8134     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8135 
 8136     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8137     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8138   %}
 8139   ins_pipe( pipe_slow );
 8140 %}
 8141 
 8142 // --------------------------------- Vector Blend --------------------------------------
 8143 
 8144 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8145   predicate(UseAVX == 0);
 8146   match(Set dst (VectorBlend (Binary dst src) mask));
 8147   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8148   effect(TEMP tmp);
 8149   ins_encode %{
 8150     assert(UseSSE >= 4, "required");
 8151 
 8152     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8153       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8154     }
 8155     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8156   %}
 8157   ins_pipe( pipe_slow );
 8158 %}
 8159 
 8160 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8161   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8162             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8163             Matcher::vector_length_in_bytes(n) <= 32 &&
 8164             is_integral_type(Matcher::vector_element_basic_type(n)));
 8165   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8166   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8167   ins_encode %{
 8168     int vlen_enc = vector_length_encoding(this);
 8169     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8170   %}
 8171   ins_pipe( pipe_slow );
 8172 %}
 8173 
 8174 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8175   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8176             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8177             Matcher::vector_length_in_bytes(n) <= 32 &&
 8178             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8179   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8180   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8181   ins_encode %{
 8182     int vlen_enc = vector_length_encoding(this);
 8183     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8184   %}
 8185   ins_pipe( pipe_slow );
 8186 %}
 8187 
 8188 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8189   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8190             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8191             Matcher::vector_length_in_bytes(n) <= 32);
 8192   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8193   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8194   effect(TEMP vtmp, TEMP dst);
 8195   ins_encode %{
 8196     int vlen_enc = vector_length_encoding(this);
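          // Bitwise blend: $dst = ($src2 & $mask) | ($src1 & ~$mask); used when E-core
          // optimizations are enabled instead of the variable-blend instructions.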
 8197     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8198     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8199     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8200   %}
 8201   ins_pipe( pipe_slow );
 8202 %}
 8203 
 8204 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8205   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8206             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8207   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8208   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8209   effect(TEMP ktmp);
 8210   ins_encode %{
 8211     int vlen_enc = Assembler::AVX_512bit;
 8212     BasicType elem_bt = Matcher::vector_element_basic_type(this);
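          // Convert the boolean vector into a k-mask by comparing it against all-ones,
          // then perform the blend under that mask.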
 8213     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8214     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8215   %}
 8216   ins_pipe( pipe_slow );
 8217 %}
 8218 
 8220 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8221   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8222             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8223              VM_Version::supports_avx512bw()));
 8224   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8225   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8226   ins_encode %{
 8227     int vlen_enc = vector_length_encoding(this);
 8228     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8229     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8230   %}
 8231   ins_pipe( pipe_slow );
 8232 %}
 8233 
 8234 // --------------------------------- ABS --------------------------------------
 8235 // a = |a|
 8236 instruct vabsB_reg(vec dst, vec src) %{
 8237   match(Set dst (AbsVB  src));
 8238   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8239   ins_encode %{
 8240     uint vlen = Matcher::vector_length(this);
 8241     if (vlen <= 16) {
 8242       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8243     } else {
 8244       int vlen_enc = vector_length_encoding(this);
 8245       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8246     }
 8247   %}
 8248   ins_pipe( pipe_slow );
 8249 %}
 8250 
 8251 instruct vabsS_reg(vec dst, vec src) %{
 8252   match(Set dst (AbsVS  src));
 8253   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8254   ins_encode %{
 8255     uint vlen = Matcher::vector_length(this);
 8256     if (vlen <= 8) {
 8257       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8258     } else {
 8259       int vlen_enc = vector_length_encoding(this);
 8260       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8261     }
 8262   %}
 8263   ins_pipe( pipe_slow );
 8264 %}
 8265 
 8266 instruct vabsI_reg(vec dst, vec src) %{
 8267   match(Set dst (AbsVI  src));
 8268   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8269   ins_encode %{
 8270     uint vlen = Matcher::vector_length(this);
 8271     if (vlen <= 4) {
 8272       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8273     } else {
 8274       int vlen_enc = vector_length_encoding(this);
 8275       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8276     }
 8277   %}
 8278   ins_pipe( pipe_slow );
 8279 %}
 8280 
 8281 instruct vabsL_reg(vec dst, vec src) %{
 8282   match(Set dst (AbsVL  src));
 8283   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8284   ins_encode %{
 8285     assert(UseAVX > 2, "required");
 8286     int vlen_enc = vector_length_encoding(this);
 8287     if (!VM_Version::supports_avx512vl()) {
 8288       vlen_enc = Assembler::AVX_512bit;
 8289     }
 8290     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8291   %}
 8292   ins_pipe( pipe_slow );
 8293 %}
 8294 
 8295 // --------------------------------- ABSNEG --------------------------------------
 8296 
 8297 instruct vabsnegF(vec dst, vec src) %{
 8298   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8299   match(Set dst (AbsVF src));
 8300   match(Set dst (NegVF src));
 8301   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8302   ins_cost(150);
 8303   ins_encode %{
 8304     int opcode = this->ideal_Opcode();
 8305     int vlen = Matcher::vector_length(this);
 8306     if (vlen == 2) {
 8307       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8308     } else {
 8309       assert(vlen == 8 || vlen == 16, "required");
 8310       int vlen_enc = vector_length_encoding(this);
 8311       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8312     }
 8313   %}
 8314   ins_pipe( pipe_slow );
 8315 %}
 8316 
 8317 instruct vabsneg4F(vec dst) %{
 8318   predicate(Matcher::vector_length(n) == 4);
 8319   match(Set dst (AbsVF dst));
 8320   match(Set dst (NegVF dst));
 8321   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8322   ins_cost(150);
 8323   ins_encode %{
 8324     int opcode = this->ideal_Opcode();
 8325     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8326   %}
 8327   ins_pipe( pipe_slow );
 8328 %}
 8329 
 8330 instruct vabsnegD(vec dst, vec src) %{
 8331   match(Set dst (AbsVD  src));
 8332   match(Set dst (NegVD  src));
 8333   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8334   ins_encode %{
 8335     int opcode = this->ideal_Opcode();
 8336     uint vlen = Matcher::vector_length(this);
 8337     if (vlen == 2) {
 8338       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8339     } else {
 8340       int vlen_enc = vector_length_encoding(this);
 8341       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8342     }
 8343   %}
 8344   ins_pipe( pipe_slow );
 8345 %}
 8346 
 8347 //------------------------------------- VectorTest --------------------------------------------
 8348 
 8349 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8350   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8351   match(Set cr (VectorTest src1 src2));
 8352   effect(TEMP vtmp);
 8353   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8354   ins_encode %{
 8355     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8356     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8357     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8358   %}
 8359   ins_pipe( pipe_slow );
 8360 %}
 8361 
 8362 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8363   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8364   match(Set cr (VectorTest src1 src2));
 8365   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8366   ins_encode %{
 8367     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8368     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8369     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8370   %}
 8371   ins_pipe( pipe_slow );
 8372 %}
 8373 
 8374 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8375   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8376              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8377             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8378   match(Set cr (VectorTest src1 src2));
 8379   effect(TEMP tmp);
 8380   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8381   ins_encode %{
 8382     uint masklen = Matcher::vector_length(this, $src1);
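          // Move the mask bits to a GPR, keep only the live lanes, and compare against
          // all-ones: ZF is set only when every lane is true.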
 8383     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8384     __ andl($tmp$$Register, (1 << masklen) - 1);
 8385     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8386   %}
 8387   ins_pipe( pipe_slow );
 8388 %}
 8389 
 8390 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8391   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8392              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8393             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8394   match(Set cr (VectorTest src1 src2));
 8395   effect(TEMP tmp);
 8396   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8397   ins_encode %{
 8398     uint masklen = Matcher::vector_length(this, $src1);
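          // The AND against the live-lane mask leaves ZF clear iff any lane is true.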
 8399     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8400     __ andl($tmp$$Register, (1 << masklen) - 1);
 8401   %}
 8402   ins_pipe( pipe_slow );
 8403 %}
 8404 
 8405 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8406   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8407             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8408   match(Set cr (VectorTest src1 src2));
 8409   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8410   ins_encode %{
 8411     uint masklen = Matcher::vector_length(this, $src1);
 8412     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8413   %}
 8414   ins_pipe( pipe_slow );
 8415 %}
 8416 
 8417 //------------------------------------- LoadMask --------------------------------------------
 8418 
 8419 instruct loadMask(legVec dst, legVec src) %{
 8420   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8421   match(Set dst (VectorLoadMask src));
 8422   effect(TEMP dst);
 8423   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8424   ins_encode %{
 8425     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8426     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8427     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8428   %}
 8429   ins_pipe( pipe_slow );
 8430 %}
 8431 
 8432 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8433   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8434   match(Set dst (VectorLoadMask src));
 8435   effect(TEMP xtmp);
 8436   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8437   ins_encode %{
 8438     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8439                         true, Assembler::AVX_512bit);
 8440   %}
 8441   ins_pipe( pipe_slow );
 8442 %}
 8443 
 8444 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8445   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8446   match(Set dst (VectorLoadMask src));
 8447   effect(TEMP xtmp);
 8448   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8449   ins_encode %{
 8450     int vlen_enc = vector_length_encoding(in(1));
 8451     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8452                         false, vlen_enc);
 8453   %}
 8454   ins_pipe( pipe_slow );
 8455 %}
 8456 
 8457 //------------------------------------- StoreMask --------------------------------------------
 8458 
 8459 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8460   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8461   match(Set dst (VectorStoreMask src size));
 8462   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8463   ins_encode %{
 8464     int vlen = Matcher::vector_length(this);
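          // Mask lanes are 0 or -1; the byte-wise absolute value turns them into the
          // 0/1 values that VectorStoreMask must produce.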
 8465     if (vlen <= 16 && UseAVX <= 2) {
 8466       assert(UseSSE >= 3, "required");
 8467       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8468     } else {
 8469       assert(UseAVX > 0, "required");
 8470       int src_vlen_enc = vector_length_encoding(this, $src);
 8471       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8472     }
 8473   %}
 8474   ins_pipe( pipe_slow );
 8475 %}
 8476 
 8477 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8478   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8479   match(Set dst (VectorStoreMask src size));
 8480   effect(TEMP_DEF dst, TEMP xtmp);
 8481   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8482   ins_encode %{
 8483     int vlen_enc = Assembler::AVX_128bit;
 8484     int vlen = Matcher::vector_length(this);
 8485     if (vlen <= 8) {
 8486       assert(UseSSE >= 3, "required");
 8487       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8488       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8489       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8490     } else {
 8491       assert(UseAVX > 0, "required");
 8492       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8493       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8494       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8495     }
 8496   %}
 8497   ins_pipe( pipe_slow );
 8498 %}
 8499 
 8500 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8501   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8502   match(Set dst (VectorStoreMask src size));
 8503   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8504   effect(TEMP_DEF dst, TEMP xtmp);
 8505   ins_encode %{
 8506     int vlen_enc = Assembler::AVX_128bit;
 8507     int vlen = Matcher::vector_length(this);
 8508     if (vlen <= 4) {
 8509       assert(UseSSE >= 3, "required");
 8510       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8511       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8512       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8513       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8514     } else {
 8515       assert(UseAVX > 0, "required");
 8516       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8517       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8518       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8519       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8520       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8521     }
 8522   %}
 8523   ins_pipe( pipe_slow );
 8524 %}
 8525 
 8526 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8527   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8528   match(Set dst (VectorStoreMask src size));
 8529   effect(TEMP_DEF dst, TEMP xtmp);
 8530   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8531   ins_encode %{
 8532     assert(UseSSE >= 3, "required");
 8533     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8534     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8535     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8536     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8537     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8538   %}
 8539   ins_pipe( pipe_slow );
 8540 %}
 8541 
 8542 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8543   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8544   match(Set dst (VectorStoreMask src size));
 8545   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8546   effect(TEMP_DEF dst, TEMP vtmp);
 8547   ins_encode %{
 8548     int vlen_enc = Assembler::AVX_128bit;
 8549     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8550     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8551     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8552     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8553     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8554     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8555     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8556   %}
 8557   ins_pipe( pipe_slow );
 8558 %}
 8559 
 8560 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8561   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8562   match(Set dst (VectorStoreMask src size));
 8563   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8564   ins_encode %{
 8565     int src_vlen_enc = vector_length_encoding(this, $src);
 8566     int dst_vlen_enc = vector_length_encoding(this);
 8567     if (!VM_Version::supports_avx512vl()) {
 8568       src_vlen_enc = Assembler::AVX_512bit;
 8569     }
 8570     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8571     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8572   %}
 8573   ins_pipe( pipe_slow );
 8574 %}
 8575 
 8576 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8577   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8578   match(Set dst (VectorStoreMask src size));
 8579   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8580   ins_encode %{
 8581     int src_vlen_enc = vector_length_encoding(this, $src);
 8582     int dst_vlen_enc = vector_length_encoding(this);
 8583     if (!VM_Version::supports_avx512vl()) {
 8584       src_vlen_enc = Assembler::AVX_512bit;
 8585     }
 8586     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8587     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8588   %}
 8589   ins_pipe( pipe_slow );
 8590 %}
 8591 
 8592 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8593   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8594   match(Set dst (VectorStoreMask mask size));
 8595   effect(TEMP_DEF dst);
 8596   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8597   ins_encode %{
 8598     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
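          // Zero-masked load of the per-int mask-bit constant selected by $mask, then
          // narrow the ints down to bytes.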
 8599     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8600                  false, Assembler::AVX_512bit, noreg);
 8601     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8602   %}
 8603   ins_pipe( pipe_slow );
 8604 %}
 8605 
 8606 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8607   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8608   match(Set dst (VectorStoreMask mask size));
 8609   effect(TEMP_DEF dst);
 8610   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8611   ins_encode %{
 8612     int dst_vlen_enc = vector_length_encoding(this);
 8613     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8614     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8615   %}
 8616   ins_pipe( pipe_slow );
 8617 %}
 8618 
 8619 instruct vmaskcast_evex(kReg dst) %{
 8620   match(Set dst (VectorMaskCast dst));
 8621   ins_cost(0);
 8622   format %{ "vector_mask_cast $dst" %}
 8623   ins_encode %{
 8624     // empty
 8625   %}
 8626   ins_pipe(empty);
 8627 %}
 8628 
 8629 instruct vmaskcast(vec dst) %{
 8630   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8631   match(Set dst (VectorMaskCast dst));
 8632   ins_cost(0);
 8633   format %{ "vector_mask_cast $dst" %}
 8634   ins_encode %{
 8635     // empty
 8636   %}
 8637   ins_pipe(empty);
 8638 %}
 8639 
 8640 instruct vmaskcast_avx(vec dst, vec src) %{
 8641   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8642   match(Set dst (VectorMaskCast src));
 8643   format %{ "vector_mask_cast $dst, $src" %}
 8644   ins_encode %{
 8645     int vlen = Matcher::vector_length(this);
 8646     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8647     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8648     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8649   %}
 8650   ins_pipe(pipe_slow);
 8651 %}
 8652 
 8653 //-------------------------------- Load Iota Indices ----------------------------------
 8654 
 8655 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8656   match(Set dst (VectorLoadConst src));
 8657   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8658   ins_encode %{
 8659      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8660      BasicType bt = Matcher::vector_element_basic_type(this);
 8661      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8662   %}
 8663   ins_pipe( pipe_slow );
 8664 %}
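
      // Note: load_iota_indices materializes the per-element constant {0, 1, 2, ...}
      // for this vector's element type; PopulateIndex below is formed as
      // broadcast($src1) + iota, which is why a step ($src2) of 1 is asserted.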
 8665 
 8666 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8667   match(Set dst (PopulateIndex src1 src2));
 8668   effect(TEMP dst, TEMP vtmp);
 8669   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8670   ins_encode %{
 8671      assert($src2$$constant == 1, "required");
 8672      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8673      int vlen_enc = vector_length_encoding(this);
 8674      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8675      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8676      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8677      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8678   %}
 8679   ins_pipe( pipe_slow );
 8680 %}
 8681 
 8682 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8683   match(Set dst (PopulateIndex src1 src2));
 8684   effect(TEMP dst, TEMP vtmp);
 8685   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8686   ins_encode %{
 8687      assert($src2$$constant == 1, "required");
 8688      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8689      int vlen_enc = vector_length_encoding(this);
 8690      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8691      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8692      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8693      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8694   %}
 8695   ins_pipe( pipe_slow );
 8696 %}
 8697 
 8698 //-------------------------------- Rearrange ----------------------------------
 8699 
 8700 // LoadShuffle/Rearrange for Byte
 8701 instruct rearrangeB(vec dst, vec shuffle) %{
 8702   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8703             Matcher::vector_length(n) < 32);
 8704   match(Set dst (VectorRearrange dst shuffle));
 8705   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8706   ins_encode %{
 8707     assert(UseSSE >= 4, "required");
 8708     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8709   %}
 8710   ins_pipe( pipe_slow );
 8711 %}
 8712 
 8713 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8714   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8715             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8716   match(Set dst (VectorRearrange src shuffle));
 8717   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8718   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8719   ins_encode %{
 8720     assert(UseAVX >= 2, "required");
 8721     // Swap src into vtmp1
 8722     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8723     // Shuffle swapped src to get entries from other 128 bit lane
 8724     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8725     // Shuffle original src to get entries from self 128 bit lane
 8726     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8727     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8728     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8729     // Perform the blend
 8730     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8731   %}
 8732   ins_pipe( pipe_slow );
 8733 %}
 8734 
 8735 
 8736 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8737   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8738             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8739   match(Set dst (VectorRearrange src shuffle));
 8740   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8741   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8742   ins_encode %{
 8743     int vlen_enc = vector_length_encoding(this);
 8744     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8745                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8746                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8747   %}
 8748   ins_pipe( pipe_slow );
 8749 %}
 8750 
 8751 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8752   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8753             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8754   match(Set dst (VectorRearrange src shuffle));
 8755   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8756   ins_encode %{
 8757     int vlen_enc = vector_length_encoding(this);
 8758     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8759   %}
 8760   ins_pipe( pipe_slow );
 8761 %}
 8762 
 8763 // LoadShuffle/Rearrange for Short
 8764 
 8765 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8766   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8767             !VM_Version::supports_avx512bw());
 8768   match(Set dst (VectorLoadShuffle src));
 8769   effect(TEMP dst, TEMP vtmp);
 8770   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8771   ins_encode %{
 8772     // Create a byte shuffle mask from the short shuffle mask, since only a
 8773     // byte shuffle instruction is available on these platforms.
 8774     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8775     if (UseAVX == 0) {
 8776       assert(vlen_in_bytes <= 16, "required");
 8777       // Multiply each shuffle by two to get byte index
 8778       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8779       __ psllw($vtmp$$XMMRegister, 1);
 8780 
 8781       // Duplicate to create 2 copies of byte index
 8782       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8783       __ psllw($dst$$XMMRegister, 8);
 8784       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8785 
 8786       // Add one to get alternate byte index
 8787       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8788       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8789     } else {
 8790       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8791       int vlen_enc = vector_length_encoding(this);
 8792       // Multiply each shuffle by two to get byte index
 8793       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8794 
 8795       // Duplicate to create 2 copies of byte index
 8796       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8797       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8798 
 8799       // Add one to get alternate byte index
 8800       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8801     }
 8802   %}
 8803   ins_pipe( pipe_slow );
 8804 %}
 8805 
 8806 instruct rearrangeS(vec dst, vec shuffle) %{
 8807   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8808             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8809   match(Set dst (VectorRearrange dst shuffle));
 8810   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8811   ins_encode %{
 8812     assert(UseSSE >= 4, "required");
 8813     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8814   %}
 8815   ins_pipe( pipe_slow );
 8816 %}
 8817 
 8818 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8819   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8820             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8821   match(Set dst (VectorRearrange src shuffle));
 8822   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8823   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8824   ins_encode %{
 8825     assert(UseAVX >= 2, "required");
 8826     // Swap src into vtmp1
 8827     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8828     // Shuffle swapped src to get entries from other 128 bit lane
 8829     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8830     // Shuffle original src to get entries from self 128 bit lane
 8831     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8832     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8833     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8834     // Perform the blend
 8835     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8836   %}
 8837   ins_pipe( pipe_slow );
 8838 %}
 8839 
 8840 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8841   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8842             VM_Version::supports_avx512bw());
 8843   match(Set dst (VectorRearrange src shuffle));
 8844   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8845   ins_encode %{
 8846     int vlen_enc = vector_length_encoding(this);
 8847     if (!VM_Version::supports_avx512vl()) {
 8848       vlen_enc = Assembler::AVX_512bit;
 8849     }
 8850     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8851   %}
 8852   ins_pipe( pipe_slow );
 8853 %}
 8854 
 8855 // LoadShuffle/Rearrange for Integer and Float
 8856 
 8857 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8858   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8859             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8860   match(Set dst (VectorLoadShuffle src));
 8861   effect(TEMP dst, TEMP vtmp);
 8862   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8863   ins_encode %{
 8864     assert(UseSSE >= 4, "required");
 8865 
 8866     // Create a byte shuffle mask from the int shuffle mask, since only a
 8867     // byte shuffle instruction is available on these platforms.
 8868 
 8869     // Duplicate and multiply each shuffle by 4
 8870     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8871     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8872     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8873     __ psllw($vtmp$$XMMRegister, 2);
 8874 
 8875     // Duplicate again to create 4 copies of byte index
 8876     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8877     __ psllw($dst$$XMMRegister, 8);
 8878     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8879 
 8880     // Add 3,2,1,0 to get alternate byte index
 8881     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8882     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8883   %}
 8884   ins_pipe( pipe_slow );
 8885 %}
 8886 
 8887 instruct rearrangeI(vec dst, vec shuffle) %{
 8888   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8889             UseAVX == 0);
 8890   match(Set dst (VectorRearrange dst shuffle));
 8891   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8892   ins_encode %{
 8893     assert(UseSSE >= 4, "required");
 8894     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8895   %}
 8896   ins_pipe( pipe_slow );
 8897 %}
 8898 
 8899 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8900   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8901             UseAVX > 0);
 8902   match(Set dst (VectorRearrange src shuffle));
 8903   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8904   ins_encode %{
 8905     int vlen_enc = vector_length_encoding(this);
 8906     BasicType bt = Matcher::vector_element_basic_type(this);
 8907     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8908   %}
 8909   ins_pipe( pipe_slow );
 8910 %}
 8911 
 8912 // LoadShuffle/Rearrange for Long and Double
 8913 
 8914 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8915   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8916             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8917   match(Set dst (VectorLoadShuffle src));
 8918   effect(TEMP dst, TEMP vtmp);
 8919   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8920   ins_encode %{
 8921     assert(UseAVX >= 2, "required");
 8922 
 8923     int vlen_enc = vector_length_encoding(this);
 8924     // Create a double word shuffle mask from the long shuffle mask, since only a
 8925     // double word shuffle instruction is available on these platforms.
 8926 
 8927     // Multiply each shuffle by two to get double word index
 8928     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8929 
 8930     // Duplicate each double word shuffle
 8931     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8932     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8933 
 8934     // Add one to get alternate double word index
 8935     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8936   %}
 8937   ins_pipe( pipe_slow );
 8938 %}
 8939 
 8940 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8941   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8942             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8943   match(Set dst (VectorRearrange src shuffle));
 8944   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8945   ins_encode %{
 8946     assert(UseAVX >= 2, "required");
 8947 
 8948     int vlen_enc = vector_length_encoding(this);
 8949     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8950   %}
 8951   ins_pipe( pipe_slow );
 8952 %}
 8953 
 8954 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8955   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8956             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8957   match(Set dst (VectorRearrange src shuffle));
 8958   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8959   ins_encode %{
 8960     assert(UseAVX > 2, "required");
 8961 
 8962     int vlen_enc = vector_length_encoding(this);
 8963     if (vlen_enc == Assembler::AVX_128bit) {
 8964       vlen_enc = Assembler::AVX_256bit;
 8965     }
 8966     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8967   %}
 8968   ins_pipe( pipe_slow );
 8969 %}
 8970 
 8971 // --------------------------------- FMA --------------------------------------
 8972 // a * b + c
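      // Note: these rules assume UseFMA and emit a true fused multiply-add, i.e.
      // per lane c = a * b + c with a single rounding step, for both the register
      // and the memory (LoadVector) operand forms.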
 8973 
 8974 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8975   match(Set c (FmaVF  c (Binary a b)));
 8976   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8977   ins_cost(150);
 8978   ins_encode %{
 8979     assert(UseFMA, "not enabled");
 8980     int vlen_enc = vector_length_encoding(this);
 8981     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8982   %}
 8983   ins_pipe( pipe_slow );
 8984 %}
 8985 
 8986 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8987   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8988   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8989   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8990   ins_cost(150);
 8991   ins_encode %{
 8992     assert(UseFMA, "not enabled");
 8993     int vlen_enc = vector_length_encoding(this);
 8994     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8995   %}
 8996   ins_pipe( pipe_slow );
 8997 %}
 8998 
 8999 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9000   match(Set c (FmaVD  c (Binary a b)));
 9001   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9002   ins_cost(150);
 9003   ins_encode %{
 9004     assert(UseFMA, "not enabled");
 9005     int vlen_enc = vector_length_encoding(this);
 9006     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9007   %}
 9008   ins_pipe( pipe_slow );
 9009 %}
 9010 
 9011 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9012   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9013   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9014   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9015   ins_cost(150);
 9016   ins_encode %{
 9017     assert(UseFMA, "not enabled");
 9018     int vlen_enc = vector_length_encoding(this);
 9019     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9020   %}
 9021   ins_pipe( pipe_slow );
 9022 %}
 9023 
 9024 // --------------------------------- Vector Multiply Add --------------------------------------
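      // Note: MulAddVS2VI maps to (v)pmaddwd, which multiplies corresponding signed
      // 16-bit elements and horizontally adds each adjacent pair of products into a
      // 32-bit result lane.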
 9025 
 9026 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9027   predicate(UseAVX == 0);
 9028   match(Set dst (MulAddVS2VI dst src1));
 9029   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9030   ins_encode %{
 9031     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9032   %}
 9033   ins_pipe( pipe_slow );
 9034 %}
 9035 
 9036 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9037   predicate(UseAVX > 0);
 9038   match(Set dst (MulAddVS2VI src1 src2));
 9039   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9040   ins_encode %{
 9041     int vlen_enc = vector_length_encoding(this);
 9042     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9043   %}
 9044   ins_pipe( pipe_slow );
 9045 %}
 9046 
 9047 // --------------------------------- Vector Multiply Add Add ----------------------------------
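      // Note: with AVX512-VNNI the pmaddwd-plus-add pattern can be fused into a single
      // evpdpwssd, which accumulates the pairwise 16x16->32 bit products directly
      // into $dst.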
 9048 
 9049 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9050   predicate(VM_Version::supports_avx512_vnni());
 9051   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9052   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9053   ins_encode %{
 9054     assert(UseAVX > 2, "required");
 9055     int vlen_enc = vector_length_encoding(this);
 9056     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9057   %}
 9058   ins_pipe( pipe_slow );
 9059   ins_cost(10);
 9060 %}
 9061 
 9062 // --------------------------------- PopCount --------------------------------------
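      // Note (rough sketch of the split below): the _evex rules rely on the AVX512
      // vector popcount instructions when is_vector_popcount_predicate holds, while
      // vpopcount_avx_reg falls back to a multi-instruction emulation that needs the
      // xtmp registers and rtmp.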
 9063 
 9064 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9065   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9066   match(Set dst (PopCountVI src));
 9067   match(Set dst (PopCountVL src));
 9068   format %{ "vector_popcount_integral $dst, $src" %}
 9069   ins_encode %{
 9070     int opcode = this->ideal_Opcode();
 9071     int vlen_enc = vector_length_encoding(this, $src);
 9072     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9073     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9074   %}
 9075   ins_pipe( pipe_slow );
 9076 %}
 9077 
 9078 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9079   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9080   match(Set dst (PopCountVI src mask));
 9081   match(Set dst (PopCountVL src mask));
 9082   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9083   ins_encode %{
 9084     int vlen_enc = vector_length_encoding(this, $src);
 9085     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9086     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9087     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9088   %}
 9089   ins_pipe( pipe_slow );
 9090 %}
 9091 
 9092 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9093   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9094   match(Set dst (PopCountVI src));
 9095   match(Set dst (PopCountVL src));
 9096   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9097   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9098   ins_encode %{
 9099     int opcode = this->ideal_Opcode();
 9100     int vlen_enc = vector_length_encoding(this, $src);
 9101     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9102     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9103                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9104   %}
 9105   ins_pipe( pipe_slow );
 9106 %}
 9107 
 9108 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9109 
 9110 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9111   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9112                                               Matcher::vector_length_in_bytes(n->in(1))));
 9113   match(Set dst (CountTrailingZerosV src));
 9114   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9115   ins_cost(400);
 9116   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9117   ins_encode %{
 9118     int vlen_enc = vector_length_encoding(this, $src);
 9119     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9120     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9121                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9122   %}
 9123   ins_pipe( pipe_slow );
 9124 %}
 9125 
 9126 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9127   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9128             VM_Version::supports_avx512cd() &&
 9129             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9130   match(Set dst (CountTrailingZerosV src));
 9131   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9132   ins_cost(400);
 9133   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9134   ins_encode %{
 9135     int vlen_enc = vector_length_encoding(this, $src);
 9136     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9137     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9138                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9139   %}
 9140   ins_pipe( pipe_slow );
 9141 %}
 9142 
 9143 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9144   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9145   match(Set dst (CountTrailingZerosV src));
 9146   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9147   ins_cost(400);
 9148   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9149   ins_encode %{
 9150     int vlen_enc = vector_length_encoding(this, $src);
 9151     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9152     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9153                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9154                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9155   %}
 9156   ins_pipe( pipe_slow );
 9157 %}
 9158 
 9159 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9160   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9161   match(Set dst (CountTrailingZerosV src));
 9162   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9163   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9164   ins_encode %{
 9165     int vlen_enc = vector_length_encoding(this, $src);
 9166     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9167     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9168                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9169   %}
 9170   ins_pipe( pipe_slow );
 9171 %}
 9172 
 9173 
 9174 // --------------------------------- Bitwise Ternary Logic ----------------------------------
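      // Note: the 8-bit $func immediate of vpternlogd is a 3-input truth table over
      // (dst, src2, src3); for example func == 0x96 computes dst ^ src2 ^ src3 in a
      // single instruction.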
 9175 
 9176 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9177   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9178   effect(TEMP dst);
 9179   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9180   ins_encode %{
 9181     int vector_len = vector_length_encoding(this);
 9182     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9183   %}
 9184   ins_pipe( pipe_slow );
 9185 %}
 9186 
 9187 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9188   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9189   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9190   effect(TEMP dst);
 9191   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9192   ins_encode %{
 9193     int vector_len = vector_length_encoding(this);
 9194     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9195   %}
 9196   ins_pipe( pipe_slow );
 9197 %}
 9198 
 9199 // --------------------------------- Rotation Operations ----------------------------------
 9200 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9201   match(Set dst (RotateLeftV src shift));
 9202   match(Set dst (RotateRightV src shift));
 9203   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9204   ins_encode %{
 9205     int opcode      = this->ideal_Opcode();
 9206     int vector_len  = vector_length_encoding(this);
 9207     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9208     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9209   %}
 9210   ins_pipe( pipe_slow );
 9211 %}
 9212 
 9213 instruct vprorate(vec dst, vec src, vec shift) %{
 9214   match(Set dst (RotateLeftV src shift));
 9215   match(Set dst (RotateRightV src shift));
 9216   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9217   ins_encode %{
 9218     int opcode      = this->ideal_Opcode();
 9219     int vector_len  = vector_length_encoding(this);
 9220     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9221     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9222   %}
 9223   ins_pipe( pipe_slow );
 9224 %}
 9225 
 9226 // ---------------------------------- Masked Operations ------------------------------------
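      // Note: the *_avx_non_subword rules take the mask in a vector register and use
      // vmaskmov-style loads/stores, while the *_evex rules take an opmask (kReg) and
      // use EVEX masked moves.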
 9227 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9228   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9229   match(Set dst (LoadVectorMasked mem mask));
 9230   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9231   ins_encode %{
 9232     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9233     int vlen_enc = vector_length_encoding(this);
 9234     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9235   %}
 9236   ins_pipe( pipe_slow );
 9237 %}
 9238 
 9239 
 9240 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9241   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9242   match(Set dst (LoadVectorMasked mem mask));
 9243   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9244   ins_encode %{
 9245     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9246     int vector_len = vector_length_encoding(this);
 9247     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9248   %}
 9249   ins_pipe( pipe_slow );
 9250 %}
 9251 
 9252 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9253   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9254   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9255   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9256   ins_encode %{
 9257     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9258     int vlen_enc = vector_length_encoding(src_node);
 9259     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9260     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9261   %}
 9262   ins_pipe( pipe_slow );
 9263 %}
 9264 
 9265 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9266   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9267   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9268   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9269   ins_encode %{
 9270     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9271     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9272     int vlen_enc = vector_length_encoding(src_node);
 9273     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9274   %}
 9275   ins_pipe( pipe_slow );
 9276 %}
 9277 
 9278 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9279   match(Set addr (VerifyVectorAlignment addr mask));
 9280   effect(KILL cr);
 9281   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9282   ins_encode %{
 9283     Label Lskip;
 9284     // check if masked bits of addr are zero
 9285     __ testq($addr$$Register, $mask$$constant);
 9286     __ jccb(Assembler::equal, Lskip);
 9287     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9288     __ bind(Lskip);
 9289   %}
 9290   ins_pipe(pipe_slow);
 9291 %}
 9292 
 9293 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9294   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9295   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9296   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9297   ins_encode %{
 9298     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9299     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9300 
 9301     Label DONE;
 9302     int vlen_enc = vector_length_encoding(this, $src1);
 9303     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9304 
 9305     __ knotql($ktmp2$$KRegister, $mask$$KRegister);  // ktmp2 = lanes outside the mask
 9306     __ mov64($dst$$Register, -1L);  // assume every masked lane matches
 9307     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9308     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);  // CF is set iff every masked lane compared equal
 9309     __ jccb(Assembler::carrySet, DONE);  // all equal -> return -1
 9310     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9311     __ notq($dst$$Register);  // set bits now mark mismatching lanes
 9312     __ tzcntq($dst$$Register, $dst$$Register);  // index of the first mismatch
 9313     __ bind(DONE);
 9314   %}
 9315   ins_pipe( pipe_slow );
 9316 %}
 9317 
 9318 
 9319 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9320   match(Set dst (VectorMaskGen len));
 9321   effect(TEMP temp, KILL cr);
 9322   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9323   ins_encode %{
 9324     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9325   %}
 9326   ins_pipe( pipe_slow );
 9327 %}
 9328 
 9329 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9330   match(Set dst (VectorMaskGen len));
 9331   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9332   effect(TEMP temp);
 9333   ins_encode %{
 9334     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));  // all-ones >> (64 - len) == low $len bits set
 9335     __ kmovql($dst$$KRegister, $temp$$Register);
 9336   %}
 9337   ins_pipe( pipe_slow );
 9338 %}
 9339 
 9340 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9341   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9342   match(Set dst (VectorMaskToLong mask));
 9343   effect(TEMP dst, KILL cr);
 9344   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9345   ins_encode %{
 9346     int opcode = this->ideal_Opcode();
 9347     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9348     int mask_len = Matcher::vector_length(this, $mask);
 9349     int mask_size = mask_len * type2aelembytes(mbt);
 9350     int vlen_enc = vector_length_encoding(this, $mask);
 9351     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9352                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9353   %}
 9354   ins_pipe( pipe_slow );
 9355 %}
 9356 
 9357 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9358   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9359   match(Set dst (VectorMaskToLong mask));
 9360   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9361   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9362   ins_encode %{
 9363     int opcode = this->ideal_Opcode();
 9364     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9365     int mask_len = Matcher::vector_length(this, $mask);
 9366     int vlen_enc = vector_length_encoding(this, $mask);
 9367     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9368                              $dst$$Register, mask_len, mbt, vlen_enc);
 9369   %}
 9370   ins_pipe( pipe_slow );
 9371 %}
 9372 
 9373 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9374   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9375   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9376   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9377   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9378   ins_encode %{
 9379     int opcode = this->ideal_Opcode();
 9380     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9381     int mask_len = Matcher::vector_length(this, $mask);
 9382     int vlen_enc = vector_length_encoding(this, $mask);
 9383     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9384                              $dst$$Register, mask_len, mbt, vlen_enc);
 9385   %}
 9386   ins_pipe( pipe_slow );
 9387 %}
 9388 
 9389 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9390   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9391   match(Set dst (VectorMaskTrueCount mask));
 9392   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9393   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9394   ins_encode %{
 9395     int opcode = this->ideal_Opcode();
 9396     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9397     int mask_len = Matcher::vector_length(this, $mask);
 9398     int mask_size = mask_len * type2aelembytes(mbt);
 9399     int vlen_enc = vector_length_encoding(this, $mask);
 9400     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9401                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9402   %}
 9403   ins_pipe( pipe_slow );
 9404 %}
 9405 
 9406 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9407   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9408   match(Set dst (VectorMaskTrueCount mask));
 9409   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9410   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9411   ins_encode %{
 9412     int opcode = this->ideal_Opcode();
 9413     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9414     int mask_len = Matcher::vector_length(this, $mask);
 9415     int vlen_enc = vector_length_encoding(this, $mask);
 9416     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9417                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9418   %}
 9419   ins_pipe( pipe_slow );
 9420 %}
 9421 
 9422 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9423   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9424   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9425   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9426   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9427   ins_encode %{
 9428     int opcode = this->ideal_Opcode();
 9429     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9430     int mask_len = Matcher::vector_length(this, $mask);
 9431     int vlen_enc = vector_length_encoding(this, $mask);
 9432     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9433                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9434   %}
 9435   ins_pipe( pipe_slow );
 9436 %}
 9437 
 9438 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9439   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9440   match(Set dst (VectorMaskFirstTrue mask));
 9441   match(Set dst (VectorMaskLastTrue mask));
 9442   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9443   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9444   ins_encode %{
 9445     int opcode = this->ideal_Opcode();
 9446     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9447     int mask_len = Matcher::vector_length(this, $mask);
 9448     int mask_size = mask_len * type2aelembytes(mbt);
 9449     int vlen_enc = vector_length_encoding(this, $mask);
 9450     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9451                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9452   %}
 9453   ins_pipe( pipe_slow );
 9454 %}
 9455 
 9456 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9457   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9458   match(Set dst (VectorMaskFirstTrue mask));
 9459   match(Set dst (VectorMaskLastTrue mask));
 9460   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9461   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9462   ins_encode %{
 9463     int opcode = this->ideal_Opcode();
 9464     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9465     int mask_len = Matcher::vector_length(this, $mask);
 9466     int vlen_enc = vector_length_encoding(this, $mask);
 9467     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9468                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9469   %}
 9470   ins_pipe( pipe_slow );
 9471 %}
 9472 
 9473 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9474   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9475   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9476   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9477   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9478   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9479   ins_encode %{
 9480     int opcode = this->ideal_Opcode();
 9481     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9482     int mask_len = Matcher::vector_length(this, $mask);
 9483     int vlen_enc = vector_length_encoding(this, $mask);
 9484     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9485                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9486   %}
 9487   ins_pipe( pipe_slow );
 9488 %}
 9489 
 9490 // --------------------------------- Compress/Expand Operations ---------------------------
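      // Note: CompressV packs the lanes selected by the mask into consecutive low lanes
      // of $dst and ExpandV performs the inverse scatter; with AVX512VL (or 512-bit
      // vectors) this maps to the compress/expand instructions, otherwise the AVX2
      // helper emulates it.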
 9491 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9492   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9493   match(Set dst (CompressV src mask));
 9494   match(Set dst (ExpandV src mask));
 9495   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9496   format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9497   ins_encode %{
 9498     int opcode = this->ideal_Opcode();
 9499     int vlen_enc = vector_length_encoding(this);
 9500     BasicType bt  = Matcher::vector_element_basic_type(this);
 9501     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9502                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9503   %}
 9504   ins_pipe( pipe_slow );
 9505 %}
 9506 
 9507 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9508   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9509   match(Set dst (CompressV src mask));
 9510   match(Set dst (ExpandV src mask));
 9511   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9512   ins_encode %{
 9513     int opcode = this->ideal_Opcode();
 9514     int vector_len = vector_length_encoding(this);
 9515     BasicType bt  = Matcher::vector_element_basic_type(this);
 9516     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9517   %}
 9518   ins_pipe( pipe_slow );
 9519 %}
 9520 
 9521 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9522   match(Set dst (CompressM mask));
 9523   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9524   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9525   ins_encode %{
 9526     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9527     int mask_len = Matcher::vector_length(this);
 9528     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9529   %}
 9530   ins_pipe( pipe_slow );
 9531 %}
 9532 
 9533 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
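      // Note: ReverseV reverses the bit order within each element and ReverseBytesV
      // reverses the byte order within each element; when GFNI is available the bit
      // reversal uses a GF2P8AFFINEQB transform with the 0x8040201008040201 bit-matrix
      // constant loaded below.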
 9534 
 9535 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9536   predicate(!VM_Version::supports_gfni());
 9537   match(Set dst (ReverseV src));
 9538   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9539   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9540   ins_encode %{
 9541     int vec_enc = vector_length_encoding(this);
 9542     BasicType bt = Matcher::vector_element_basic_type(this);
 9543     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9544                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9545   %}
 9546   ins_pipe( pipe_slow );
 9547 %}
 9548 
 9549 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9550   predicate(VM_Version::supports_gfni());
 9551   match(Set dst (ReverseV src));
 9552   effect(TEMP dst, TEMP xtmp);
 9553   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9554   ins_encode %{
 9555     int vec_enc = vector_length_encoding(this);
 9556     BasicType bt  = Matcher::vector_element_basic_type(this);
 9557     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9558     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9559                                $xtmp$$XMMRegister);
 9560   %}
 9561   ins_pipe( pipe_slow );
 9562 %}
 9563 
 9564 instruct vreverse_byte_reg(vec dst, vec src) %{
 9565   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9566   match(Set dst (ReverseBytesV src));
 9567   effect(TEMP dst);
 9568   format %{ "vector_reverse_byte $dst, $src" %}
 9569   ins_encode %{
 9570     int vec_enc = vector_length_encoding(this);
 9571     BasicType bt = Matcher::vector_element_basic_type(this);
 9572     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9573   %}
 9574   ins_pipe( pipe_slow );
 9575 %}
 9576 
 9577 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9578   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9579   match(Set dst (ReverseBytesV src));
 9580   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9581   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9582   ins_encode %{
 9583     int vec_enc = vector_length_encoding(this);
 9584     BasicType bt = Matcher::vector_element_basic_type(this);
 9585     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9586                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9587   %}
 9588   ins_pipe( pipe_slow );
 9589 %}
 9590 
 9591 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9592 
 9593 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9594   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9595                                               Matcher::vector_length_in_bytes(n->in(1))));
 9596   match(Set dst (CountLeadingZerosV src));
 9597   format %{ "vector_count_leading_zeros $dst, $src" %}
 9598   ins_encode %{
 9599      int vlen_enc = vector_length_encoding(this, $src);
 9600      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9601      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9602                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9603   %}
 9604   ins_pipe( pipe_slow );
 9605 %}
 9606 
 9607 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9608   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9609                                               Matcher::vector_length_in_bytes(n->in(1))));
 9610   match(Set dst (CountLeadingZerosV src mask));
 9611   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9612   ins_encode %{
 9613     int vlen_enc = vector_length_encoding(this, $src);
 9614     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9615     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9616     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9617                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9618   %}
 9619   ins_pipe( pipe_slow );
 9620 %}
 9621 
 9622 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9623   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9624             VM_Version::supports_avx512cd() &&
 9625             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9626   match(Set dst (CountLeadingZerosV src));
 9627   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9628   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9629   ins_encode %{
 9630     int vlen_enc = vector_length_encoding(this, $src);
 9631     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9632     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9633                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9634   %}
 9635   ins_pipe( pipe_slow );
 9636 %}
 9637 
 9638 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9639   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9640   match(Set dst (CountLeadingZerosV src));
 9641   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9642   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9643   ins_encode %{
 9644     int vlen_enc = vector_length_encoding(this, $src);
 9645     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9646     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9647                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9648                                        $rtmp$$Register, true, vlen_enc);
 9649   %}
 9650   ins_pipe( pipe_slow );
 9651 %}
 9652 
 9653 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9654   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9655             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9656   match(Set dst (CountLeadingZerosV src));
 9657   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9658   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9659   ins_encode %{
 9660     int vlen_enc = vector_length_encoding(this, $src);
 9661     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9662     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9663                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9664   %}
 9665   ins_pipe( pipe_slow );
 9666 %}
 9667 
 9668 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9669   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9670             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9671   match(Set dst (CountLeadingZerosV src));
 9672   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9673   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9674   ins_encode %{
 9675     int vlen_enc = vector_length_encoding(this, $src);
 9676     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9677     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9678                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9679   %}
 9680   ins_pipe( pipe_slow );
 9681 %}
 9682 
 9683 // ---------------------------------- Vector Masked Operations ------------------------------------
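      // Note: these rules emit merge-masked EVEX operations: $dst is both the first
      // source and the destination, so lanes with a clear mask bit keep their previous
      // value.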
 9684 
 9685 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9686   match(Set dst (AddVB (Binary dst src2) mask));
 9687   match(Set dst (AddVS (Binary dst src2) mask));
 9688   match(Set dst (AddVI (Binary dst src2) mask));
 9689   match(Set dst (AddVL (Binary dst src2) mask));
 9690   match(Set dst (AddVF (Binary dst src2) mask));
 9691   match(Set dst (AddVD (Binary dst src2) mask));
 9692   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9693   ins_encode %{
 9694     int vlen_enc = vector_length_encoding(this);
 9695     BasicType bt = Matcher::vector_element_basic_type(this);
 9696     int opc = this->ideal_Opcode();
 9697     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9698                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9699   %}
 9700   ins_pipe( pipe_slow );
 9701 %}
 9702 
 9703 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9704   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9705   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9706   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9707   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9708   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9709   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9710   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9711   ins_encode %{
 9712     int vlen_enc = vector_length_encoding(this);
 9713     BasicType bt = Matcher::vector_element_basic_type(this);
 9714     int opc = this->ideal_Opcode();
 9715     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9716                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9717   %}
 9718   ins_pipe( pipe_slow );
 9719 %}
 9720 
 9721 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9722   match(Set dst (XorV (Binary dst src2) mask));
 9723   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9724   ins_encode %{
 9725     int vlen_enc = vector_length_encoding(this);
 9726     BasicType bt = Matcher::vector_element_basic_type(this);
 9727     int opc = this->ideal_Opcode();
 9728     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9729                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9730   %}
 9731   ins_pipe( pipe_slow );
 9732 %}
 9733 
 9734 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9735   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9736   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9737   ins_encode %{
 9738     int vlen_enc = vector_length_encoding(this);
 9739     BasicType bt = Matcher::vector_element_basic_type(this);
 9740     int opc = this->ideal_Opcode();
 9741     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9742                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9743   %}
 9744   ins_pipe( pipe_slow );
 9745 %}
 9746 
 9747 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9748   match(Set dst (OrV (Binary dst src2) mask));
 9749   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9750   ins_encode %{
 9751     int vlen_enc = vector_length_encoding(this);
 9752     BasicType bt = Matcher::vector_element_basic_type(this);
 9753     int opc = this->ideal_Opcode();
 9754     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9755                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9756   %}
 9757   ins_pipe( pipe_slow );
 9758 %}
 9759 
 9760 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9761   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9762   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9763   ins_encode %{
 9764     int vlen_enc = vector_length_encoding(this);
 9765     BasicType bt = Matcher::vector_element_basic_type(this);
 9766     int opc = this->ideal_Opcode();
 9767     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9768                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9769   %}
 9770   ins_pipe( pipe_slow );
 9771 %}
 9772 
 9773 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9774   match(Set dst (AndV (Binary dst src2) mask));
 9775   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9776   ins_encode %{
 9777     int vlen_enc = vector_length_encoding(this);
 9778     BasicType bt = Matcher::vector_element_basic_type(this);
 9779     int opc = this->ideal_Opcode();
 9780     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9781                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9782   %}
 9783   ins_pipe( pipe_slow );
 9784 %}
 9785 
 9786 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9787   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9788   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9789   ins_encode %{
 9790     int vlen_enc = vector_length_encoding(this);
 9791     BasicType bt = Matcher::vector_element_basic_type(this);
 9792     int opc = this->ideal_Opcode();
 9793     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9794                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9795   %}
 9796   ins_pipe( pipe_slow );
 9797 %}
 9798 
 9799 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9800   match(Set dst (SubVB (Binary dst src2) mask));
 9801   match(Set dst (SubVS (Binary dst src2) mask));
 9802   match(Set dst (SubVI (Binary dst src2) mask));
 9803   match(Set dst (SubVL (Binary dst src2) mask));
 9804   match(Set dst (SubVF (Binary dst src2) mask));
 9805   match(Set dst (SubVD (Binary dst src2) mask));
 9806   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9807   ins_encode %{
 9808     int vlen_enc = vector_length_encoding(this);
 9809     BasicType bt = Matcher::vector_element_basic_type(this);
 9810     int opc = this->ideal_Opcode();
 9811     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9812                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9813   %}
 9814   ins_pipe( pipe_slow );
 9815 %}
 9816 
 9817 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9818   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9819   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9820   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9821   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9822   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9823   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9824   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9825   ins_encode %{
 9826     int vlen_enc = vector_length_encoding(this);
 9827     BasicType bt = Matcher::vector_element_basic_type(this);
 9828     int opc = this->ideal_Opcode();
 9829     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9830                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9831   %}
 9832   ins_pipe( pipe_slow );
 9833 %}
 9834 
 9835 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9836   match(Set dst (MulVS (Binary dst src2) mask));
 9837   match(Set dst (MulVI (Binary dst src2) mask));
 9838   match(Set dst (MulVL (Binary dst src2) mask));
 9839   match(Set dst (MulVF (Binary dst src2) mask));
 9840   match(Set dst (MulVD (Binary dst src2) mask));
 9841   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9842   ins_encode %{
 9843     int vlen_enc = vector_length_encoding(this);
 9844     BasicType bt = Matcher::vector_element_basic_type(this);
 9845     int opc = this->ideal_Opcode();
 9846     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9847                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9848   %}
 9849   ins_pipe( pipe_slow );
 9850 %}
 9851 
 9852 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9853   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9854   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9855   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9856   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9857   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9858   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9859   ins_encode %{
 9860     int vlen_enc = vector_length_encoding(this);
 9861     BasicType bt = Matcher::vector_element_basic_type(this);
 9862     int opc = this->ideal_Opcode();
 9863     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9864                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9865   %}
 9866   ins_pipe( pipe_slow );
 9867 %}
 9868 
 9869 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9870   match(Set dst (SqrtVF dst mask));
 9871   match(Set dst (SqrtVD dst mask));
 9872   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9873   ins_encode %{
 9874     int vlen_enc = vector_length_encoding(this);
 9875     BasicType bt = Matcher::vector_element_basic_type(this);
 9876     int opc = this->ideal_Opcode();
 9877     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9878                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9879   %}
 9880   ins_pipe( pipe_slow );
 9881 %}
 9882 
 9883 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9884   match(Set dst (DivVF (Binary dst src2) mask));
 9885   match(Set dst (DivVD (Binary dst src2) mask));
 9886   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9887   ins_encode %{
 9888     int vlen_enc = vector_length_encoding(this);
 9889     BasicType bt = Matcher::vector_element_basic_type(this);
 9890     int opc = this->ideal_Opcode();
 9891     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9892                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9893   %}
 9894   ins_pipe( pipe_slow );
 9895 %}
 9896 
 9897 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9898   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9899   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9900   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9901   ins_encode %{
 9902     int vlen_enc = vector_length_encoding(this);
 9903     BasicType bt = Matcher::vector_element_basic_type(this);
 9904     int opc = this->ideal_Opcode();
 9905     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9906                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9907   %}
 9908   ins_pipe( pipe_slow );
 9909 %}
 9910 
 9911 
 9912 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9913   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9914   match(Set dst (RotateRightV (Binary dst shift) mask));
 9915   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9916   ins_encode %{
 9917     int vlen_enc = vector_length_encoding(this);
 9918     BasicType bt = Matcher::vector_element_basic_type(this);
 9919     int opc = this->ideal_Opcode();
 9920     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9921                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9922   %}
 9923   ins_pipe( pipe_slow );
 9924 %}
 9925 
 9926 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9927   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9928   match(Set dst (RotateRightV (Binary dst src2) mask));
 9929   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9930   ins_encode %{
 9931     int vlen_enc = vector_length_encoding(this);
 9932     BasicType bt = Matcher::vector_element_basic_type(this);
 9933     int opc = this->ideal_Opcode();
 9934     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9935                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9936   %}
 9937   ins_pipe( pipe_slow );
 9938 %}
 9939 
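      // Masked shifts come in three flavours: an immediate count (wrapped in
      // LShiftCntV/RShiftCntV), a uniform count held in a vector register
      // (!is_var_shift()), and a per-lane variable count (is_var_shift()).
      // The left, arithmetic-right and logical-right groups below all follow
      // this split.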
 9940 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9941   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9942   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9943   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9944   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9945   ins_encode %{
 9946     int vlen_enc = vector_length_encoding(this);
 9947     BasicType bt = Matcher::vector_element_basic_type(this);
 9948     int opc = this->ideal_Opcode();
 9949     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9950                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9951   %}
 9952   ins_pipe( pipe_slow );
 9953 %}
 9954 
 9955 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9956   predicate(!n->as_ShiftV()->is_var_shift());
 9957   match(Set dst (LShiftVS (Binary dst src2) mask));
 9958   match(Set dst (LShiftVI (Binary dst src2) mask));
 9959   match(Set dst (LShiftVL (Binary dst src2) mask));
 9960   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9961   ins_encode %{
 9962     int vlen_enc = vector_length_encoding(this);
 9963     BasicType bt = Matcher::vector_element_basic_type(this);
 9964     int opc = this->ideal_Opcode();
 9965     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9966                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9967   %}
 9968   ins_pipe( pipe_slow );
 9969 %}
 9970 
 9971 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9972   predicate(n->as_ShiftV()->is_var_shift());
 9973   match(Set dst (LShiftVS (Binary dst src2) mask));
 9974   match(Set dst (LShiftVI (Binary dst src2) mask));
 9975   match(Set dst (LShiftVL (Binary dst src2) mask));
 9976   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9977   ins_encode %{
 9978     int vlen_enc = vector_length_encoding(this);
 9979     BasicType bt = Matcher::vector_element_basic_type(this);
 9980     int opc = this->ideal_Opcode();
 9981     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9982                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9983   %}
 9984   ins_pipe( pipe_slow );
 9985 %}
 9986 
 9987 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9988   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9989   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9990   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9991   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9992   ins_encode %{
 9993     int vlen_enc = vector_length_encoding(this);
 9994     BasicType bt = Matcher::vector_element_basic_type(this);
 9995     int opc = this->ideal_Opcode();
 9996     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9997                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9998   %}
 9999   ins_pipe( pipe_slow );
10000 %}
10001 
10002 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10003   predicate(!n->as_ShiftV()->is_var_shift());
10004   match(Set dst (RShiftVS (Binary dst src2) mask));
10005   match(Set dst (RShiftVI (Binary dst src2) mask));
10006   match(Set dst (RShiftVL (Binary dst src2) mask));
10007   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10008   ins_encode %{
10009     int vlen_enc = vector_length_encoding(this);
10010     BasicType bt = Matcher::vector_element_basic_type(this);
10011     int opc = this->ideal_Opcode();
10012     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10013                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10014   %}
10015   ins_pipe( pipe_slow );
10016 %}
10017 
10018 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10019   predicate(n->as_ShiftV()->is_var_shift());
10020   match(Set dst (RShiftVS (Binary dst src2) mask));
10021   match(Set dst (RShiftVI (Binary dst src2) mask));
10022   match(Set dst (RShiftVL (Binary dst src2) mask));
10023   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10024   ins_encode %{
10025     int vlen_enc = vector_length_encoding(this);
10026     BasicType bt = Matcher::vector_element_basic_type(this);
10027     int opc = this->ideal_Opcode();
10028     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10029                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10030   %}
10031   ins_pipe( pipe_slow );
10032 %}
10033 
10034 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10035   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10036   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10037   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10038   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10039   ins_encode %{
10040     int vlen_enc = vector_length_encoding(this);
10041     BasicType bt = Matcher::vector_element_basic_type(this);
10042     int opc = this->ideal_Opcode();
10043     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10044                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10045   %}
10046   ins_pipe( pipe_slow );
10047 %}
10048 
10049 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10050   predicate(!n->as_ShiftV()->is_var_shift());
10051   match(Set dst (URShiftVS (Binary dst src2) mask));
10052   match(Set dst (URShiftVI (Binary dst src2) mask));
10053   match(Set dst (URShiftVL (Binary dst src2) mask));
10054   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10055   ins_encode %{
10056     int vlen_enc = vector_length_encoding(this);
10057     BasicType bt = Matcher::vector_element_basic_type(this);
10058     int opc = this->ideal_Opcode();
10059     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10060                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10061   %}
10062   ins_pipe( pipe_slow );
10063 %}
10064 
10065 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10066   predicate(n->as_ShiftV()->is_var_shift());
10067   match(Set dst (URShiftVS (Binary dst src2) mask));
10068   match(Set dst (URShiftVI (Binary dst src2) mask));
10069   match(Set dst (URShiftVL (Binary dst src2) mask));
10070   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10071   ins_encode %{
10072     int vlen_enc = vector_length_encoding(this);
10073     BasicType bt = Matcher::vector_element_basic_type(this);
10074     int opc = this->ideal_Opcode();
10075     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10076                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10077   %}
10078   ins_pipe( pipe_slow );
10079 %}
10080 
10081 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10082   match(Set dst (MaxV (Binary dst src2) mask));
10083   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10084   ins_encode %{
10085     int vlen_enc = vector_length_encoding(this);
10086     BasicType bt = Matcher::vector_element_basic_type(this);
10087     int opc = this->ideal_Opcode();
10088     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10089                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10090   %}
10091   ins_pipe( pipe_slow );
10092 %}
10093 
10094 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10095   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10096   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10097   ins_encode %{
10098     int vlen_enc = vector_length_encoding(this);
10099     BasicType bt = Matcher::vector_element_basic_type(this);
10100     int opc = this->ideal_Opcode();
10101     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10102                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10103   %}
10104   ins_pipe( pipe_slow );
10105 %}
10106 
10107 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10108   match(Set dst (MinV (Binary dst src2) mask));
10109   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10110   ins_encode %{
10111     int vlen_enc = vector_length_encoding(this);
10112     BasicType bt = Matcher::vector_element_basic_type(this);
10113     int opc = this->ideal_Opcode();
10114     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10115                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10116   %}
10117   ins_pipe( pipe_slow );
10118 %}
10119 
10120 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10121   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10122   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10123   ins_encode %{
10124     int vlen_enc = vector_length_encoding(this);
10125     BasicType bt = Matcher::vector_element_basic_type(this);
10126     int opc = this->ideal_Opcode();
10127     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10128                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10129   %}
10130   ins_pipe( pipe_slow );
10131 %}
10132 
10133 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10134   match(Set dst (VectorRearrange (Binary dst src2) mask));
10135   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10136   ins_encode %{
10137     int vlen_enc = vector_length_encoding(this);
10138     BasicType bt = Matcher::vector_element_basic_type(this);
10139     int opc = this->ideal_Opcode();
10140     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10141                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10142   %}
10143   ins_pipe( pipe_slow );
10144 %}
10145 
10146 instruct vabs_masked(vec dst, kReg mask) %{
10147   match(Set dst (AbsVB dst mask));
10148   match(Set dst (AbsVS dst mask));
10149   match(Set dst (AbsVI dst mask));
10150   match(Set dst (AbsVL dst mask));
10151   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10152   ins_encode %{
10153     int vlen_enc = vector_length_encoding(this);
10154     BasicType bt = Matcher::vector_element_basic_type(this);
10155     int opc = this->ideal_Opcode();
10156     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10157                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10158   %}
10159   ins_pipe( pipe_slow );
10160 %}
10161 
10162 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10163   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10164   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10165   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10166   ins_encode %{
10167     assert(UseFMA, "Needs FMA instructions support.");
10168     int vlen_enc = vector_length_encoding(this);
10169     BasicType bt = Matcher::vector_element_basic_type(this);
10170     int opc = this->ideal_Opcode();
10171     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10172                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10173   %}
10174   ins_pipe( pipe_slow );
10175 %}
10176 
10177 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10178   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10179   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10180   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10181   ins_encode %{
10182     assert(UseFMA, "Needs FMA instructions support.");
10183     int vlen_enc = vector_length_encoding(this);
10184     BasicType bt = Matcher::vector_element_basic_type(this);
10185     int opc = this->ideal_Opcode();
10186     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10187                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10188   %}
10189   ins_pipe( pipe_slow );
10190 %}
10191 
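      // Masked vector compare: the result is an opmask register.  The incoming
      // mask predicates the comparison, and the 'cond' constant is translated
      // into the matching EVEX comparison predicate: signed or unsigned for
      // integral lanes, an FP predicate for float/double lanes.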
10192 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10193   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10194   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10195   ins_encode %{
10196     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10197     int vlen_enc = vector_length_encoding(this, $src1);
10198     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10199 
10200     // Dispatch the comparison on the source element basic type.
10201     switch (src1_elem_bt) {
10202       case T_BYTE: {
10203         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10204         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10205         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10206         break;
10207       }
10208       case T_SHORT: {
10209         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10210         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10211         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10212         break;
10213       }
10214       case T_INT: {
10215         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10216         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10217         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10218         break;
10219       }
10220       case T_LONG: {
10221         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10222         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10223         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10224         break;
10225       }
10226       case T_FLOAT: {
10227         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10228         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10229         break;
10230       }
10231       case T_DOUBLE: {
10232         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10233         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10234         break;
10235       }
10236       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10237     }
10238   %}
10239   ins_pipe( pipe_slow );
10240 %}
10241 
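      // MaskAll broadcasts a scalar boolean into every lane of an opmask.
      // Mask negation has no ideal node of its own; it reaches the matcher as
      // (XorVMask src (MaskAll -1)) and is matched onto knot below.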
10242 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10243   predicate(Matcher::vector_length(n) <= 32);
10244   match(Set dst (MaskAll src));
10245   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10246   ins_encode %{
10247     int mask_len = Matcher::vector_length(this);
10248     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10249   %}
10250   ins_pipe( pipe_slow );
10251 %}
10252 
10253 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10254   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10255   match(Set dst (XorVMask src (MaskAll cnt)));
10256   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10257   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10258   ins_encode %{
10259     uint masklen = Matcher::vector_length(this);
10260     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10261   %}
10262   ins_pipe( pipe_slow );
10263 %}
10264 
10265 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10266   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10267             (Matcher::vector_length(n) == 16) ||
10268             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10269   match(Set dst (XorVMask src (MaskAll cnt)));
10270   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10271   ins_encode %{
10272     uint masklen = Matcher::vector_length(this);
10273     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10274   %}
10275   ins_pipe( pipe_slow );
10276 %}
10277 
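      // VectorLongToMask: expand the low bits of a long held in a GPR into a
      // vector mask.  Without predicate registers (bottom type is not a
      // vectmask) the mask is materialized in a vector register via
      // vector_long_to_maskvec; with EVEX predicate masks it is a plain kmov.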
10278 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10279   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10280   match(Set dst (VectorLongToMask src));
10281   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10282   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10283   ins_encode %{
10284     int mask_len = Matcher::vector_length(this);
10285     int vec_enc  = vector_length_encoding(mask_len);
10286     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10287                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10288   %}
10289   ins_pipe( pipe_slow );
10290 %}
10291 
10292 
10293 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10294   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10295   match(Set dst (VectorLongToMask src));
10296   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10297   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10298   ins_encode %{
10299     int mask_len = Matcher::vector_length(this);
10300     assert(mask_len <= 32, "invalid mask length");
10301     int vec_enc  = vector_length_encoding(mask_len);
10302     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10303                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10304   %}
10305   ins_pipe( pipe_slow );
10306 %}
10307 
10308 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10309   predicate(n->bottom_type()->isa_vectmask());
10310   match(Set dst (VectorLongToMask src));
10311   format %{ "long_to_mask_evex $dst, $src\t!" %}
10312   ins_encode %{
10313     __ kmov($dst$$KRegister, $src$$Register);
10314   %}
10315   ins_pipe( pipe_slow );
10316 %}
10317 
10318 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10319   match(Set dst (AndVMask src1 src2));
10320   match(Set dst (OrVMask src1 src2));
10321   match(Set dst (XorVMask src1 src2));
10322   effect(TEMP kscratch);
10323   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10324   ins_encode %{
10325     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10326     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10327     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10328     uint masklen = Matcher::vector_length(this);
10329     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10330     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10331   %}
10332   ins_pipe( pipe_slow );
10333 %}
10334 
10335 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10336   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10337   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10338   ins_encode %{
10339     int vlen_enc = vector_length_encoding(this);
10340     BasicType bt = Matcher::vector_element_basic_type(this);
10341     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10342                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10343   %}
10344   ins_pipe( pipe_slow );
10345 %}
10346 
10347 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10348   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10349   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10350   ins_encode %{
10351     int vlen_enc = vector_length_encoding(this);
10352     BasicType bt = Matcher::vector_element_basic_type(this);
10353     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10354                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10355   %}
10356   ins_pipe( pipe_slow );
10357 %}
10358 
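      // CastVV is a no-op at the machine level: it is matched to a zero-size,
      // zero-cost, empty encoding for opmask, vector and legacy-vector operands.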
10359 instruct castMM(kReg dst)
10360 %{
10361   match(Set dst (CastVV dst));
10362 
10363   size(0);
10364   format %{ "# castVV of $dst" %}
10365   ins_encode(/* empty encoding */);
10366   ins_cost(0);
10367   ins_pipe(empty);
10368 %}
10369 
10370 instruct castVV(vec dst)
10371 %{
10372   match(Set dst (CastVV dst));
10373 
10374   size(0);
10375   format %{ "# castVV of $dst" %}
10376   ins_encode(/* empty encoding */);
10377   ins_cost(0);
10378   ins_pipe(empty);
10379 %}
10380 
10381 instruct castVVLeg(legVec dst)
10382 %{
10383   match(Set dst (CastVV dst));
10384 
10385   size(0);
10386   format %{ "# castVV of $dst" %}
10387   ins_encode(/* empty encoding */);
10388   ins_cost(0);
10389   ins_pipe(empty);
10390 %}
10391 
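      // IsInfiniteF/IsInfiniteD: VFPCLASSSS/VFPCLASSSD with immediate 0x18
      // (+INF | -INF) sets the opmask bit when the operand is infinite; the
      // bit is then copied into the integer result with kmovbl.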
10392 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10393 %{
10394   match(Set dst (IsInfiniteF src));
10395   effect(TEMP ktmp, KILL cr);
10396   format %{ "float_class_check $dst, $src" %}
10397   ins_encode %{
10398     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10399     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10400   %}
10401   ins_pipe(pipe_slow);
10402 %}
10403 
10404 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10405 %{
10406   match(Set dst (IsInfiniteD src));
10407   effect(TEMP ktmp, KILL cr);
10408   format %{ "double_class_check $dst, $src" %}
10409   ins_encode %{
10410     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10411     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10412   %}
10413   ins_pipe(pipe_slow);
10414 %}
10415 
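      // Saturating vector add/sub.  Byte and short lanes map directly onto the
      // packed saturating instructions via vector_saturating_op.  Int and long
      // lanes have no hardware saturating forms, so they are synthesized with
      // explicit overflow handling: opmask temporaries on EVEX targets, extra
      // vector temporaries on plain AVX.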
10416 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10417 %{
10418   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10419             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10420   match(Set dst (SaturatingAddV src1 src2));
10421   match(Set dst (SaturatingSubV src1 src2));
10422   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10423   ins_encode %{
10424     int vlen_enc = vector_length_encoding(this);
10425     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10426     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10427                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10428   %}
10429   ins_pipe(pipe_slow);
10430 %}
10431 
10432 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10433 %{
10434   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10435             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10436   match(Set dst (SaturatingAddV src1 src2));
10437   match(Set dst (SaturatingSubV src1 src2));
10438   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10439   ins_encode %{
10440     int vlen_enc = vector_length_encoding(this);
10441     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10442     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10443                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10444   %}
10445   ins_pipe(pipe_slow);
10446 %}
10447 
10448 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10449 %{
10450   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10451             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10452             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10453   match(Set dst (SaturatingAddV src1 src2));
10454   match(Set dst (SaturatingSubV src1 src2));
10455   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10456   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10457   ins_encode %{
10458     int vlen_enc = vector_length_encoding(this);
10459     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10460     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10461                                         $src1$$XMMRegister, $src2$$XMMRegister,
10462                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10463                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10464   %}
10465   ins_pipe(pipe_slow);
10466 %}
10467 
10468 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10469 %{
10470   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10471             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10472             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10473   match(Set dst (SaturatingAddV src1 src2));
10474   match(Set dst (SaturatingSubV src1 src2));
10475   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10476   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10477   ins_encode %{
10478     int vlen_enc = vector_length_encoding(this);
10479     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10480     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10481                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10482                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10483   %}
10484   ins_pipe(pipe_slow);
10485 %}
10486 
10487 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10488 %{
10489   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10490             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10491             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10492   match(Set dst (SaturatingAddV src1 src2));
10493   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10494   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10495   ins_encode %{
10496     int vlen_enc = vector_length_encoding(this);
10497     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10498     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10499                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10500   %}
10501   ins_pipe(pipe_slow);
10502 %}
10503 
10504 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10505 %{
10506   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10507             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10508             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10509   match(Set dst (SaturatingAddV src1 src2));
10510   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10511   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10512   ins_encode %{
10513     int vlen_enc = vector_length_encoding(this);
10514     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10515     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10516                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10517   %}
10518   ins_pipe(pipe_slow);
10519 %}
10520 
10521 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10522 %{
10523   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10524             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10525             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10526   match(Set dst (SaturatingSubV src1 src2));
10527   effect(TEMP ktmp);
10528   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10529   ins_encode %{
10530     int vlen_enc = vector_length_encoding(this);
10531     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10532     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10533                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10534   %}
10535   ins_pipe(pipe_slow);
10536 %}
10537 
10538 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10539 %{
10540   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10541             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10542             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10543   match(Set dst (SaturatingSubV src1 src2));
10544   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10545   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10546   ins_encode %{
10547     int vlen_enc = vector_length_encoding(this);
10548     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10549     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10550                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10551   %}
10552   ins_pipe(pipe_slow);
10553 %}
10554 
10555 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10556 %{
10557   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10558             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10559   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10560   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10561   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10562   ins_encode %{
10563     int vlen_enc = vector_length_encoding(this);
10564     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10565     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10566                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10567   %}
10568   ins_pipe(pipe_slow);
10569 %}
10570 
10571 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10572 %{
10573   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10574             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10575   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10576   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10577   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10578   ins_encode %{
10579     int vlen_enc = vector_length_encoding(this);
10580     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10581     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10582                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10583   %}
10584   ins_pipe(pipe_slow);
10585 %}
10586 
10587 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10588   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10589             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10590   match(Set dst (SaturatingAddV (Binary dst src) mask));
10591   match(Set dst (SaturatingSubV (Binary dst src) mask));
10592   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10593   ins_encode %{
10594     int vlen_enc = vector_length_encoding(this);
10595     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10596     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10597                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10598   %}
10599   ins_pipe( pipe_slow );
10600 %}
10601 
10602 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10603   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10604             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10605   match(Set dst (SaturatingAddV (Binary dst src) mask));
10606   match(Set dst (SaturatingSubV (Binary dst src) mask));
10607   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10608   ins_encode %{
10609     int vlen_enc = vector_length_encoding(this);
10610     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10611     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10612                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10613   %}
10614   ins_pipe( pipe_slow );
10615 %}
10616 
10617 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10618   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10619             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10620   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10621   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10622   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10623   ins_encode %{
10624     int vlen_enc = vector_length_encoding(this);
10625     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10626     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10627                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10628   %}
10629   ins_pipe( pipe_slow );
10630 %}
10631 
10632 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10633   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10634             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10635   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10636   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10637   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10638   ins_encode %{
10639     int vlen_enc = vector_length_encoding(this);
10640     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10641     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10642                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10643   %}
10644   ins_pipe( pipe_slow );
10645 %}
10646 
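      // SelectFromTwoVector: per-lane selection (a two-table permute) from
      // src1/src2 steered by the index vector, which also serves as the
      // destination.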
10647 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10648 %{
10649   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10650   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10651   ins_encode %{
10652     int vlen_enc = vector_length_encoding(this);
10653     BasicType bt = Matcher::vector_element_basic_type(this);
10654     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10655   %}
10656   ins_pipe(pipe_slow);
10657 %}
10658 
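      // Float16 (FP16) scalar and vector operations.  ReinterpretS2HF/HF2S move
      // a 16-bit value between a GPR and an XMM register with VMOVW; the
      // arithmetic patterns below rely on the AVX512-FP16/AVX10 scalar helpers
      // (vsqrtsh, efp16sh, eminmaxsh, vfmadd132sh, scalar_max_min_fp16) and
      // their packed counterparts (evsqrtph, evfp16ph, evfmadd132ph,
      // evminmaxph, vector_max_min_fp16).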
10659 instruct reinterpretS2HF(regF dst, rRegI src)
10660 %{
10661   match(Set dst (ReinterpretS2HF src));
10662   format %{ "vmovw $dst, $src" %}
10663   ins_encode %{
10664     __ vmovw($dst$$XMMRegister, $src$$Register);
10665   %}
10666   ins_pipe(pipe_slow);
10667 %}
10668 
10669 instruct reinterpretHF2S(rRegI dst, regF src)
10670 %{
10671   match(Set dst (ReinterpretHF2S src));
10672   format %{ "vmovw $dst, $src" %}
10673   ins_encode %{
10674     __ vmovw($dst$$Register, $src$$XMMRegister);
10675   %}
10676   ins_pipe(pipe_slow);
10677 %}
10678 
10679 instruct convF2HFAndS2HF(regF dst, regF src)
10680 %{
10681   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10682   format %{ "convF2HFAndS2HF $dst, $src" %}
10683   ins_encode %{
10684     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10685   %}
10686   ins_pipe(pipe_slow);
10687 %}
10688 
10689 instruct convHF2SAndHF2F(regF dst, regF src)
10690 %{
10691   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10692   format %{ "convHF2SAndHF2F $dst, $src" %}
10693   ins_encode %{
10694     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10695   %}
10696   ins_pipe(pipe_slow);
10697 %}
10698 
10699 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10700 %{
10701   match(Set dst (SqrtHF src));
10702   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10703   ins_encode %{
10704     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10705   %}
10706   ins_pipe(pipe_slow);
10707 %}
10708 
10709 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10710 %{
10711   match(Set dst (AddHF src1 src2));
10712   match(Set dst (DivHF src1 src2));
10713   match(Set dst (MulHF src1 src2));
10714   match(Set dst (SubHF src1 src2));
10715   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10716   ins_encode %{
10717     int opcode = this->ideal_Opcode();
10718     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10719   %}
10720   ins_pipe(pipe_slow);
10721 %}
10722 
10723 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10724 %{
10725   predicate(VM_Version::supports_avx10_2());
10726   match(Set dst (MaxHF src1 src2));
10727   match(Set dst (MinHF src1 src2));
10728   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10729   ins_encode %{
10730     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10731     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10732   %}
10733   ins_pipe( pipe_slow );
10734 %}
10735 
10736 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10737 %{
10738   predicate(!VM_Version::supports_avx10_2());
10739   match(Set dst (MaxHF src1 src2));
10740   match(Set dst (MinHF src1 src2));
10741   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10742   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t! using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10743   ins_encode %{
10744     int opcode = this->ideal_Opcode();
10745     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10746                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10747   %}
10748   ins_pipe( pipe_slow );
10749 %}
10750 
10751 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10752 %{
10753   match(Set dst (FmaHF src2 (Binary dst src1)));
10754   effect(DEF dst);
10755   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10756   ins_encode %{
10757     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10758   %}
10759   ins_pipe( pipe_slow );
10760 %}
10761 
10762 
10763 instruct vector_sqrt_HF_reg(vec dst, vec src)
10764 %{
10765   match(Set dst (SqrtVHF src));
10766   format %{ "vector_sqrt_fp16 $dst, $src" %}
10767   ins_encode %{
10768     int vlen_enc = vector_length_encoding(this);
10769     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10770   %}
10771   ins_pipe(pipe_slow);
10772 %}
10773 
10774 instruct vector_sqrt_HF_mem(vec dst, memory src)
10775 %{
10776   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10777   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10778   ins_encode %{
10779     int vlen_enc = vector_length_encoding(this);
10780     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10781   %}
10782   ins_pipe(pipe_slow);
10783 %}
10784 
10785 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10786 %{
10787   match(Set dst (AddVHF src1 src2));
10788   match(Set dst (DivVHF src1 src2));
10789   match(Set dst (MulVHF src1 src2));
10790   match(Set dst (SubVHF src1 src2));
10791   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10792   ins_encode %{
10793     int vlen_enc = vector_length_encoding(this);
10794     int opcode = this->ideal_Opcode();
10795     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10796   %}
10797   ins_pipe(pipe_slow);
10798 %}
10799 
10800 
10801 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10802 %{
10803   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10804   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10805   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10806   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10807   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10808   ins_encode %{
10809     int vlen_enc = vector_length_encoding(this);
10810     int opcode = this->ideal_Opcode();
10811     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10812   %}
10813   ins_pipe(pipe_slow);
10814 %}
10815 
10816 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10817 %{
10818   match(Set dst (FmaVHF src2 (Binary dst src1)));
10819   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10820   ins_encode %{
10821     int vlen_enc = vector_length_encoding(this);
10822     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10823   %}
10824   ins_pipe( pipe_slow );
10825 %}
10826 
10827 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10828 %{
10829   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10830   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10831   ins_encode %{
10832     int vlen_enc = vector_length_encoding(this);
10833     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10834   %}
10835   ins_pipe( pipe_slow );
10836 %}
10837 
10838 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10839 %{
10840   predicate(VM_Version::supports_avx10_2());
10841   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10842   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10843   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10844   ins_encode %{
10845     int vlen_enc = vector_length_encoding(this);
10846     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10847     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10848   %}
10849   ins_pipe( pipe_slow );
10850 %}
10851 
10852 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
10853 %{
10854   predicate(VM_Version::supports_avx10_2());
10855   match(Set dst (MinVHF src1 src2));
10856   match(Set dst (MaxVHF src1 src2));
10857   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
10858   ins_encode %{
10859     int vlen_enc = vector_length_encoding(this);
10860     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10861     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
10862   %}
10863   ins_pipe( pipe_slow );
10864 %}
10865 
10866 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10867 %{
10868   predicate(!VM_Version::supports_avx10_2());
10869   match(Set dst (MinVHF src1 src2));
10870   match(Set dst (MaxVHF src1 src2));
10871   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10872   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t! using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10873   ins_encode %{
10874     int vlen_enc = vector_length_encoding(this);
10875     int opcode = this->ideal_Opcode();
10876     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10877                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10878   %}
10879   ins_pipe( pipe_slow );
10880 %}