//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

// XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters
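//
// For example, the first definition below,
//      reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares word (a) of XMM0 as a save-on-call Float slot with encoding 0;
// XMM0b through XMM0p name the remaining 32-bit words of the same register,
// so a full 512-bit register is described by 16 consecutive slots.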

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
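// (K0 is not listed here: in EVEX encodings a write-mask specifier of 0 means
//  "no masking", so k0 is not made available to the register allocator as a
//  predicate register.)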
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
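// A reg_class_dynamic selects its first (EVEX) class when the predicate holds
// and falls back to the second (legacy) class otherwise, so XMM16-XMM31 are
// only exposed to the allocator on EVEX-capable CPUs.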

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

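// The vector register classes below are named by width: vectors_* classes
// include one 32-bit word per register, vectord_* two words (64 bits),
// vectorx_* four words (128 bits), and so on for the wider classes that
// follow.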
// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  929 
  930 // Class for all 128bit vector registers (legacy: XMM0-XMM15)
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
  948 // Class for all 128bit vector registers (EVEX: XMM0-XMM31)
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for all 256bit vector registers (legacy: XMM0-XMM15)
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for all 256bit vector registers (EVEX: XMM0-XMM31)
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
 1142     // The exception handler starts out as a jump and can be patched to
 1143     // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5-byte instructions plus one move for an unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch (bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // Push "the_pc" on the stack without destroying any registers,
 1276   // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
 1586 // Identify extra cases where we might want to provide match rules for vector nodes and
 1587 // other intrinsics, guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
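        // For example (per the table above): a 512bit BYTE vector (vlen == 64) is
        // only accepted when AVX512BW is available, while a 256bit FLOAT vector
        // (vlen == 8) is already accepted on AVX1.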
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
 1613       } // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
 1728       if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
 1765         // As per JLS section 5.1.3, narrowing conversions to sub-word types
 1766         // happen after an intermediate conversion to integer, and the special
 1767         // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
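              // For example, a cast from a float vector to a byte vector is
              // performed as float -> int followed by an int -> byte narrowing.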
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1835          return false;
 1836        }
 1837        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1838          return false;
 1839        }
 1840        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1841          return false;
 1842        }
 1843        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1844          return false;
 1845        }
 1846        break;
 1847     case Op_MaskAll:
 1848       if (!VM_Version::supports_evex()) {
 1849         return false;
 1850       }
 1851       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1852         return false;
 1853       }
 1854       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1855         return false;
 1856       }
 1857       break;
 1858     case Op_VectorMaskCmp:
 1859       if (vlen < 2 || size_in_bits < 32) {
 1860         return false;
 1861       }
 1862       break;
 1863     case Op_CompressM:
 1864       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_CompressV:
 1869     case Op_ExpandV:
 1870       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1871         return false;
 1872       }
 1873       if (size_in_bits < 128) {
 1874         return false;
 1875       }
 1876     case Op_VectorLongToMask:
 1877       if (UseAVX < 1) {
 1878         return false;
 1879       }
 1880       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1881         return false;
 1882       }
 1883       break;
 1884     case Op_SignumVD:
 1885     case Op_SignumVF:
 1886       if (UseAVX < 1) {
 1887         return false;
 1888       }
 1889       break;
 1890     case Op_PopCountVI:
 1891     case Op_PopCountVL: {
 1892         if (!is_pop_count_instr_target(bt) &&
 1893             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1894           return false;
 1895         }
 1896       }
 1897       break;
 1898     case Op_ReverseV:
 1899     case Op_ReverseBytesV:
 1900       if (UseAVX < 2) {
 1901         return false;
 1902       }
 1903       break;
 1904     case Op_CountTrailingZerosV:
 1905     case Op_CountLeadingZerosV:
 1906       if (UseAVX < 2) {
 1907         return false;
 1908       }
 1909       break;
 1910   }
 1911   return true;  // Match rules are supported by default.
 1912 }
 1913 
 1914 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 1915   // ADLC based match_rule_supported routine checks for the existence of pattern based
 1916   // on IR opcode. Most of the unary/binary/ternary masked operations share the IR nodes
 1917   // of their non-masked counterparts, with the mask edge being the differentiator.
 1918   // This routine does a strict check on the existence of masked operation patterns
 1919   // by returning a default false value for all the other opcodes apart from the
 1920   // ones whose masked instruction patterns are defined in this file.
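        // For example, a masked vector add still uses the same AddV* ideal node as
        // the unmasked form; the extra mask input is what selects a masked
        // instruction pattern defined in this file.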
 1921   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1922     return false;
 1923   }
 1924 
 1925   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1926   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1927     return false;
 1928   }
 1929   switch (opcode) {
 1930     // Unary masked operations
 1931     case Op_AbsVB:
 1932     case Op_AbsVS:
 1933       if (!VM_Version::supports_avx512bw()) {
 1934         return false;  // Implementation limitation
 1935       } // fallthrough
 1936     case Op_AbsVI:
 1937     case Op_AbsVL:
 1938       return true;
 1939 
 1940     // Ternary masked operations
 1941     case Op_FmaVF:
 1942     case Op_FmaVD:
 1943       return true;
 1944 
 1945     case Op_MacroLogicV:
 1946       if (bt != T_INT && bt != T_LONG) {
 1947         return false;
 1948       }
 1949       return true;
 1950 
 1951     // Binary masked operations
 1952     case Op_AddVB:
 1953     case Op_AddVS:
 1954     case Op_SubVB:
 1955     case Op_SubVS:
 1956     case Op_MulVS:
 1957     case Op_LShiftVS:
 1958     case Op_RShiftVS:
 1959     case Op_URShiftVS:
 1960       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1961       if (!VM_Version::supports_avx512bw()) {
 1962         return false;  // Implementation limitation
 1963       }
 1964       return true;
 1965 
 1966     case Op_MulVL:
 1967       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1968       if (!VM_Version::supports_avx512dq()) {
 1969         return false;  // Implementation limitation
 1970       }
 1971       return true;
 1972 
 1973     case Op_AndV:
 1974     case Op_OrV:
 1975     case Op_XorV:
 1976     case Op_RotateRightV:
 1977     case Op_RotateLeftV:
 1978       if (bt != T_INT && bt != T_LONG) {
 1979         return false; // Implementation limitation
 1980       }
 1981       return true;
 1982 
 1983     case Op_VectorLoadMask:
 1984       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1985       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1986         return false;
 1987       }
 1988       return true;
 1989 
 1990     case Op_AddVI:
 1991     case Op_AddVL:
 1992     case Op_AddVF:
 1993     case Op_AddVD:
 1994     case Op_SubVI:
 1995     case Op_SubVL:
 1996     case Op_SubVF:
 1997     case Op_SubVD:
 1998     case Op_MulVI:
 1999     case Op_MulVF:
 2000     case Op_MulVD:
 2001     case Op_DivVF:
 2002     case Op_DivVD:
 2003     case Op_SqrtVF:
 2004     case Op_SqrtVD:
 2005     case Op_LShiftVI:
 2006     case Op_LShiftVL:
 2007     case Op_RShiftVI:
 2008     case Op_RShiftVL:
 2009     case Op_URShiftVI:
 2010     case Op_URShiftVL:
 2011     case Op_LoadVectorMasked:
 2012     case Op_StoreVectorMasked:
 2013     case Op_LoadVectorGatherMasked:
 2014     case Op_StoreVectorScatterMasked:
 2015       return true;
 2016 
 2017     case Op_UMinV:
 2018     case Op_UMaxV:
 2019       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2020         return false;
 2021       } // fallthrough
 2022     case Op_MaxV:
 2023     case Op_MinV:
 2024       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2025         return false; // Implementation limitation
 2026       }
 2027       if (is_floating_point_type(bt)) {
 2028         return false; // Implementation limitation
 2029       }
 2030       return true;
 2031     case Op_SaturatingAddV:
 2032     case Op_SaturatingSubV:
 2033       if (!is_subword_type(bt)) {
 2034         return false;
 2035       }
 2036       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2037         return false; // Implementation limitation
 2038       }
 2039       return true;
 2040 
 2041     case Op_VectorMaskCmp:
 2042       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2043         return false; // Implementation limitation
 2044       }
 2045       return true;
 2046 
 2047     case Op_VectorRearrange:
 2048       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2049         return false; // Implementation limitation
 2050       }
 2051       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2052         return false; // Implementation limitation
 2053       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2054         return false; // Implementation limitation
 2055       }
 2056       return true;
 2057 
 2058     // Binary Logical operations
 2059     case Op_AndVMask:
 2060     case Op_OrVMask:
 2061     case Op_XorVMask:
 2062       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2063         return false; // Implementation limitation
 2064       }
 2065       return true;
 2066 
 2067     case Op_PopCountVI:
 2068     case Op_PopCountVL:
 2069       if (!is_pop_count_instr_target(bt)) {
 2070         return false;
 2071       }
 2072       return true;
 2073 
 2074     case Op_MaskAll:
 2075       return true;
 2076 
 2077     case Op_CountLeadingZerosV:
 2078       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2079         return true;
 2080       }
 2081     default:
 2082       return false;
 2083   }
 2084 }
 2085 
 2086 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2087   return false;
 2088 }
 2089 
 2090 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2091 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2092   switch (elem_bt) {
 2093     case T_BYTE:  return false;
 2094     case T_SHORT: return !VM_Version::supports_avx512bw();
 2095     case T_INT:   return !VM_Version::supports_avx();
 2096     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2097     default:
 2098       ShouldNotReachHere();
 2099       return false;
 2100   }
 2101 }
 2102 
 2103 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2104   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2105   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2106   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2107       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2108     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2109     return new legVecZOper();
 2110   }
 2111   if (legacy) {
 2112     switch (ideal_reg) {
 2113       case Op_VecS: return new legVecSOper();
 2114       case Op_VecD: return new legVecDOper();
 2115       case Op_VecX: return new legVecXOper();
 2116       case Op_VecY: return new legVecYOper();
 2117       case Op_VecZ: return new legVecZOper();
 2118     }
 2119   } else {
 2120     switch (ideal_reg) {
 2121       case Op_VecS: return new vecSOper();
 2122       case Op_VecD: return new vecDOper();
 2123       case Op_VecX: return new vecXOper();
 2124       case Op_VecY: return new vecYOper();
 2125       case Op_VecZ: return new vecZOper();
 2126     }
 2127   }
 2128   ShouldNotReachHere();
 2129   return nullptr;
 2130 }
 2131 
 2132 bool Matcher::is_reg2reg_move(MachNode* m) {
 2133   switch (m->rule()) {
 2134     case MoveVec2Leg_rule:
 2135     case MoveLeg2Vec_rule:
 2136     case MoveF2VL_rule:
 2137     case MoveF2LEG_rule:
 2138     case MoveVL2F_rule:
 2139     case MoveLEG2F_rule:
 2140     case MoveD2VL_rule:
 2141     case MoveD2LEG_rule:
 2142     case MoveVL2D_rule:
 2143     case MoveLEG2D_rule:
 2144       return true;
 2145     default:
 2146       return false;
 2147   }
 2148 }
 2149 
 2150 bool Matcher::is_generic_vector(MachOper* opnd) {
 2151   switch (opnd->opcode()) {
 2152     case VEC:
 2153     case LEGVEC:
 2154       return true;
 2155     default:
 2156       return false;
 2157   }
 2158 }
 2159 
 2160 //------------------------------------------------------------------------
 2161 
 2162 const RegMask* Matcher::predicate_reg_mask(void) {
 2163   return &_VECTMASK_REG_mask;
 2164 }
 2165 
 2166 // Max vector size in bytes. 0 if not supported.
 2167 int Matcher::vector_width_in_bytes(BasicType bt) {
 2168   assert(is_java_primitive(bt), "only primitive type vectors");
 2169   // SSE2 supports 128bit vectors for all types.
 2170   // AVX2 supports 256bit vectors for all types.
 2171   // AVX512 (EVEX) supports 512bit vectors for all types.
 2172   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
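        // For example, UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (256bit vectors)
        // and UseAVX == 3 gives (1 << 3) * 8 = 64 bytes (512bit vectors).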
 2173   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2174   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2175     size = (UseAVX > 2) ? 64 : 32;
 2176   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2177     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2178   // Use flag to limit vector size.
 2179   size = MIN2(size,(int)MaxVectorSize);
 2180   // Minimum 2 values in vector (or 4 for bytes).
 2181   switch (bt) {
 2182   case T_DOUBLE:
 2183   case T_LONG:
 2184     if (size < 16) return 0;
 2185     break;
 2186   case T_FLOAT:
 2187   case T_INT:
 2188     if (size < 8) return 0;
 2189     break;
 2190   case T_BOOLEAN:
 2191     if (size < 4) return 0;
 2192     break;
 2193   case T_CHAR:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_BYTE:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_SHORT:
 2200     if (size < 4) return 0;
 2201     break;
 2202   default:
 2203     ShouldNotReachHere();
 2204   }
 2205   return size;
 2206 }
 2207 
 2208 // Limits on vector size (number of elements) loaded into vector.
 2209 int Matcher::max_vector_size(const BasicType bt) {
 2210   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2211 }
 2212 int Matcher::min_vector_size(const BasicType bt) {
 2213   int max_size = max_vector_size(bt);
 2214   // Min size which can be loaded into vector is 4 bytes.
 2215   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
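        // For example, T_BYTE gives a minimum of 4 elements (4 bytes), while
        // T_INT gives a minimum of 2 elements (8 bytes).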
 2216   // Support for calling svml with Double64 (single-element double) vectors
 2217   if (bt == T_DOUBLE) {
 2218     size = 1;
 2219   }
 2220   return MIN2(size,max_size);
 2221 }
 2222 
 2223 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2224   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2225   // by default on Cascade Lake
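  // (512-bit operations can cause frequency throttling on that part, so for
  // example T_INT is capped at 32 / 4 = 8 elements even when AVX-512 is available.)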
 2226   if (VM_Version::is_default_intel_cascade_lake()) {
 2227     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2228   }
 2229   return Matcher::max_vector_size(bt);
 2230 }
 2231 
 2232 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2233   return -1;
 2234 }
 2235 
 2236 // Vector ideal reg corresponding to specified size in bytes
 2237 uint Matcher::vector_ideal_reg(int size) {
 2238   assert(MaxVectorSize >= size, "");
 2239   switch(size) {
 2240     case  4: return Op_VecS;
 2241     case  8: return Op_VecD;
 2242     case 16: return Op_VecX;
 2243     case 32: return Op_VecY;
 2244     case 64: return Op_VecZ;
 2245   }
 2246   ShouldNotReachHere();
 2247   return 0;
 2248 }
 2249 
 2250 // Check for shift by small constant as well
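// (x86 addressing modes scale an index by 1, 2, 4 or 8 only, i.e. a left shift of at most 3.)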
 2251 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2252   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2253       shift->in(2)->get_int() <= 3 &&
 2254       // Are there other uses besides address expressions?
 2255       !matcher->is_visited(shift)) {
 2256     address_visited.set(shift->_idx); // Flag as address_visited
 2257     mstack.push(shift->in(2), Matcher::Visit);
 2258     Node *conv = shift->in(1);
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
 2262     if (conv->Opcode() == Op_ConvI2L &&
 2263         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2264         // Are there other uses besides address expressions?
 2265         !matcher->is_visited(conv)) {
 2266       address_visited.set(conv->_idx); // Flag as address_visited
 2267       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2268     } else {
 2269       mstack.push(conv, Matcher::Pre_Visit);
 2270     }
 2271     return true;
 2272   }
 2273   return false;
 2274 }
 2275 
// The code below identifies sub-graphs in which a 'load' node is
// an input to two different nodes, such that the pattern can be matched
// with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2282 //
 2283 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2284 // This is a temporary solution until we make DAGs expressible in ADL.
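// For example, is_bmi_pattern() below uses this matcher to recognize
//   x & -x      == (AndI (SubI 0 x) x)   -> blsi   (isolate lowest set bit)
//   x & (x - 1) == (AndI (AddI x -1) x)  -> blsr   (reset lowest set bit)
//   x ^ (x - 1) == (XorI (AddI x -1) x)  -> blsmsk (mask up to lowest set bit)
// where x is the shared load node (and likewise for the long variants).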
 2285 template<typename ConType>
 2286 class FusedPatternMatcher {
 2287   Node* _op1_node;
 2288   Node* _mop_node;
 2289   int _con_op;
 2290 
 2291   static int match_next(Node* n, int next_op, int next_op_idx) {
 2292     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2293       return -1;
 2294     }
 2295 
 2296     if (next_op_idx == -1) { // n is commutative, try rotations
 2297       if (n->in(1)->Opcode() == next_op) {
 2298         return 1;
 2299       } else if (n->in(2)->Opcode() == next_op) {
 2300         return 2;
 2301       }
 2302     } else {
 2303       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2304       if (n->in(next_op_idx)->Opcode() == next_op) {
 2305         return next_op_idx;
 2306       }
 2307     }
 2308     return -1;
 2309   }
 2310 
 2311  public:
 2312   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2313     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2314 
 2315   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2316              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2317              typename ConType::NativeType con_value) {
 2318     if (_op1_node->Opcode() != op1) {
 2319       return false;
 2320     }
 2321     if (_mop_node->outcnt() > 2) {
 2322       return false;
 2323     }
 2324     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2325     if (op1_op2_idx == -1) {
 2326       return false;
 2327     }
 2328     // Memory operation must be the other edge
 2329     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2330 
 2331     // Check that the mop node is really what we want
 2332     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2333       Node* op2_node = _op1_node->in(op1_op2_idx);
 2334       if (op2_node->outcnt() > 1) {
 2335         return false;
 2336       }
 2337       assert(op2_node->Opcode() == op2, "Should be");
 2338       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2339       if (op2_con_idx == -1) {
 2340         return false;
 2341       }
 2342       // Memory operation must be the other edge
 2343       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2344       // Check that the memory operation is the same node
 2345       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2346         // Now check the constant
 2347         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2348         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2349           return true;
 2350         }
 2351       }
 2352     }
 2353     return false;
 2354   }
 2355 };
 2356 
 2357 static bool is_bmi_pattern(Node* n, Node* m) {
 2358   assert(UseBMI1Instructions, "sanity");
 2359   if (n != nullptr && m != nullptr) {
 2360     if (m->Opcode() == Op_LoadI) {
 2361       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2362       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2363              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2364              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2365     } else if (m->Opcode() == Op_LoadL) {
 2366       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2367       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2368              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2369              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2370     }
 2371   }
 2372   return false;
 2373 }
 2374 
 2375 // Should the matcher clone input 'm' of node 'n'?
 2376 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2377   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2378   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2379     mstack.push(m, Visit);
 2380     return true;
 2381   }
 2382   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2383     mstack.push(m, Visit);           // m = ShiftCntV
 2384     return true;
 2385   }
 2386   if (is_encode_and_store_pattern(n, m)) {
 2387     mstack.push(m, Visit);
 2388     return true;
 2389   }
 2390   return false;
 2391 }
 2392 
 2393 // Should the Matcher clone shifts on addressing modes, expecting them
 2394 // to be subsumed into complex addressing expressions or compute them
 2395 // into registers?
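// Cloning the shift lets an expression like base + (index << 2) + disp collapse
// into a single [base + index*4 + disp] memory operand instead of separate
// shift/add instructions.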
 2396 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2397   Node *off = m->in(AddPNode::Offset);
 2398   if (off->is_Con()) {
 2399     address_visited.test_set(m->_idx); // Flag as address_visited
 2400     Node *adr = m->in(AddPNode::Address);
 2401 
 2402     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2403     // AtomicAdd is not an addressing expression.
 2404     // Cheap to find it by looking for screwy base.
 2405     if (adr->is_AddP() &&
 2406         !adr->in(AddPNode::Base)->is_top() &&
 2407         !adr->in(AddPNode::Offset)->is_Con() &&
 2408         off->get_long() == (int) (off->get_long()) && // immL32
 2409         // Are there other uses besides address expressions?
 2410         !is_visited(adr)) {
 2411       address_visited.set(adr->_idx); // Flag as address_visited
 2412       Node *shift = adr->in(AddPNode::Offset);
 2413       if (!clone_shift(shift, this, mstack, address_visited)) {
 2414         mstack.push(shift, Pre_Visit);
 2415       }
 2416       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2417       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2418     } else {
 2419       mstack.push(adr, Pre_Visit);
 2420     }
 2421 
 2422     // Clone X+offset as it also folds into most addressing expressions
 2423     mstack.push(off, Visit);
 2424     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2425     return true;
 2426   } else if (clone_shift(off, this, mstack, address_visited)) {
 2427     address_visited.test_set(m->_idx); // Flag as address_visited
 2428     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2429     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2430     return true;
 2431   }
 2432   return false;
 2433 }
 2434 
 2435 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2436   switch (bt) {
 2437     case BoolTest::eq:
 2438       return Assembler::eq;
 2439     case BoolTest::ne:
 2440       return Assembler::neq;
 2441     case BoolTest::le:
 2442     case BoolTest::ule:
 2443       return Assembler::le;
 2444     case BoolTest::ge:
 2445     case BoolTest::uge:
 2446       return Assembler::nlt;
 2447     case BoolTest::lt:
 2448     case BoolTest::ult:
 2449       return Assembler::lt;
 2450     case BoolTest::gt:
 2451     case BoolTest::ugt:
 2452       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2454   }
 2455 }
 2456 
 2457 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2458   switch (bt) {
 2459   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2460   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2461   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2462   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2463   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2464   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2465   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2466   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2467   }
 2468 }
 2469 
 2470 // Helper methods for MachSpillCopyNode::implementation().
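// Note: without AVX-512VL, plain 128/256-bit moves cannot reach the upper bank
// of XMM registers (xmm16-31), so those copies go through 512-bit-capable
// extract/insert instructions instead.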
 2471 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2472                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2473   assert(ireg == Op_VecS || // 32bit vector
 2474          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2475           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2476          "no non-adjacent vector moves" );
 2477   if (masm) {
 2478     switch (ireg) {
 2479     case Op_VecS: // copy whole register
 2480     case Op_VecD:
 2481     case Op_VecX:
 2482       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2483         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2484       } else {
 2485         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2486      }
 2487       break;
 2488     case Op_VecY:
 2489       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2490         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2491       } else {
 2492         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2493      }
 2494       break;
 2495     case Op_VecZ:
 2496       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2497       break;
 2498     default:
 2499       ShouldNotReachHere();
 2500     }
 2501 #ifndef PRODUCT
 2502   } else {
 2503     switch (ireg) {
 2504     case Op_VecS:
 2505     case Op_VecD:
 2506     case Op_VecX:
 2507       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2508       break;
 2509     case Op_VecY:
 2510     case Op_VecZ:
 2511       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2512       break;
 2513     default:
 2514       ShouldNotReachHere();
 2515     }
 2516 #endif
 2517   }
 2518 }
 2519 
 2520 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2521                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2522   if (masm) {
 2523     if (is_load) {
 2524       switch (ireg) {
 2525       case Op_VecS:
 2526         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2527         break;
 2528       case Op_VecD:
 2529         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecX:
 2532         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2533           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2534         } else {
 2535           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2537         }
 2538         break;
 2539       case Op_VecY:
 2540         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2541           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2542         } else {
 2543           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2545         }
 2546         break;
 2547       case Op_VecZ:
 2548         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2549         break;
 2550       default:
 2551         ShouldNotReachHere();
 2552       }
 2553     } else { // store
 2554       switch (ireg) {
 2555       case Op_VecS:
 2556         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2557         break;
 2558       case Op_VecD:
 2559         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecX:
 2562         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2563           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2564         }
 2565         else {
 2566           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2567         }
 2568         break;
 2569       case Op_VecY:
 2570         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2571           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2572         }
 2573         else {
 2574           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2575         }
 2576         break;
 2577       case Op_VecZ:
 2578         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2579         break;
 2580       default:
 2581         ShouldNotReachHere();
 2582       }
 2583     }
 2584 #ifndef PRODUCT
 2585   } else {
 2586     if (is_load) {
 2587       switch (ireg) {
 2588       case Op_VecS:
 2589         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2590         break;
 2591       case Op_VecD:
 2592         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
      case Op_VecX:
 2595         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecY:
 2598       case Op_VecZ:
 2599         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2600         break;
 2601       default:
 2602         ShouldNotReachHere();
 2603       }
 2604     } else { // store
 2605       switch (ireg) {
 2606       case Op_VecS:
 2607         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2608         break;
 2609       case Op_VecD:
 2610         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
      case Op_VecX:
 2613         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecY:
 2616       case Op_VecZ:
 2617         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2618         break;
 2619       default:
 2620         ShouldNotReachHere();
 2621       }
 2622     }
 2623 #endif
 2624   }
 2625 }
 2626 
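// Replicate the scalar constant 'con' of element type 'bt' 'len' times into a
// raw byte array holding the elements' native representation (typically used
// to materialize vector constants via the constant table).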
 2627 template <class T>
 2628 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2629   int size = type2aelembytes(bt) * len;
 2630   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2631   for (int i = 0; i < len; i++) {
 2632     int offset = i * type2aelembytes(bt);
 2633     switch (bt) {
 2634       case T_BYTE: val->at(i) = con; break;
 2635       case T_SHORT: {
 2636         jshort c = con;
 2637         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2638         break;
 2639       }
 2640       case T_INT: {
 2641         jint c = con;
 2642         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2643         break;
 2644       }
 2645       case T_LONG: {
 2646         jlong c = con;
 2647         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2648         break;
 2649       }
 2650       case T_FLOAT: {
 2651         jfloat c = con;
 2652         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2653         break;
 2654       }
 2655       case T_DOUBLE: {
 2656         jdouble c = con;
 2657         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2658         break;
 2659       }
 2660       default: assert(false, "%s", type2name(bt));
 2661     }
 2662   }
 2663   return val;
 2664 }
 2665 
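// Return a 64-bit pattern with the sign (high) bit of every 'bt'-sized lane set,
// e.g. 0x80 replicated eight times for T_BYTE, a single bit 63 for T_LONG.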
 2666 static inline jlong high_bit_set(BasicType bt) {
 2667   switch (bt) {
 2668     case T_BYTE:  return 0x8080808080808080;
 2669     case T_SHORT: return 0x8000800080008000;
 2670     case T_INT:   return 0x8000000080000000;
 2671     case T_LONG:  return 0x8000000000000000;
 2672     default:
 2673       ShouldNotReachHere();
 2674       return 0;
 2675   }
 2676 }
 2677 
 2678 #ifndef PRODUCT
 2679   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2680     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2681   }
 2682 #endif
 2683 
 2684   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2685     __ nop(_count);
 2686   }
 2687 
 2688   uint MachNopNode::size(PhaseRegAlloc*) const {
 2689     return _count;
 2690   }
 2691 
 2692 #ifndef PRODUCT
 2693   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2694     st->print("# breakpoint");
 2695   }
 2696 #endif
 2697 
 2698   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2699     __ int3();
 2700   }
 2701 
 2702   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2703     return MachNode::size(ra_);
 2704   }
 2705 
 2706 %}
 2707 
 2708 encode %{
 2709 
 2710   enc_class call_epilog %{
 2711     if (VerifyStackAtCalls) {
 2712       // Check that stack depth is unchanged: find majik cookie on stack
 2713       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2714       Label L;
 2715       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2716       __ jccb(Assembler::equal, L);
 2717       // Die if stack mismatch
 2718       __ int3();
 2719       __ bind(L);
 2720     }
 2721   %}
 2722 
 2723 %}
 2724 
// Operands for bound floating-point register arguments
 2726 operand rxmm0() %{
 2727   constraint(ALLOC_IN_RC(xmm0_reg));
 2728   match(VecX);
 2729   format%{%}
 2730   interface(REG_INTER);
 2731 %}
 2732 
 2733 //----------OPERANDS-----------------------------------------------------------
 2734 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
 2736 // instruction definitions.
 2737 
 2738 // Vectors
 2739 
 2740 // Dummy generic vector class. Should be used for all vector operands.
 2741 // Replaced with vec[SDXYZ] during post-selection pass.
 2742 operand vec() %{
 2743   constraint(ALLOC_IN_RC(dynamic));
 2744   match(VecX);
 2745   match(VecY);
 2746   match(VecZ);
 2747   match(VecS);
 2748   match(VecD);
 2749 
 2750   format %{ %}
 2751   interface(REG_INTER);
 2752 %}
 2753 
 2754 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2755 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2756 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2757 // runtime code generation via reg_class_dynamic.
 2758 operand legVec() %{
 2759   constraint(ALLOC_IN_RC(dynamic));
 2760   match(VecX);
 2761   match(VecY);
 2762   match(VecZ);
 2763   match(VecS);
 2764   match(VecD);
 2765 
 2766   format %{ %}
 2767   interface(REG_INTER);
 2768 %}
 2769 
 2770 // Replaces vec during post-selection cleanup. See above.
 2771 operand vecS() %{
 2772   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2773   match(VecS);
 2774 
 2775   format %{ %}
 2776   interface(REG_INTER);
 2777 %}
 2778 
 2779 // Replaces legVec during post-selection cleanup. See above.
 2780 operand legVecS() %{
 2781   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2782   match(VecS);
 2783 
 2784   format %{ %}
 2785   interface(REG_INTER);
 2786 %}
 2787 
 2788 // Replaces vec during post-selection cleanup. See above.
 2789 operand vecD() %{
 2790   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2791   match(VecD);
 2792 
 2793   format %{ %}
 2794   interface(REG_INTER);
 2795 %}
 2796 
 2797 // Replaces legVec during post-selection cleanup. See above.
 2798 operand legVecD() %{
 2799   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2800   match(VecD);
 2801 
 2802   format %{ %}
 2803   interface(REG_INTER);
 2804 %}
 2805 
 2806 // Replaces vec during post-selection cleanup. See above.
 2807 operand vecX() %{
 2808   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2809   match(VecX);
 2810 
 2811   format %{ %}
 2812   interface(REG_INTER);
 2813 %}
 2814 
 2815 // Replaces legVec during post-selection cleanup. See above.
 2816 operand legVecX() %{
 2817   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2818   match(VecX);
 2819 
 2820   format %{ %}
 2821   interface(REG_INTER);
 2822 %}
 2823 
 2824 // Replaces vec during post-selection cleanup. See above.
 2825 operand vecY() %{
 2826   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2827   match(VecY);
 2828 
 2829   format %{ %}
 2830   interface(REG_INTER);
 2831 %}
 2832 
 2833 // Replaces legVec during post-selection cleanup. See above.
 2834 operand legVecY() %{
 2835   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2836   match(VecY);
 2837 
 2838   format %{ %}
 2839   interface(REG_INTER);
 2840 %}
 2841 
 2842 // Replaces vec during post-selection cleanup. See above.
 2843 operand vecZ() %{
 2844   constraint(ALLOC_IN_RC(vectorz_reg));
 2845   match(VecZ);
 2846 
 2847   format %{ %}
 2848   interface(REG_INTER);
 2849 %}
 2850 
 2851 // Replaces legVec during post-selection cleanup. See above.
 2852 operand legVecZ() %{
 2853   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2854   match(VecZ);
 2855 
 2856   format %{ %}
 2857   interface(REG_INTER);
 2858 %}
 2859 
 2860 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2861 
 2862 // ============================================================================
 2863 
 2864 instruct ShouldNotReachHere() %{
 2865   match(Halt);
 2866   format %{ "stop\t# ShouldNotReachHere" %}
 2867   ins_encode %{
 2868     if (is_reachable()) {
 2869       const char* str = __ code_string(_halt_reason);
 2870       __ stop(str);
 2871     }
 2872   %}
 2873   ins_pipe(pipe_slow);
 2874 %}
 2875 
 2876 // ============================================================================
 2877 
 2878 instruct addF_reg(regF dst, regF src) %{
 2879   predicate(UseAVX == 0);
 2880   match(Set dst (AddF dst src));
 2881 
 2882   format %{ "addss   $dst, $src" %}
 2883   ins_cost(150);
 2884   ins_encode %{
 2885     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2886   %}
 2887   ins_pipe(pipe_slow);
 2888 %}
 2889 
 2890 instruct addF_mem(regF dst, memory src) %{
 2891   predicate(UseAVX == 0);
 2892   match(Set dst (AddF dst (LoadF src)));
 2893 
 2894   format %{ "addss   $dst, $src" %}
 2895   ins_cost(150);
 2896   ins_encode %{
 2897     __ addss($dst$$XMMRegister, $src$$Address);
 2898   %}
 2899   ins_pipe(pipe_slow);
 2900 %}
 2901 
 2902 instruct addF_imm(regF dst, immF con) %{
 2903   predicate(UseAVX == 0);
 2904   match(Set dst (AddF dst con));
 2905   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2906   ins_cost(150);
 2907   ins_encode %{
 2908     __ addss($dst$$XMMRegister, $constantaddress($con));
 2909   %}
 2910   ins_pipe(pipe_slow);
 2911 %}
 2912 
 2913 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2914   predicate(UseAVX > 0);
 2915   match(Set dst (AddF src1 src2));
 2916 
 2917   format %{ "vaddss  $dst, $src1, $src2" %}
 2918   ins_cost(150);
 2919   ins_encode %{
 2920     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2921   %}
 2922   ins_pipe(pipe_slow);
 2923 %}
 2924 
 2925 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2926   predicate(UseAVX > 0);
 2927   match(Set dst (AddF src1 (LoadF src2)));
 2928 
 2929   format %{ "vaddss  $dst, $src1, $src2" %}
 2930   ins_cost(150);
 2931   ins_encode %{
 2932     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2933   %}
 2934   ins_pipe(pipe_slow);
 2935 %}
 2936 
 2937 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2938   predicate(UseAVX > 0);
 2939   match(Set dst (AddF src con));
 2940 
 2941   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2942   ins_cost(150);
 2943   ins_encode %{
 2944     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2945   %}
 2946   ins_pipe(pipe_slow);
 2947 %}
 2948 
 2949 instruct addD_reg(regD dst, regD src) %{
 2950   predicate(UseAVX == 0);
 2951   match(Set dst (AddD dst src));
 2952 
 2953   format %{ "addsd   $dst, $src" %}
 2954   ins_cost(150);
 2955   ins_encode %{
 2956     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2957   %}
 2958   ins_pipe(pipe_slow);
 2959 %}
 2960 
 2961 instruct addD_mem(regD dst, memory src) %{
 2962   predicate(UseAVX == 0);
 2963   match(Set dst (AddD dst (LoadD src)));
 2964 
 2965   format %{ "addsd   $dst, $src" %}
 2966   ins_cost(150);
 2967   ins_encode %{
 2968     __ addsd($dst$$XMMRegister, $src$$Address);
 2969   %}
 2970   ins_pipe(pipe_slow);
 2971 %}
 2972 
 2973 instruct addD_imm(regD dst, immD con) %{
 2974   predicate(UseAVX == 0);
 2975   match(Set dst (AddD dst con));
 2976   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 2977   ins_cost(150);
 2978   ins_encode %{
 2979     __ addsd($dst$$XMMRegister, $constantaddress($con));
 2980   %}
 2981   ins_pipe(pipe_slow);
 2982 %}
 2983 
 2984 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 2985   predicate(UseAVX > 0);
 2986   match(Set dst (AddD src1 src2));
 2987 
 2988   format %{ "vaddsd  $dst, $src1, $src2" %}
 2989   ins_cost(150);
 2990   ins_encode %{
 2991     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2992   %}
 2993   ins_pipe(pipe_slow);
 2994 %}
 2995 
 2996 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 2997   predicate(UseAVX > 0);
 2998   match(Set dst (AddD src1 (LoadD src2)));
 2999 
 3000   format %{ "vaddsd  $dst, $src1, $src2" %}
 3001   ins_cost(150);
 3002   ins_encode %{
 3003     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3004   %}
 3005   ins_pipe(pipe_slow);
 3006 %}
 3007 
 3008 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3009   predicate(UseAVX > 0);
 3010   match(Set dst (AddD src con));
 3011 
 3012   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3013   ins_cost(150);
 3014   ins_encode %{
 3015     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3016   %}
 3017   ins_pipe(pipe_slow);
 3018 %}
 3019 
 3020 instruct subF_reg(regF dst, regF src) %{
 3021   predicate(UseAVX == 0);
 3022   match(Set dst (SubF dst src));
 3023 
 3024   format %{ "subss   $dst, $src" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct subF_mem(regF dst, memory src) %{
 3033   predicate(UseAVX == 0);
 3034   match(Set dst (SubF dst (LoadF src)));
 3035 
 3036   format %{ "subss   $dst, $src" %}
 3037   ins_cost(150);
 3038   ins_encode %{
 3039     __ subss($dst$$XMMRegister, $src$$Address);
 3040   %}
 3041   ins_pipe(pipe_slow);
 3042 %}
 3043 
 3044 instruct subF_imm(regF dst, immF con) %{
 3045   predicate(UseAVX == 0);
 3046   match(Set dst (SubF dst con));
 3047   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3048   ins_cost(150);
 3049   ins_encode %{
 3050     __ subss($dst$$XMMRegister, $constantaddress($con));
 3051   %}
 3052   ins_pipe(pipe_slow);
 3053 %}
 3054 
 3055 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3056   predicate(UseAVX > 0);
 3057   match(Set dst (SubF src1 src2));
 3058 
 3059   format %{ "vsubss  $dst, $src1, $src2" %}
 3060   ins_cost(150);
 3061   ins_encode %{
 3062     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3063   %}
 3064   ins_pipe(pipe_slow);
 3065 %}
 3066 
 3067 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3068   predicate(UseAVX > 0);
 3069   match(Set dst (SubF src1 (LoadF src2)));
 3070 
 3071   format %{ "vsubss  $dst, $src1, $src2" %}
 3072   ins_cost(150);
 3073   ins_encode %{
 3074     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3075   %}
 3076   ins_pipe(pipe_slow);
 3077 %}
 3078 
 3079 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3080   predicate(UseAVX > 0);
 3081   match(Set dst (SubF src con));
 3082 
 3083   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3084   ins_cost(150);
 3085   ins_encode %{
 3086     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3087   %}
 3088   ins_pipe(pipe_slow);
 3089 %}
 3090 
 3091 instruct subD_reg(regD dst, regD src) %{
 3092   predicate(UseAVX == 0);
 3093   match(Set dst (SubD dst src));
 3094 
 3095   format %{ "subsd   $dst, $src" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct subD_mem(regD dst, memory src) %{
 3104   predicate(UseAVX == 0);
 3105   match(Set dst (SubD dst (LoadD src)));
 3106 
 3107   format %{ "subsd   $dst, $src" %}
 3108   ins_cost(150);
 3109   ins_encode %{
 3110     __ subsd($dst$$XMMRegister, $src$$Address);
 3111   %}
 3112   ins_pipe(pipe_slow);
 3113 %}
 3114 
 3115 instruct subD_imm(regD dst, immD con) %{
 3116   predicate(UseAVX == 0);
 3117   match(Set dst (SubD dst con));
 3118   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3119   ins_cost(150);
 3120   ins_encode %{
 3121     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3122   %}
 3123   ins_pipe(pipe_slow);
 3124 %}
 3125 
 3126 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3127   predicate(UseAVX > 0);
 3128   match(Set dst (SubD src1 src2));
 3129 
 3130   format %{ "vsubsd  $dst, $src1, $src2" %}
 3131   ins_cost(150);
 3132   ins_encode %{
 3133     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3134   %}
 3135   ins_pipe(pipe_slow);
 3136 %}
 3137 
 3138 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3139   predicate(UseAVX > 0);
 3140   match(Set dst (SubD src1 (LoadD src2)));
 3141 
 3142   format %{ "vsubsd  $dst, $src1, $src2" %}
 3143   ins_cost(150);
 3144   ins_encode %{
 3145     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3146   %}
 3147   ins_pipe(pipe_slow);
 3148 %}
 3149 
 3150 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3151   predicate(UseAVX > 0);
 3152   match(Set dst (SubD src con));
 3153 
 3154   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3155   ins_cost(150);
 3156   ins_encode %{
 3157     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3158   %}
 3159   ins_pipe(pipe_slow);
 3160 %}
 3161 
 3162 instruct mulF_reg(regF dst, regF src) %{
 3163   predicate(UseAVX == 0);
 3164   match(Set dst (MulF dst src));
 3165 
 3166   format %{ "mulss   $dst, $src" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct mulF_mem(regF dst, memory src) %{
 3175   predicate(UseAVX == 0);
 3176   match(Set dst (MulF dst (LoadF src)));
 3177 
 3178   format %{ "mulss   $dst, $src" %}
 3179   ins_cost(150);
 3180   ins_encode %{
 3181     __ mulss($dst$$XMMRegister, $src$$Address);
 3182   %}
 3183   ins_pipe(pipe_slow);
 3184 %}
 3185 
 3186 instruct mulF_imm(regF dst, immF con) %{
 3187   predicate(UseAVX == 0);
 3188   match(Set dst (MulF dst con));
 3189   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3190   ins_cost(150);
 3191   ins_encode %{
 3192     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3193   %}
 3194   ins_pipe(pipe_slow);
 3195 %}
 3196 
 3197 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3198   predicate(UseAVX > 0);
 3199   match(Set dst (MulF src1 src2));
 3200 
 3201   format %{ "vmulss  $dst, $src1, $src2" %}
 3202   ins_cost(150);
 3203   ins_encode %{
 3204     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3205   %}
 3206   ins_pipe(pipe_slow);
 3207 %}
 3208 
 3209 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3210   predicate(UseAVX > 0);
 3211   match(Set dst (MulF src1 (LoadF src2)));
 3212 
 3213   format %{ "vmulss  $dst, $src1, $src2" %}
 3214   ins_cost(150);
 3215   ins_encode %{
 3216     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3217   %}
 3218   ins_pipe(pipe_slow);
 3219 %}
 3220 
 3221 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3222   predicate(UseAVX > 0);
 3223   match(Set dst (MulF src con));
 3224 
 3225   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3226   ins_cost(150);
 3227   ins_encode %{
 3228     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3229   %}
 3230   ins_pipe(pipe_slow);
 3231 %}
 3232 
 3233 instruct mulD_reg(regD dst, regD src) %{
 3234   predicate(UseAVX == 0);
 3235   match(Set dst (MulD dst src));
 3236 
 3237   format %{ "mulsd   $dst, $src" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct mulD_mem(regD dst, memory src) %{
 3246   predicate(UseAVX == 0);
 3247   match(Set dst (MulD dst (LoadD src)));
 3248 
 3249   format %{ "mulsd   $dst, $src" %}
 3250   ins_cost(150);
 3251   ins_encode %{
 3252     __ mulsd($dst$$XMMRegister, $src$$Address);
 3253   %}
 3254   ins_pipe(pipe_slow);
 3255 %}
 3256 
 3257 instruct mulD_imm(regD dst, immD con) %{
 3258   predicate(UseAVX == 0);
 3259   match(Set dst (MulD dst con));
 3260   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3261   ins_cost(150);
 3262   ins_encode %{
 3263     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3264   %}
 3265   ins_pipe(pipe_slow);
 3266 %}
 3267 
 3268 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3269   predicate(UseAVX > 0);
 3270   match(Set dst (MulD src1 src2));
 3271 
 3272   format %{ "vmulsd  $dst, $src1, $src2" %}
 3273   ins_cost(150);
 3274   ins_encode %{
 3275     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3276   %}
 3277   ins_pipe(pipe_slow);
 3278 %}
 3279 
 3280 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3281   predicate(UseAVX > 0);
 3282   match(Set dst (MulD src1 (LoadD src2)));
 3283 
 3284   format %{ "vmulsd  $dst, $src1, $src2" %}
 3285   ins_cost(150);
 3286   ins_encode %{
 3287     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3288   %}
 3289   ins_pipe(pipe_slow);
 3290 %}
 3291 
 3292 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3293   predicate(UseAVX > 0);
 3294   match(Set dst (MulD src con));
 3295 
 3296   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3297   ins_cost(150);
 3298   ins_encode %{
 3299     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3300   %}
 3301   ins_pipe(pipe_slow);
 3302 %}
 3303 
 3304 instruct divF_reg(regF dst, regF src) %{
 3305   predicate(UseAVX == 0);
 3306   match(Set dst (DivF dst src));
 3307 
 3308   format %{ "divss   $dst, $src" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct divF_mem(regF dst, memory src) %{
 3317   predicate(UseAVX == 0);
 3318   match(Set dst (DivF dst (LoadF src)));
 3319 
 3320   format %{ "divss   $dst, $src" %}
 3321   ins_cost(150);
 3322   ins_encode %{
 3323     __ divss($dst$$XMMRegister, $src$$Address);
 3324   %}
 3325   ins_pipe(pipe_slow);
 3326 %}
 3327 
 3328 instruct divF_imm(regF dst, immF con) %{
 3329   predicate(UseAVX == 0);
 3330   match(Set dst (DivF dst con));
 3331   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3332   ins_cost(150);
 3333   ins_encode %{
 3334     __ divss($dst$$XMMRegister, $constantaddress($con));
 3335   %}
 3336   ins_pipe(pipe_slow);
 3337 %}
 3338 
 3339 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3340   predicate(UseAVX > 0);
 3341   match(Set dst (DivF src1 src2));
 3342 
 3343   format %{ "vdivss  $dst, $src1, $src2" %}
 3344   ins_cost(150);
 3345   ins_encode %{
 3346     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3347   %}
 3348   ins_pipe(pipe_slow);
 3349 %}
 3350 
 3351 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3352   predicate(UseAVX > 0);
 3353   match(Set dst (DivF src1 (LoadF src2)));
 3354 
 3355   format %{ "vdivss  $dst, $src1, $src2" %}
 3356   ins_cost(150);
 3357   ins_encode %{
 3358     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3359   %}
 3360   ins_pipe(pipe_slow);
 3361 %}
 3362 
 3363 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3364   predicate(UseAVX > 0);
 3365   match(Set dst (DivF src con));
 3366 
 3367   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3368   ins_cost(150);
 3369   ins_encode %{
 3370     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3371   %}
 3372   ins_pipe(pipe_slow);
 3373 %}
 3374 
 3375 instruct divD_reg(regD dst, regD src) %{
 3376   predicate(UseAVX == 0);
 3377   match(Set dst (DivD dst src));
 3378 
 3379   format %{ "divsd   $dst, $src" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct divD_mem(regD dst, memory src) %{
 3388   predicate(UseAVX == 0);
 3389   match(Set dst (DivD dst (LoadD src)));
 3390 
 3391   format %{ "divsd   $dst, $src" %}
 3392   ins_cost(150);
 3393   ins_encode %{
 3394     __ divsd($dst$$XMMRegister, $src$$Address);
 3395   %}
 3396   ins_pipe(pipe_slow);
 3397 %}
 3398 
 3399 instruct divD_imm(regD dst, immD con) %{
 3400   predicate(UseAVX == 0);
 3401   match(Set dst (DivD dst con));
 3402   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3403   ins_cost(150);
 3404   ins_encode %{
 3405     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3406   %}
 3407   ins_pipe(pipe_slow);
 3408 %}
 3409 
 3410 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3411   predicate(UseAVX > 0);
 3412   match(Set dst (DivD src1 src2));
 3413 
 3414   format %{ "vdivsd  $dst, $src1, $src2" %}
 3415   ins_cost(150);
 3416   ins_encode %{
 3417     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3418   %}
 3419   ins_pipe(pipe_slow);
 3420 %}
 3421 
 3422 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3423   predicate(UseAVX > 0);
 3424   match(Set dst (DivD src1 (LoadD src2)));
 3425 
 3426   format %{ "vdivsd  $dst, $src1, $src2" %}
 3427   ins_cost(150);
 3428   ins_encode %{
 3429     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3430   %}
 3431   ins_pipe(pipe_slow);
 3432 %}
 3433 
 3434 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3435   predicate(UseAVX > 0);
 3436   match(Set dst (DivD src con));
 3437 
 3438   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3439   ins_cost(150);
 3440   ins_encode %{
 3441     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3442   %}
 3443   ins_pipe(pipe_slow);
 3444 %}
 3445 
 3446 instruct absF_reg(regF dst) %{
 3447   predicate(UseAVX == 0);
 3448   match(Set dst (AbsF dst));
 3449   ins_cost(150);
 3450   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3451   ins_encode %{
 3452     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3453   %}
 3454   ins_pipe(pipe_slow);
 3455 %}
 3456 
 3457 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3458   predicate(UseAVX > 0);
 3459   match(Set dst (AbsF src));
 3460   ins_cost(150);
 3461   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3462   ins_encode %{
 3463     int vlen_enc = Assembler::AVX_128bit;
 3464     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3465               ExternalAddress(float_signmask()), vlen_enc);
 3466   %}
 3467   ins_pipe(pipe_slow);
 3468 %}
 3469 
 3470 instruct absD_reg(regD dst) %{
 3471   predicate(UseAVX == 0);
 3472   match(Set dst (AbsD dst));
 3473   ins_cost(150);
 3474   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3475             "# abs double by sign masking" %}
 3476   ins_encode %{
 3477     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3478   %}
 3479   ins_pipe(pipe_slow);
 3480 %}
 3481 
 3482 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3483   predicate(UseAVX > 0);
 3484   match(Set dst (AbsD src));
 3485   ins_cost(150);
 3486   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3487             "# abs double by sign masking" %}
 3488   ins_encode %{
 3489     int vlen_enc = Assembler::AVX_128bit;
 3490     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3491               ExternalAddress(double_signmask()), vlen_enc);
 3492   %}
 3493   ins_pipe(pipe_slow);
 3494 %}
 3495 
 3496 instruct negF_reg(regF dst) %{
 3497   predicate(UseAVX == 0);
 3498   match(Set dst (NegF dst));
 3499   ins_cost(150);
 3500   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3501   ins_encode %{
 3502     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3503   %}
 3504   ins_pipe(pipe_slow);
 3505 %}
 3506 
 3507 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3508   predicate(UseAVX > 0);
 3509   match(Set dst (NegF src));
 3510   ins_cost(150);
 3511   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3512   ins_encode %{
 3513     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3514                  ExternalAddress(float_signflip()));
 3515   %}
 3516   ins_pipe(pipe_slow);
 3517 %}
 3518 
 3519 instruct negD_reg(regD dst) %{
 3520   predicate(UseAVX == 0);
 3521   match(Set dst (NegD dst));
 3522   ins_cost(150);
 3523   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3524             "# neg double by sign flipping" %}
 3525   ins_encode %{
 3526     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3527   %}
 3528   ins_pipe(pipe_slow);
 3529 %}
 3530 
 3531 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3532   predicate(UseAVX > 0);
 3533   match(Set dst (NegD src));
 3534   ins_cost(150);
 3535   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3536             "# neg double by sign flipping" %}
 3537   ins_encode %{
 3538     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3539                  ExternalAddress(double_signflip()));
 3540   %}
 3541   ins_pipe(pipe_slow);
 3542 %}
 3543 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
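// (sqrtss writes only the low 32 bits of dst and merges the rest, so an
// uninitialized dst would add a false dependency on its previous contents;
// the same reasoning applies to sqrtsd below.)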
 3546 instruct sqrtF_reg(regF dst) %{
 3547   match(Set dst (SqrtF dst));
 3548   format %{ "sqrtss  $dst, $dst" %}
 3549   ins_encode %{
 3550     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3551   %}
 3552   ins_pipe(pipe_slow);
 3553 %}
 3554 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3557 instruct sqrtD_reg(regD dst) %{
 3558   match(Set dst (SqrtD dst));
 3559   format %{ "sqrtsd  $dst, $dst" %}
 3560   ins_encode %{
 3561     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3562   %}
 3563   ins_pipe(pipe_slow);
 3564 %}
 3565 
 3566 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3567   effect(TEMP tmp);
 3568   match(Set dst (ConvF2HF src));
 3569   ins_cost(125);
  format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3571   ins_encode %{
 3572     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3573   %}
 3574   ins_pipe( pipe_slow );
 3575 %}
 3576 
 3577 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3578   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3579   effect(TEMP ktmp, TEMP rtmp);
 3580   match(Set mem (StoreC mem (ConvF2HF src)));
 3581   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3582   ins_encode %{
 3583     __ movl($rtmp$$Register, 0x1);
 3584     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3585     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3586   %}
 3587   ins_pipe( pipe_slow );
 3588 %}
 3589 
 3590 instruct vconvF2HF(vec dst, vec src) %{
 3591   match(Set dst (VectorCastF2HF src));
 3592   format %{ "vector_conv_F2HF $dst $src" %}
 3593   ins_encode %{
 3594     int vlen_enc = vector_length_encoding(this, $src);
 3595     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3596   %}
 3597   ins_pipe( pipe_slow );
 3598 %}
 3599 
 3600 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3601   predicate(n->as_StoreVector()->memory_size() >= 16);
 3602   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3603   format %{ "vcvtps2ph $mem,$src" %}
 3604   ins_encode %{
 3605     int vlen_enc = vector_length_encoding(this, $src);
 3606     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3607   %}
 3608   ins_pipe( pipe_slow );
 3609 %}
 3610 
 3611 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3612   match(Set dst (ConvHF2F src));
 3613   format %{ "vcvtph2ps $dst,$src" %}
 3614   ins_encode %{
 3615     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3616   %}
 3617   ins_pipe( pipe_slow );
 3618 %}
 3619 
 3620 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3621   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3622   format %{ "vcvtph2ps $dst,$mem" %}
 3623   ins_encode %{
 3624     int vlen_enc = vector_length_encoding(this);
 3625     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3626   %}
 3627   ins_pipe( pipe_slow );
 3628 %}
 3629 
 3630 instruct vconvHF2F(vec dst, vec src) %{
 3631   match(Set dst (VectorCastHF2F src));
 3632   ins_cost(125);
 3633   format %{ "vector_conv_HF2F $dst,$src" %}
 3634   ins_encode %{
 3635     int vlen_enc = vector_length_encoding(this);
 3636     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3637   %}
 3638   ins_pipe( pipe_slow );
 3639 %}
 3640 
 3641 // ---------------------------------------- VectorReinterpret ------------------------------------
 3642 instruct reinterpret_mask(kReg dst) %{
 3643   predicate(n->bottom_type()->isa_vectmask() &&
 3644             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3645   match(Set dst (VectorReinterpret dst));
 3646   ins_cost(125);
 3647   format %{ "vector_reinterpret $dst\t!" %}
 3648   ins_encode %{
 3649     // empty
 3650   %}
 3651   ins_pipe( pipe_slow );
 3652 %}
 3653 
 3654 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3655   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3656             n->bottom_type()->isa_vectmask() &&
 3657             n->in(1)->bottom_type()->isa_vectmask() &&
 3658             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3660   match(Set dst (VectorReinterpret src));
 3661   effect(TEMP xtmp);
 3662   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3663   ins_encode %{
 3664      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3665      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3666      assert(src_sz == dst_sz , "src and dst size mismatch");
 3667      int vlen_enc = vector_length_encoding(src_sz);
 3668      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3669      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3670   %}
 3671   ins_pipe( pipe_slow );
 3672 %}
 3673 
 3674 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3675   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3676             n->bottom_type()->isa_vectmask() &&
 3677             n->in(1)->bottom_type()->isa_vectmask() &&
 3678             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3679              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3681   match(Set dst (VectorReinterpret src));
 3682   effect(TEMP xtmp);
 3683   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3684   ins_encode %{
 3685      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3686      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3687      assert(src_sz == dst_sz , "src and dst size mismatch");
 3688      int vlen_enc = vector_length_encoding(src_sz);
 3689      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3690      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3691   %}
 3692   ins_pipe( pipe_slow );
 3693 %}
 3694 
 3695 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3696   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3697             n->bottom_type()->isa_vectmask() &&
 3698             n->in(1)->bottom_type()->isa_vectmask() &&
 3699             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3700              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3702   match(Set dst (VectorReinterpret src));
 3703   effect(TEMP xtmp);
 3704   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3705   ins_encode %{
 3706      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3707      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3708      assert(src_sz == dst_sz , "src and dst size mismatch");
 3709      int vlen_enc = vector_length_encoding(src_sz);
 3710      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3711      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3712   %}
 3713   ins_pipe( pipe_slow );
 3714 %}
 3715 
 3716 instruct reinterpret(vec dst) %{
 3717   predicate(!n->bottom_type()->isa_vectmask() &&
 3718             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3719   match(Set dst (VectorReinterpret dst));
 3720   ins_cost(125);
 3721   format %{ "vector_reinterpret $dst\t!" %}
 3722   ins_encode %{
 3723     // empty
 3724   %}
 3725   ins_pipe( pipe_slow );
 3726 %}
 3727 
 3728 instruct reinterpret_expand(vec dst, vec src) %{
 3729   predicate(UseAVX == 0 &&
 3730             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3731   match(Set dst (VectorReinterpret src));
 3732   ins_cost(125);
 3733   effect(TEMP dst);
 3734   format %{ "vector_reinterpret_expand $dst,$src" %}
 3735   ins_encode %{
 3736     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3737     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3738 
 3739     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3740     if (src_vlen_in_bytes == 4) {
 3741       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3742     } else {
 3743       assert(src_vlen_in_bytes == 8, "");
 3744       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3745     }
 3746     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3747   %}
 3748   ins_pipe( pipe_slow );
 3749 %}
 3750 
 3751 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3752   predicate(UseAVX > 0 &&
 3753             !n->bottom_type()->isa_vectmask() &&
 3754             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3755             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3756   match(Set dst (VectorReinterpret src));
 3757   ins_cost(125);
 3758   format %{ "vector_reinterpret_expand $dst,$src" %}
 3759   ins_encode %{
 3760     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3761   %}
 3762   ins_pipe( pipe_slow );
 3763 %}
 3764 
 3765 
 3766 instruct vreinterpret_expand(legVec dst, vec src) %{
 3767   predicate(UseAVX > 0 &&
 3768             !n->bottom_type()->isa_vectmask() &&
 3769             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3770             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3771   match(Set dst (VectorReinterpret src));
 3772   ins_cost(125);
 3773   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3774   ins_encode %{
 3775     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3776       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3777       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3778       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3779       default: ShouldNotReachHere();
 3780     }
 3781   %}
 3782   ins_pipe( pipe_slow );
 3783 %}
 3784 
 3785 instruct reinterpret_shrink(vec dst, legVec src) %{
 3786   predicate(!n->bottom_type()->isa_vectmask() &&
 3787             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3788   match(Set dst (VectorReinterpret src));
 3789   ins_cost(125);
 3790   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3791   ins_encode %{
 3792     switch (Matcher::vector_length_in_bytes(this)) {
 3793       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3794       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3795       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3796       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3797       default: ShouldNotReachHere();
 3798     }
 3799   %}
 3800   ins_pipe( pipe_slow );
 3801 %}
 3802 
 3803 // ----------------------------------------------------------------------------------------------------
 3804 
 3805 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3806   match(Set dst (RoundDoubleMode src rmode));
 3807   format %{ "roundsd $dst,$src" %}
 3808   ins_cost(150);
 3809   ins_encode %{
 3810     assert(UseSSE >= 4, "required");
 3811     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3812       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3813     }
 3814     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3815   %}
 3816   ins_pipe(pipe_slow);
 3817 %}
 3818 
 3819 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3820   match(Set dst (RoundDoubleMode con rmode));
 3821   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3822   ins_cost(150);
 3823   ins_encode %{
 3824     assert(UseSSE >= 4, "required");
 3825     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3826   %}
 3827   ins_pipe(pipe_slow);
 3828 %}
 3829 
 3830 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3831   predicate(Matcher::vector_length(n) < 8);
 3832   match(Set dst (RoundDoubleModeV src rmode));
 3833   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3834   ins_encode %{
 3835     assert(UseAVX > 0, "required");
 3836     int vlen_enc = vector_length_encoding(this);
 3837     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3838   %}
 3839   ins_pipe( pipe_slow );
 3840 %}
 3841 
 3842 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3843   predicate(Matcher::vector_length(n) == 8);
 3844   match(Set dst (RoundDoubleModeV src rmode));
 3845   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3846   ins_encode %{
 3847     assert(UseAVX > 2, "required");
 3848     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3849   %}
 3850   ins_pipe( pipe_slow );
 3851 %}
 3852 
 3853 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3854   predicate(Matcher::vector_length(n) < 8);
 3855   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3856   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3857   ins_encode %{
 3858     assert(UseAVX > 0, "required");
 3859     int vlen_enc = vector_length_encoding(this);
 3860     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3861   %}
 3862   ins_pipe( pipe_slow );
 3863 %}
 3864 
 3865 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3866   predicate(Matcher::vector_length(n) == 8);
 3867   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3868   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3869   ins_encode %{
 3870     assert(UseAVX > 2, "required");
 3871     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3872   %}
 3873   ins_pipe( pipe_slow );
 3874 %}
 3875 
 3876 instruct onspinwait() %{
 3877   match(OnSpinWait);
 3878   ins_cost(200);
 3879 
 3880   format %{
 3881     $$template
 3882     $$emit$$"pause\t! membar_onspinwait"
 3883   %}
 3884   ins_encode %{
 3885     __ pause();
 3886   %}
 3887   ins_pipe(pipe_slow);
 3888 %}
 3889 
 3890 // a * b + c
 3891 instruct fmaD_reg(regD a, regD b, regD c) %{
 3892   match(Set c (FmaD  c (Binary a b)));
 3893   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3894   ins_cost(150);
 3895   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");

 3897     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3898   %}
 3899   ins_pipe( pipe_slow );
 3900 %}
 3901 
 3902 // a * b + c
 3903 instruct fmaF_reg(regF a, regF b, regF c) %{
 3904   match(Set c (FmaF  c (Binary a b)));
 3905   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3906   ins_cost(150);
 3907   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
 3909     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3910   %}
 3911   ins_pipe( pipe_slow );
 3912 %}
 3913 
 3914 // ====================VECTOR INSTRUCTIONS=====================================
 3915 
 3916 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3917 instruct MoveVec2Leg(legVec dst, vec src) %{
 3918   match(Set dst src);
 3919   format %{ "" %}
 3920   ins_encode %{
 3921     ShouldNotReachHere();
 3922   %}
 3923   ins_pipe( fpu_reg_reg );
 3924 %}
 3925 
 3926 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3927   match(Set dst src);
 3928   format %{ "" %}
 3929   ins_encode %{
 3930     ShouldNotReachHere();
 3931   %}
 3932   ins_pipe( fpu_reg_reg );
 3933 %}
 3934 
 3935 // ============================================================================
 3936 
 3937 // Load vectors generic operand pattern
 3938 instruct loadV(vec dst, memory mem) %{
 3939   match(Set dst (LoadVector mem));
 3940   ins_cost(125);
 3941   format %{ "load_vector $dst,$mem" %}
 3942   ins_encode %{
 3943     BasicType bt = Matcher::vector_element_basic_type(this);
 3944     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3945   %}
 3946   ins_pipe( pipe_slow );
 3947 %}
 3948 
 3949 // Store vectors generic operand pattern.
 3950 instruct storeV(memory mem, vec src) %{
 3951   match(Set mem (StoreVector mem src));
 3952   ins_cost(145);
 3953   format %{ "store_vector $mem,$src\n\t" %}
 3954   ins_encode %{
 3955     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3956       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3957       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3958       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3959       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3960       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3961       default: ShouldNotReachHere();
 3962     }
 3963   %}
 3964   ins_pipe( pipe_slow );
 3965 %}
 3966 
 3967 // ---------------------------------------- Gather ------------------------------------
 3968 
 3969 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
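//
// Illustrative only: these gather rules typically back Vector API loads through an index
// map (jdk.incubator.vector), e.g.
//
//   int[] idx = ...;
//   IntVector v = IntVector.fromArray(IntVector.SPECIES_256, a, 0, idx, 0);
//
// which is expected to be intrinsified into a LoadVectorGather / LoadVectorGatherMasked
// node matched below; the exact rule chosen depends on element type, vector size and
// masking.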
 3970 
 3971 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 3972   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 3973             Matcher::vector_length_in_bytes(n) <= 32);
 3974   match(Set dst (LoadVectorGather mem idx));
 3975   effect(TEMP dst, TEMP tmp, TEMP mask);
 3976   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 3977   ins_encode %{
 3978     int vlen_enc = vector_length_encoding(this);
 3979     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3980     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 3981     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3982     __ lea($tmp$$Register, $mem$$Address);
 3983     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3984   %}
 3985   ins_pipe( pipe_slow );
 3986 %}
 3987 
 3988 
 3989 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 3990   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 3991             !is_subword_type(Matcher::vector_element_basic_type(n)));
 3992   match(Set dst (LoadVectorGather mem idx));
 3993   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 3994   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 3995   ins_encode %{
 3996     int vlen_enc = vector_length_encoding(this);
 3997     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3998     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 3999     __ lea($tmp$$Register, $mem$$Address);
 4000     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4001   %}
 4002   ins_pipe( pipe_slow );
 4003 %}
 4004 
 4005 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4006   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4007             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4008   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4009   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4010   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4011   ins_encode %{
 4012     assert(UseAVX > 2, "sanity");
 4013     int vlen_enc = vector_length_encoding(this);
 4014     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4015     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is first copied into a temporary mask register.
 4018     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4019     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4020     __ lea($tmp$$Register, $mem$$Address);
 4021     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4022   %}
 4023   ins_pipe( pipe_slow );
 4024 %}
 4025 
 4026 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4027   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4028   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4029   effect(TEMP tmp, TEMP rtmp);
 4030   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4031   ins_encode %{
 4032     int vlen_enc = vector_length_encoding(this);
 4033     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4034     __ lea($tmp$$Register, $mem$$Address);
 4035     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4036   %}
 4037   ins_pipe( pipe_slow );
 4038 %}
 4039 
 4040 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4041                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4042   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4043   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4044   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4045   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4046   ins_encode %{
 4047     int vlen_enc = vector_length_encoding(this);
 4048     int vector_len = Matcher::vector_length(this);
 4049     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4050     __ lea($tmp$$Register, $mem$$Address);
 4051     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4052     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4053                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4054   %}
 4055   ins_pipe( pipe_slow );
 4056 %}
 4057 
 4058 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4059   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4060   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4061   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4062   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4063   ins_encode %{
 4064     int vlen_enc = vector_length_encoding(this);
 4065     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4066     __ lea($tmp$$Register, $mem$$Address);
 4067     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4068   %}
 4069   ins_pipe( pipe_slow );
 4070 %}
 4071 
 4072 
 4073 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4074                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4075   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4076   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4077   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4078   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4079   ins_encode %{
 4080     int vlen_enc = vector_length_encoding(this);
 4081     int vector_len = Matcher::vector_length(this);
 4082     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4083     __ lea($tmp$$Register, $mem$$Address);
 4084     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4085     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4086                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4087   %}
 4088   ins_pipe( pipe_slow );
 4089 %}
 4090 
 4091 
 4092 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4093   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4094   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4095   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4096   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4097   ins_encode %{
 4098     int vlen_enc = vector_length_encoding(this);
 4099     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4100     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4101     __ lea($tmp$$Register, $mem$$Address);
 4102     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4103     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4104   %}
 4105   ins_pipe( pipe_slow );
 4106 %}
 4107 
 4108 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4109                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4110   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4111   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4112   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4113   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4114   ins_encode %{
 4115     int vlen_enc = vector_length_encoding(this);
 4116     int vector_len = Matcher::vector_length(this);
 4117     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4118     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4119     __ lea($tmp$$Register, $mem$$Address);
 4120     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4121     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4122     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4123                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4124   %}
 4125   ins_pipe( pipe_slow );
 4126 %}
 4127 
 4128 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4129   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4130   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4131   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4132   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4133   ins_encode %{
 4134     int vlen_enc = vector_length_encoding(this);
 4135     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4136     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4137     __ lea($tmp$$Register, $mem$$Address);
 4138     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4139     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4140                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4141   %}
 4142   ins_pipe( pipe_slow );
 4143 %}
 4144 
 4145 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4146                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4147   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4148   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4149   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4150   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4151   ins_encode %{
 4152     int vlen_enc = vector_length_encoding(this);
 4153     int vector_len = Matcher::vector_length(this);
 4154     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4155     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4156     __ lea($tmp$$Register, $mem$$Address);
 4157     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4158     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4159     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4160                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4161   %}
 4162   ins_pipe( pipe_slow );
 4163 %}
 4164 
 4165 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4166   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4167   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4168   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4169   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4170   ins_encode %{
 4171     int vlen_enc = vector_length_encoding(this);
 4172     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4173     __ lea($tmp$$Register, $mem$$Address);
 4174     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4175     if (elem_bt == T_SHORT) {
 4176       __ movl($mask_idx$$Register, 0x55555555);
 4177       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4178     }
 4179     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4180     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4181   %}
 4182   ins_pipe( pipe_slow );
 4183 %}
 4184 
 4185 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4186                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4187   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4188   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4189   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4190   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4191   ins_encode %{
 4192     int vlen_enc = vector_length_encoding(this);
 4193     int vector_len = Matcher::vector_length(this);
 4194     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4195     __ lea($tmp$$Register, $mem$$Address);
 4196     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4197     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4198     if (elem_bt == T_SHORT) {
 4199       __ movl($mask_idx$$Register, 0x55555555);
 4200       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4201     }
 4202     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4203     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4204                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4205   %}
 4206   ins_pipe( pipe_slow );
 4207 %}
 4208 
 4209 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4210   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4211   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4212   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4213   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4214   ins_encode %{
 4215     int vlen_enc = vector_length_encoding(this);
 4216     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4217     __ lea($tmp$$Register, $mem$$Address);
 4218     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4219     if (elem_bt == T_SHORT) {
 4220       __ movl($mask_idx$$Register, 0x55555555);
 4221       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4222     }
 4223     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4224     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4225                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4226   %}
 4227   ins_pipe( pipe_slow );
 4228 %}
 4229 
 4230 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4231                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4232   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4233   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4234   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4235   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4236   ins_encode %{
 4237     int vlen_enc = vector_length_encoding(this);
 4238     int vector_len = Matcher::vector_length(this);
 4239     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4240     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4241     __ lea($tmp$$Register, $mem$$Address);
 4242     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4243     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4244     if (elem_bt == T_SHORT) {
 4245       __ movl($mask_idx$$Register, 0x55555555);
 4246       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4247     }
 4248     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4249     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4250                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4251   %}
 4252   ins_pipe( pipe_slow );
 4253 %}
 4254 
 4255 // ====================Scatter=======================================
 4256 
 4257 // Scatter INT, LONG, FLOAT, DOUBLE
 4258 
 4259 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4260   predicate(UseAVX > 2);
 4261   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4262   effect(TEMP tmp, TEMP ktmp);
 4263   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
 4264   ins_encode %{
 4265     int vlen_enc = vector_length_encoding(this, $src);
 4266     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4267 
 4268     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4269     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4270 
 4271     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4272     __ lea($tmp$$Register, $mem$$Address);
 4273     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4274   %}
 4275   ins_pipe( pipe_slow );
 4276 %}
 4277 
 4278 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4279   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4280   effect(TEMP tmp, TEMP ktmp);
 4281   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4282   ins_encode %{
 4283     int vlen_enc = vector_length_encoding(this, $src);
 4284     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4285     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4286     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is first copied into a temporary mask register.
 4289     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4290     __ lea($tmp$$Register, $mem$$Address);
 4291     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4292   %}
 4293   ins_pipe( pipe_slow );
 4294 %}
 4295 
 4296 // ====================REPLICATE=======================================
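
// Illustrative only: Replicate nodes splat (broadcast) one scalar into every lane.  They
// arise both from auto-vectorized loops that use a loop-invariant scalar and from Vector
// API broadcasts, e.g. IntVector.broadcast(IntVector.SPECIES_256, x), with
// IntVector.zero(species) covering the all-zero case handled by the *_zero rules below.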
 4297 
 4298 // Replicate byte scalar to be vector
 4299 instruct vReplB_reg(vec dst, rRegI src) %{
 4300   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4301   match(Set dst (Replicate src));
 4302   format %{ "replicateB $dst,$src" %}
 4303   ins_encode %{
 4304     uint vlen = Matcher::vector_length(this);
 4305     if (UseAVX >= 2) {
 4306       int vlen_enc = vector_length_encoding(this);
 4307       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4308         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4309         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4310       } else {
 4311         __ movdl($dst$$XMMRegister, $src$$Register);
 4312         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4313       }
 4314     } else {
      assert(UseAVX < 2, "");
 4316       __ movdl($dst$$XMMRegister, $src$$Register);
 4317       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4318       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4319       if (vlen >= 16) {
 4320         assert(vlen == 16, "");
 4321         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4322       }
 4323     }
 4324   %}
 4325   ins_pipe( pipe_slow );
 4326 %}
 4327 
 4328 instruct ReplB_mem(vec dst, memory mem) %{
 4329   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4330   match(Set dst (Replicate (LoadB mem)));
 4331   format %{ "replicateB $dst,$mem" %}
 4332   ins_encode %{
 4333     int vlen_enc = vector_length_encoding(this);
 4334     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4335   %}
 4336   ins_pipe( pipe_slow );
 4337 %}
 4338 
 4339 // ====================ReplicateS=======================================
 4340 
 4341 instruct vReplS_reg(vec dst, rRegI src) %{
 4342   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4343   match(Set dst (Replicate src));
 4344   format %{ "replicateS $dst,$src" %}
 4345   ins_encode %{
 4346     uint vlen = Matcher::vector_length(this);
 4347     int vlen_enc = vector_length_encoding(this);
 4348     if (UseAVX >= 2) {
 4349       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4350         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4351         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4352       } else {
 4353         __ movdl($dst$$XMMRegister, $src$$Register);
 4354         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4355       }
 4356     } else {
 4357       assert(UseAVX < 2, "");
 4358       __ movdl($dst$$XMMRegister, $src$$Register);
 4359       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4360       if (vlen >= 8) {
 4361         assert(vlen == 8, "");
 4362         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4363       }
 4364     }
 4365   %}
 4366   ins_pipe( pipe_slow );
 4367 %}
 4368 
 4369 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4370   match(Set dst (Replicate con));
 4371   effect(TEMP rtmp);
 4372   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4373   ins_encode %{
 4374     int vlen_enc = vector_length_encoding(this);
 4375     BasicType bt = Matcher::vector_element_basic_type(this);
 4376     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4377     __ movl($rtmp$$Register, $con$$constant);
 4378     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4379   %}
 4380   ins_pipe( pipe_slow );
 4381 %}
 4382 
 4383 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4384   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4385   match(Set dst (Replicate src));
 4386   effect(TEMP rtmp);
 4387   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4388   ins_encode %{
 4389     int vlen_enc = vector_length_encoding(this);
 4390     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4391     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4392   %}
 4393   ins_pipe( pipe_slow );
 4394 %}
 4395 
 4396 instruct ReplS_mem(vec dst, memory mem) %{
 4397   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4398   match(Set dst (Replicate (LoadS mem)));
 4399   format %{ "replicateS $dst,$mem" %}
 4400   ins_encode %{
 4401     int vlen_enc = vector_length_encoding(this);
 4402     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4403   %}
 4404   ins_pipe( pipe_slow );
 4405 %}
 4406 
 4407 // ====================ReplicateI=======================================
 4408 
 4409 instruct ReplI_reg(vec dst, rRegI src) %{
 4410   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4411   match(Set dst (Replicate src));
 4412   format %{ "replicateI $dst,$src" %}
 4413   ins_encode %{
 4414     uint vlen = Matcher::vector_length(this);
 4415     int vlen_enc = vector_length_encoding(this);
 4416     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4417       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4418     } else if (VM_Version::supports_avx2()) {
 4419       __ movdl($dst$$XMMRegister, $src$$Register);
 4420       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4421     } else {
 4422       __ movdl($dst$$XMMRegister, $src$$Register);
 4423       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4424     }
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplI_mem(vec dst, memory mem) %{
 4430   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4431   match(Set dst (Replicate (LoadI mem)));
 4432   format %{ "replicateI $dst,$mem" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     if (VM_Version::supports_avx2()) {
 4436       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4437     } else if (VM_Version::supports_avx()) {
 4438       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4439     } else {
 4440       __ movdl($dst$$XMMRegister, $mem$$Address);
 4441       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4442     }
 4443   %}
 4444   ins_pipe( pipe_slow );
 4445 %}
 4446 
 4447 instruct ReplI_imm(vec dst, immI con) %{
 4448   predicate(Matcher::is_non_long_integral_vector(n));
 4449   match(Set dst (Replicate con));
 4450   format %{ "replicateI $dst,$con" %}
 4451   ins_encode %{
 4452     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4453                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4454                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4455     BasicType bt = Matcher::vector_element_basic_type(this);
 4456     int vlen = Matcher::vector_length_in_bytes(this);
 4457     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 // Replicate scalar zero to be vector
 4463 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4464   predicate(Matcher::is_non_long_integral_vector(n));
 4465   match(Set dst (Replicate zero));
 4466   format %{ "replicateI $dst,$zero" %}
 4467   ins_encode %{
 4468     int vlen_enc = vector_length_encoding(this);
 4469     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4470       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4471     } else {
 4472       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4473     }
 4474   %}
 4475   ins_pipe( fpu_reg_reg );
 4476 %}
 4477 
 4478 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4479   predicate(Matcher::is_non_long_integral_vector(n));
 4480   match(Set dst (Replicate con));
 4481   format %{ "vallones $dst" %}
 4482   ins_encode %{
 4483     int vector_len = vector_length_encoding(this);
 4484     __ vallones($dst$$XMMRegister, vector_len);
 4485   %}
 4486   ins_pipe( pipe_slow );
 4487 %}
 4488 
 4489 // ====================ReplicateL=======================================
 4490 
 4491 // Replicate long (8 byte) scalar to be vector
 4492 instruct ReplL_reg(vec dst, rRegL src) %{
 4493   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4494   match(Set dst (Replicate src));
 4495   format %{ "replicateL $dst,$src" %}
 4496   ins_encode %{
 4497     int vlen = Matcher::vector_length(this);
 4498     int vlen_enc = vector_length_encoding(this);
 4499     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4500       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4501     } else if (VM_Version::supports_avx2()) {
 4502       __ movdq($dst$$XMMRegister, $src$$Register);
 4503       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4504     } else {
 4505       __ movdq($dst$$XMMRegister, $src$$Register);
 4506       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4507     }
 4508   %}
 4509   ins_pipe( pipe_slow );
 4510 %}
 4511 
 4512 instruct ReplL_mem(vec dst, memory mem) %{
 4513   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4514   match(Set dst (Replicate (LoadL mem)));
 4515   format %{ "replicateL $dst,$mem" %}
 4516   ins_encode %{
 4517     int vlen_enc = vector_length_encoding(this);
 4518     if (VM_Version::supports_avx2()) {
 4519       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4520     } else if (VM_Version::supports_sse3()) {
 4521       __ movddup($dst$$XMMRegister, $mem$$Address);
 4522     } else {
 4523       __ movq($dst$$XMMRegister, $mem$$Address);
 4524       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4525     }
 4526   %}
 4527   ins_pipe( pipe_slow );
 4528 %}
 4529 
 4530 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4531 instruct ReplL_imm(vec dst, immL con) %{
 4532   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4533   match(Set dst (Replicate con));
 4534   format %{ "replicateL $dst,$con" %}
 4535   ins_encode %{
 4536     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4537     int vlen = Matcher::vector_length_in_bytes(this);
 4538     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4539   %}
 4540   ins_pipe( pipe_slow );
 4541 %}
 4542 
 4543 instruct ReplL_zero(vec dst, immL0 zero) %{
 4544   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4545   match(Set dst (Replicate zero));
 4546   format %{ "replicateL $dst,$zero" %}
 4547   ins_encode %{
 4548     int vlen_enc = vector_length_encoding(this);
 4549     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4550       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4551     } else {
 4552       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4553     }
 4554   %}
 4555   ins_pipe( fpu_reg_reg );
 4556 %}
 4557 
 4558 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4559   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4560   match(Set dst (Replicate con));
 4561   format %{ "vallones $dst" %}
 4562   ins_encode %{
 4563     int vector_len = vector_length_encoding(this);
 4564     __ vallones($dst$$XMMRegister, vector_len);
 4565   %}
 4566   ins_pipe( pipe_slow );
 4567 %}
 4568 
 4569 // ====================ReplicateF=======================================
 4570 
 4571 instruct vReplF_reg(vec dst, vlRegF src) %{
 4572   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4573   match(Set dst (Replicate src));
 4574   format %{ "replicateF $dst,$src" %}
 4575   ins_encode %{
 4576     uint vlen = Matcher::vector_length(this);
 4577     int vlen_enc = vector_length_encoding(this);
 4578     if (vlen <= 4) {
 4579       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4580     } else if (VM_Version::supports_avx2()) {
 4581       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4582     } else {
 4583       assert(vlen == 8, "sanity");
 4584       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4585       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4586     }
 4587   %}
 4588   ins_pipe( pipe_slow );
 4589 %}
 4590 
 4591 instruct ReplF_reg(vec dst, vlRegF src) %{
 4592   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4593   match(Set dst (Replicate src));
 4594   format %{ "replicateF $dst,$src" %}
 4595   ins_encode %{
 4596     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4597   %}
 4598   ins_pipe( pipe_slow );
 4599 %}
 4600 
 4601 instruct ReplF_mem(vec dst, memory mem) %{
 4602   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4603   match(Set dst (Replicate (LoadF mem)));
 4604   format %{ "replicateF $dst,$mem" %}
 4605   ins_encode %{
 4606     int vlen_enc = vector_length_encoding(this);
 4607     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4608   %}
 4609   ins_pipe( pipe_slow );
 4610 %}
 4611 
 4612 // Replicate float scalar immediate to be vector by loading from const table.
 4613 instruct ReplF_imm(vec dst, immF con) %{
 4614   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4615   match(Set dst (Replicate con));
 4616   format %{ "replicateF $dst,$con" %}
 4617   ins_encode %{
 4618     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4619                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4620     int vlen = Matcher::vector_length_in_bytes(this);
 4621     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4622   %}
 4623   ins_pipe( pipe_slow );
 4624 %}
 4625 
 4626 instruct ReplF_zero(vec dst, immF0 zero) %{
 4627   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4628   match(Set dst (Replicate zero));
 4629   format %{ "replicateF $dst,$zero" %}
 4630   ins_encode %{
 4631     int vlen_enc = vector_length_encoding(this);
 4632     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4633       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4634     } else {
 4635       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4636     }
 4637   %}
 4638   ins_pipe( fpu_reg_reg );
 4639 %}
 4640 
 4641 // ====================ReplicateD=======================================
 4642 
 4643 // Replicate double (8 bytes) scalar to be vector
 4644 instruct vReplD_reg(vec dst, vlRegD src) %{
 4645   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4646   match(Set dst (Replicate src));
 4647   format %{ "replicateD $dst,$src" %}
 4648   ins_encode %{
 4649     uint vlen = Matcher::vector_length(this);
 4650     int vlen_enc = vector_length_encoding(this);
 4651     if (vlen <= 2) {
 4652       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4653     } else if (VM_Version::supports_avx2()) {
 4654       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4655     } else {
 4656       assert(vlen == 4, "sanity");
 4657       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4658       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4659     }
 4660   %}
 4661   ins_pipe( pipe_slow );
 4662 %}
 4663 
 4664 instruct ReplD_reg(vec dst, vlRegD src) %{
 4665   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4666   match(Set dst (Replicate src));
 4667   format %{ "replicateD $dst,$src" %}
 4668   ins_encode %{
 4669     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4670   %}
 4671   ins_pipe( pipe_slow );
 4672 %}
 4673 
 4674 instruct ReplD_mem(vec dst, memory mem) %{
 4675   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4676   match(Set dst (Replicate (LoadD mem)));
 4677   format %{ "replicateD $dst,$mem" %}
 4678   ins_encode %{
 4679     if (Matcher::vector_length(this) >= 4) {
 4680       int vlen_enc = vector_length_encoding(this);
 4681       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4682     } else {
 4683       __ movddup($dst$$XMMRegister, $mem$$Address);
 4684     }
 4685   %}
 4686   ins_pipe( pipe_slow );
 4687 %}
 4688 
 4689 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4690 instruct ReplD_imm(vec dst, immD con) %{
 4691   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4692   match(Set dst (Replicate con));
 4693   format %{ "replicateD $dst,$con" %}
 4694   ins_encode %{
 4695     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4696     int vlen = Matcher::vector_length_in_bytes(this);
 4697     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4698   %}
 4699   ins_pipe( pipe_slow );
 4700 %}
 4701 
 4702 instruct ReplD_zero(vec dst, immD0 zero) %{
 4703   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4704   match(Set dst (Replicate zero));
 4705   format %{ "replicateD $dst,$zero" %}
 4706   ins_encode %{
 4707     int vlen_enc = vector_length_encoding(this);
 4708     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4709       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4710     } else {
 4711       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4712     }
 4713   %}
 4714   ins_pipe( fpu_reg_reg );
 4715 %}
 4716 
 4717 // ====================VECTOR INSERT=======================================
 4718 
 4719 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4720   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4721   match(Set dst (VectorInsert (Binary dst val) idx));
 4722   format %{ "vector_insert $dst,$val,$idx" %}
 4723   ins_encode %{
 4724     assert(UseSSE >= 4, "required");
 4725     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4726 
 4727     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4728 
 4729     assert(is_integral_type(elem_bt), "");
 4730     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4731 
 4732     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4733   %}
 4734   ins_pipe( pipe_slow );
 4735 %}
 4736 
 4737 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4738   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4739   match(Set dst (VectorInsert (Binary src val) idx));
 4740   effect(TEMP vtmp);
 4741   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4742   ins_encode %{
 4743     int vlen_enc = Assembler::AVX_256bit;
 4744     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4745     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4746     int log2epr = log2(elem_per_lane);
 4747 
 4748     assert(is_integral_type(elem_bt), "sanity");
 4749     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4750 
 4751     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4752     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4753     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4754     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4755     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4756   %}
 4757   ins_pipe( pipe_slow );
 4758 %}
 4759 
 4760 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4761   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4762   match(Set dst (VectorInsert (Binary src val) idx));
 4763   effect(TEMP vtmp);
 4764   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4765   ins_encode %{
 4766     assert(UseAVX > 2, "sanity");
 4767 
 4768     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4769     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4770     int log2epr = log2(elem_per_lane);
 4771 
 4772     assert(is_integral_type(elem_bt), "");
 4773     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4774 
 4775     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4776     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4777     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4778     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4779     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4780   %}
 4781   ins_pipe( pipe_slow );
 4782 %}
 4783 
 4784 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4785   predicate(Matcher::vector_length(n) == 2);
 4786   match(Set dst (VectorInsert (Binary dst val) idx));
 4787   format %{ "vector_insert $dst,$val,$idx" %}
 4788   ins_encode %{
 4789     assert(UseSSE >= 4, "required");
 4790     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4791     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4792 
 4793     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4794   %}
 4795   ins_pipe( pipe_slow );
 4796 %}
 4797 
 4798 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4799   predicate(Matcher::vector_length(n) == 4);
 4800   match(Set dst (VectorInsert (Binary src val) idx));
 4801   effect(TEMP vtmp);
 4802   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4803   ins_encode %{
 4804     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4805     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4806 
 4807     uint x_idx = $idx$$constant & right_n_bits(1);
 4808     uint y_idx = ($idx$$constant >> 1) & 1;
 4809     int vlen_enc = Assembler::AVX_256bit;
 4810     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4811     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4812     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4813   %}
 4814   ins_pipe( pipe_slow );
 4815 %}
 4816 
 4817 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4818   predicate(Matcher::vector_length(n) == 8);
 4819   match(Set dst (VectorInsert (Binary src val) idx));
 4820   effect(TEMP vtmp);
 4821   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4822   ins_encode %{
 4823     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4824     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4825 
 4826     uint x_idx = $idx$$constant & right_n_bits(1);
 4827     uint y_idx = ($idx$$constant >> 1) & 3;
 4828     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4829     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4830     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4831   %}
 4832   ins_pipe( pipe_slow );
 4833 %}
 4834 
 4835 instruct insertF(vec dst, regF val, immU8 idx) %{
 4836   predicate(Matcher::vector_length(n) < 8);
 4837   match(Set dst (VectorInsert (Binary dst val) idx));
 4838   format %{ "vector_insert $dst,$val,$idx" %}
 4839   ins_encode %{
 4840     assert(UseSSE >= 4, "sanity");
 4841 
 4842     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4843     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4844 
 4845     uint x_idx = $idx$$constant & right_n_bits(2);
 4846     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4847   %}
 4848   ins_pipe( pipe_slow );
 4849 %}
 4850 
 4851 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4852   predicate(Matcher::vector_length(n) >= 8);
 4853   match(Set dst (VectorInsert (Binary src val) idx));
 4854   effect(TEMP vtmp);
 4855   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4856   ins_encode %{
 4857     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4858     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4859 
 4860     int vlen = Matcher::vector_length(this);
 4861     uint x_idx = $idx$$constant & right_n_bits(2);
 4862     if (vlen == 8) {
 4863       uint y_idx = ($idx$$constant >> 2) & 1;
 4864       int vlen_enc = Assembler::AVX_256bit;
 4865       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4866       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4867       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4868     } else {
 4869       assert(vlen == 16, "sanity");
 4870       uint y_idx = ($idx$$constant >> 2) & 3;
 4871       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4872       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4873       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4874     }
 4875   %}
 4876   ins_pipe( pipe_slow );
 4877 %}
 4878 
 4879 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4880   predicate(Matcher::vector_length(n) == 2);
 4881   match(Set dst (VectorInsert (Binary dst val) idx));
 4882   effect(TEMP tmp);
 4883   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4884   ins_encode %{
 4885     assert(UseSSE >= 4, "sanity");
 4886     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4887     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4888 
 4889     __ movq($tmp$$Register, $val$$XMMRegister);
 4890     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4891   %}
 4892   ins_pipe( pipe_slow );
 4893 %}
 4894 
 4895 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4896   predicate(Matcher::vector_length(n) == 4);
 4897   match(Set dst (VectorInsert (Binary src val) idx));
 4898   effect(TEMP vtmp, TEMP tmp);
 4899   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4900   ins_encode %{
 4901     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4902     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4903 
 4904     uint x_idx = $idx$$constant & right_n_bits(1);
 4905     uint y_idx = ($idx$$constant >> 1) & 1;
 4906     int vlen_enc = Assembler::AVX_256bit;
 4907     __ movq($tmp$$Register, $val$$XMMRegister);
 4908     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4909     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4910     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4911   %}
 4912   ins_pipe( pipe_slow );
 4913 %}
 4914 
 4915 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4916   predicate(Matcher::vector_length(n) == 8);
 4917   match(Set dst (VectorInsert (Binary src val) idx));
 4918   effect(TEMP tmp, TEMP vtmp);
 4919   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4920   ins_encode %{
 4921     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4922     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4923 
 4924     uint x_idx = $idx$$constant & right_n_bits(1);
 4925     uint y_idx = ($idx$$constant >> 1) & 3;
 4926     __ movq($tmp$$Register, $val$$XMMRegister);
 4927     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4928     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4929     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4930   %}
 4931   ins_pipe( pipe_slow );
 4932 %}
 4933 
 4934 // ====================REDUCTION ARITHMETIC=======================================
 4935 
 4936 // =======================Int Reduction==========================================
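// Each rule below reduces all lanes of src2 to a single scalar and combines it
// with the scalar src1, e.g. AddReductionVI yields src1 + src2[0] + ... + src2[n-1].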
 4937 
 4938 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4939   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4940   match(Set dst (AddReductionVI src1 src2));
 4941   match(Set dst (MulReductionVI src1 src2));
 4942   match(Set dst (AndReductionV  src1 src2));
 4943   match(Set dst ( OrReductionV  src1 src2));
 4944   match(Set dst (XorReductionV  src1 src2));
 4945   match(Set dst (MinReductionV  src1 src2));
 4946   match(Set dst (MaxReductionV  src1 src2));
 4947   effect(TEMP vtmp1, TEMP vtmp2);
 4948   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4949   ins_encode %{
 4950     int opcode = this->ideal_Opcode();
 4951     int vlen = Matcher::vector_length(this, $src2);
 4952     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4953   %}
 4954   ins_pipe( pipe_slow );
 4955 %}
 4956 
 4957 // =======================Long Reduction==========================================
 4958 
 4959 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4960   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4961   match(Set dst (AddReductionVL src1 src2));
 4962   match(Set dst (MulReductionVL src1 src2));
 4963   match(Set dst (AndReductionV  src1 src2));
 4964   match(Set dst ( OrReductionV  src1 src2));
 4965   match(Set dst (XorReductionV  src1 src2));
 4966   match(Set dst (MinReductionV  src1 src2));
 4967   match(Set dst (MaxReductionV  src1 src2));
 4968   effect(TEMP vtmp1, TEMP vtmp2);
 4969   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4970   ins_encode %{
 4971     int opcode = this->ideal_Opcode();
 4972     int vlen = Matcher::vector_length(this, $src2);
 4973     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4974   %}
 4975   ins_pipe( pipe_slow );
 4976 %}
 4977 
 4978 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4979   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4980   match(Set dst (AddReductionVL src1 src2));
 4981   match(Set dst (MulReductionVL src1 src2));
 4982   match(Set dst (AndReductionV  src1 src2));
 4983   match(Set dst ( OrReductionV  src1 src2));
 4984   match(Set dst (XorReductionV  src1 src2));
 4985   match(Set dst (MinReductionV  src1 src2));
 4986   match(Set dst (MaxReductionV  src1 src2));
 4987   effect(TEMP vtmp1, TEMP vtmp2);
 4988   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4989   ins_encode %{
 4990     int opcode = this->ideal_Opcode();
 4991     int vlen = Matcher::vector_length(this, $src2);
 4992     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4993   %}
 4994   ins_pipe( pipe_slow );
 4995 %}
 4996 
 4997 // =======================Float Reduction==========================================
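// Strictly ordered float add/mul reductions: dst is the running scalar
// accumulator and the lanes of src are folded into it in lane order, as
// required when the Reduction node demands strict ordering.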
 4998 
 4999 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5000   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5001   match(Set dst (AddReductionVF dst src));
 5002   match(Set dst (MulReductionVF dst src));
 5003   effect(TEMP dst, TEMP vtmp);
 5004   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5005   ins_encode %{
 5006     int opcode = this->ideal_Opcode();
 5007     int vlen = Matcher::vector_length(this, $src);
 5008     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5009   %}
 5010   ins_pipe( pipe_slow );
 5011 %}
 5012 
 5013 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5014   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5015   match(Set dst (AddReductionVF dst src));
 5016   match(Set dst (MulReductionVF dst src));
 5017   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5018   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5019   ins_encode %{
 5020     int opcode = this->ideal_Opcode();
 5021     int vlen = Matcher::vector_length(this, $src);
 5022     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5023   %}
 5024   ins_pipe( pipe_slow );
 5025 %}
 5026 
 5027 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5028   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5029   match(Set dst (AddReductionVF dst src));
 5030   match(Set dst (MulReductionVF dst src));
 5031   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5032   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5033   ins_encode %{
 5034     int opcode = this->ideal_Opcode();
 5035     int vlen = Matcher::vector_length(this, $src);
 5036     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5037   %}
 5038   ins_pipe( pipe_slow );
 5039 %}
 5040 
 5041 
 5042 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5043   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5044   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5045   // src1 contains reduction identity
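  // Since src1 is the identity, it cannot affect the result and is not passed
  // down to unordered_reduce_fp().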
 5046   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5047   match(Set dst (AddReductionVF src1 src2));
 5048   match(Set dst (MulReductionVF src1 src2));
 5049   effect(TEMP dst);
 5050   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5051   ins_encode %{
 5052     int opcode = this->ideal_Opcode();
 5053     int vlen = Matcher::vector_length(this, $src2);
 5054     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5055   %}
 5056   ins_pipe( pipe_slow );
 5057 %}
 5058 
 5059 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5060   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5061   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5062   // src1 contains reduction identity
 5063   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5064   match(Set dst (AddReductionVF src1 src2));
 5065   match(Set dst (MulReductionVF src1 src2));
 5066   effect(TEMP dst, TEMP vtmp);
 5067   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5068   ins_encode %{
 5069     int opcode = this->ideal_Opcode();
 5070     int vlen = Matcher::vector_length(this, $src2);
 5071     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5072   %}
 5073   ins_pipe( pipe_slow );
 5074 %}
 5075 
 5076 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5077   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5078   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5079   // src1 contains reduction identity
 5080   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5081   match(Set dst (AddReductionVF src1 src2));
 5082   match(Set dst (MulReductionVF src1 src2));
 5083   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5084   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5085   ins_encode %{
 5086     int opcode = this->ideal_Opcode();
 5087     int vlen = Matcher::vector_length(this, $src2);
 5088     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5089   %}
 5090   ins_pipe( pipe_slow );
 5091 %}
 5092 
 5093 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5095   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5096   // src1 contains reduction identity
 5097   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5098   match(Set dst (AddReductionVF src1 src2));
 5099   match(Set dst (MulReductionVF src1 src2));
 5100   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5101   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5102   ins_encode %{
 5103     int opcode = this->ideal_Opcode();
 5104     int vlen = Matcher::vector_length(this, $src2);
 5105     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5106   %}
 5107   ins_pipe( pipe_slow );
 5108 %}
 5109 
 5110 // =======================Double Reduction==========================================
 5111 
 5112 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5113   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5114   match(Set dst (AddReductionVD dst src));
 5115   match(Set dst (MulReductionVD dst src));
 5116   effect(TEMP dst, TEMP vtmp);
 5117   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5118   ins_encode %{
 5119     int opcode = this->ideal_Opcode();
 5120     int vlen = Matcher::vector_length(this, $src);
 5121     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
 5123   ins_pipe( pipe_slow );
 5124 %}
 5125 
 5126 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5127   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5128   match(Set dst (AddReductionVD dst src));
 5129   match(Set dst (MulReductionVD dst src));
 5130   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5131   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5132   ins_encode %{
 5133     int opcode = this->ideal_Opcode();
 5134     int vlen = Matcher::vector_length(this, $src);
 5135     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5136   %}
 5137   ins_pipe( pipe_slow );
 5138 %}
 5139 
 5140 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5141   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5142   match(Set dst (AddReductionVD dst src));
 5143   match(Set dst (MulReductionVD dst src));
 5144   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5145   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5146   ins_encode %{
 5147     int opcode = this->ideal_Opcode();
 5148     int vlen = Matcher::vector_length(this, $src);
 5149     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5150   %}
 5151   ins_pipe( pipe_slow );
 5152 %}
 5153 
 5154 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5155   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5156   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5157   // src1 contains reduction identity
 5158   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5159   match(Set dst (AddReductionVD src1 src2));
 5160   match(Set dst (MulReductionVD src1 src2));
 5161   effect(TEMP dst);
 5162   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5163   ins_encode %{
 5164     int opcode = this->ideal_Opcode();
 5165     int vlen = Matcher::vector_length(this, $src2);
 5166     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
  %}
 5168   ins_pipe( pipe_slow );
 5169 %}
 5170 
 5171 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5172   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5173   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5174   // src1 contains reduction identity
 5175   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5176   match(Set dst (AddReductionVD src1 src2));
 5177   match(Set dst (MulReductionVD src1 src2));
 5178   effect(TEMP dst, TEMP vtmp);
 5179   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5180   ins_encode %{
 5181     int opcode = this->ideal_Opcode();
 5182     int vlen = Matcher::vector_length(this, $src2);
 5183     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5184   %}
 5185   ins_pipe( pipe_slow );
 5186 %}
 5187 
 5188 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5189   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5190   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5191   // src1 contains reduction identity
 5192   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5193   match(Set dst (AddReductionVD src1 src2));
 5194   match(Set dst (MulReductionVD src1 src2));
 5195   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5196   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5197   ins_encode %{
 5198     int opcode = this->ideal_Opcode();
 5199     int vlen = Matcher::vector_length(this, $src2);
 5200     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5201   %}
 5202   ins_pipe( pipe_slow );
 5203 %}
 5204 
 5205 // =======================Byte Reduction==========================================
 5206 
 5207 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5208   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5209   match(Set dst (AddReductionVI src1 src2));
 5210   match(Set dst (AndReductionV  src1 src2));
 5211   match(Set dst ( OrReductionV  src1 src2));
 5212   match(Set dst (XorReductionV  src1 src2));
 5213   match(Set dst (MinReductionV  src1 src2));
 5214   match(Set dst (MaxReductionV  src1 src2));
 5215   effect(TEMP vtmp1, TEMP vtmp2);
 5216   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5217   ins_encode %{
 5218     int opcode = this->ideal_Opcode();
 5219     int vlen = Matcher::vector_length(this, $src2);
 5220     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5221   %}
 5222   ins_pipe( pipe_slow );
 5223 %}
 5224 
 5225 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5226   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5227   match(Set dst (AddReductionVI src1 src2));
 5228   match(Set dst (AndReductionV  src1 src2));
 5229   match(Set dst ( OrReductionV  src1 src2));
 5230   match(Set dst (XorReductionV  src1 src2));
 5231   match(Set dst (MinReductionV  src1 src2));
 5232   match(Set dst (MaxReductionV  src1 src2));
 5233   effect(TEMP vtmp1, TEMP vtmp2);
 5234   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5235   ins_encode %{
 5236     int opcode = this->ideal_Opcode();
 5237     int vlen = Matcher::vector_length(this, $src2);
 5238     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 // =======================Short Reduction==========================================
 5244 
 5245 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5246   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5247   match(Set dst (AddReductionVI src1 src2));
 5248   match(Set dst (MulReductionVI src1 src2));
 5249   match(Set dst (AndReductionV  src1 src2));
 5250   match(Set dst ( OrReductionV  src1 src2));
 5251   match(Set dst (XorReductionV  src1 src2));
 5252   match(Set dst (MinReductionV  src1 src2));
 5253   match(Set dst (MaxReductionV  src1 src2));
 5254   effect(TEMP vtmp1, TEMP vtmp2);
 5255   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5256   ins_encode %{
 5257     int opcode = this->ideal_Opcode();
 5258     int vlen = Matcher::vector_length(this, $src2);
 5259     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5260   %}
 5261   ins_pipe( pipe_slow );
 5262 %}
 5263 
 5264 // =======================Mul Reduction==========================================
 5265 
 5266 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5267   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5268             Matcher::vector_length(n->in(2)) <= 32); // src2
 5269   match(Set dst (MulReductionVI src1 src2));
 5270   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5271   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5272   ins_encode %{
 5273     int opcode = this->ideal_Opcode();
 5274     int vlen = Matcher::vector_length(this, $src2);
 5275     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5276   %}
 5277   ins_pipe( pipe_slow );
 5278 %}
 5279 
 5280 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5281   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5282             Matcher::vector_length(n->in(2)) == 64); // src2
 5283   match(Set dst (MulReductionVI src1 src2));
 5284   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5285   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5286   ins_encode %{
 5287     int opcode = this->ideal_Opcode();
 5288     int vlen = Matcher::vector_length(this, $src2);
 5289     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5290   %}
 5291   ins_pipe( pipe_slow );
 5292 %}
 5293 
//--------------------Min/Max Float Reduction --------------------
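// For the rules taking an immF src1, the predicate requires src1 to be the
// identity of the operation (+Inf for min, -Inf for max), so only src2 has to
// be reduced. The *_av variants below accumulate into dst instead.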
 5296 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5297                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5298   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5299             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5300              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5301             Matcher::vector_length(n->in(2)) == 2);
 5302   match(Set dst (MinReductionV src1 src2));
 5303   match(Set dst (MaxReductionV src1 src2));
 5304   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5305   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5306   ins_encode %{
 5307     assert(UseAVX > 0, "sanity");
 5308 
 5309     int opcode = this->ideal_Opcode();
 5310     int vlen = Matcher::vector_length(this, $src2);
 5311     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5312                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5313   %}
 5314   ins_pipe( pipe_slow );
 5315 %}
 5316 
 5317 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5318                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5319   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5320             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5321              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5322             Matcher::vector_length(n->in(2)) >= 4);
 5323   match(Set dst (MinReductionV src1 src2));
 5324   match(Set dst (MaxReductionV src1 src2));
 5325   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5326   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5327   ins_encode %{
 5328     assert(UseAVX > 0, "sanity");
 5329 
 5330     int opcode = this->ideal_Opcode();
 5331     int vlen = Matcher::vector_length(this, $src2);
 5332     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5333                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5334   %}
 5335   ins_pipe( pipe_slow );
 5336 %}
 5337 
 5338 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5339                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5340   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5341             Matcher::vector_length(n->in(2)) == 2);
 5342   match(Set dst (MinReductionV dst src));
 5343   match(Set dst (MaxReductionV dst src));
 5344   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5345   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5346   ins_encode %{
 5347     assert(UseAVX > 0, "sanity");
 5348 
 5349     int opcode = this->ideal_Opcode();
 5350     int vlen = Matcher::vector_length(this, $src);
 5351     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5352                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5353   %}
 5354   ins_pipe( pipe_slow );
 5355 %}
 5356 
 5357 
 5358 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5359                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5360   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5361             Matcher::vector_length(n->in(2)) >= 4);
 5362   match(Set dst (MinReductionV dst src));
 5363   match(Set dst (MaxReductionV dst src));
 5364   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5365   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5366   ins_encode %{
 5367     assert(UseAVX > 0, "sanity");
 5368 
 5369     int opcode = this->ideal_Opcode();
 5370     int vlen = Matcher::vector_length(this, $src);
 5371     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5372                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5373   %}
 5374   ins_pipe( pipe_slow );
 5375 %}
 5376 
 5377 
//--------------------Min/Max Double Reduction --------------------
 5379 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5380                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5381                             rFlagsReg cr) %{
 5382   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5383             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5384              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5385             Matcher::vector_length(n->in(2)) == 2);
 5386   match(Set dst (MinReductionV src1 src2));
 5387   match(Set dst (MaxReductionV src1 src2));
 5388   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5389   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5390   ins_encode %{
 5391     assert(UseAVX > 0, "sanity");
 5392 
 5393     int opcode = this->ideal_Opcode();
 5394     int vlen = Matcher::vector_length(this, $src2);
 5395     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5396                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5397   %}
 5398   ins_pipe( pipe_slow );
 5399 %}
 5400 
 5401 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5402                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5403                            rFlagsReg cr) %{
 5404   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5405             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5406              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5407             Matcher::vector_length(n->in(2)) >= 4);
 5408   match(Set dst (MinReductionV src1 src2));
 5409   match(Set dst (MaxReductionV src1 src2));
 5410   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5411   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5412   ins_encode %{
 5413     assert(UseAVX > 0, "sanity");
 5414 
 5415     int opcode = this->ideal_Opcode();
 5416     int vlen = Matcher::vector_length(this, $src2);
 5417     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5418                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5419   %}
 5420   ins_pipe( pipe_slow );
 5421 %}
 5422 
 5423 
 5424 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5425                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5426                                rFlagsReg cr) %{
 5427   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5428             Matcher::vector_length(n->in(2)) == 2);
 5429   match(Set dst (MinReductionV dst src));
 5430   match(Set dst (MaxReductionV dst src));
 5431   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5432   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5433   ins_encode %{
 5434     assert(UseAVX > 0, "sanity");
 5435 
 5436     int opcode = this->ideal_Opcode();
 5437     int vlen = Matcher::vector_length(this, $src);
 5438     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5439                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5440   %}
 5441   ins_pipe( pipe_slow );
 5442 %}
 5443 
 5444 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5445                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5446                               rFlagsReg cr) %{
 5447   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5448             Matcher::vector_length(n->in(2)) >= 4);
 5449   match(Set dst (MinReductionV dst src));
 5450   match(Set dst (MaxReductionV dst src));
 5451   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5452   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5453   ins_encode %{
 5454     assert(UseAVX > 0, "sanity");
 5455 
 5456     int opcode = this->ideal_Opcode();
 5457     int vlen = Matcher::vector_length(this, $src);
 5458     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5459                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5460   %}
 5461   ins_pipe( pipe_slow );
 5462 %}
 5463 
 5464 // ====================VECTOR ARITHMETIC=======================================
 5465 
 5466 // --------------------------------- ADD --------------------------------------
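// Each element type follows the same pattern: an SSE rule (UseAVX == 0) adding
// in place into dst, an AVX three-operand register rule, and an AVX rule that
// folds the second operand's vector load directly from memory.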
 5467 
 5468 // Bytes vector add
 5469 instruct vaddB(vec dst, vec src) %{
 5470   predicate(UseAVX == 0);
 5471   match(Set dst (AddVB dst src));
 5472   format %{ "paddb   $dst,$src\t! add packedB" %}
 5473   ins_encode %{
 5474     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5475   %}
 5476   ins_pipe( pipe_slow );
 5477 %}
 5478 
 5479 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5480   predicate(UseAVX > 0);
 5481   match(Set dst (AddVB src1 src2));
 5482   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5483   ins_encode %{
 5484     int vlen_enc = vector_length_encoding(this);
 5485     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5486   %}
 5487   ins_pipe( pipe_slow );
 5488 %}
 5489 
 5490 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5491   predicate((UseAVX > 0) &&
 5492             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5493   match(Set dst (AddVB src (LoadVector mem)));
 5494   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5495   ins_encode %{
 5496     int vlen_enc = vector_length_encoding(this);
 5497     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5498   %}
 5499   ins_pipe( pipe_slow );
 5500 %}
 5501 
 5502 // Shorts/Chars vector add
 5503 instruct vaddS(vec dst, vec src) %{
 5504   predicate(UseAVX == 0);
 5505   match(Set dst (AddVS dst src));
 5506   format %{ "paddw   $dst,$src\t! add packedS" %}
 5507   ins_encode %{
 5508     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5509   %}
 5510   ins_pipe( pipe_slow );
 5511 %}
 5512 
 5513 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5514   predicate(UseAVX > 0);
 5515   match(Set dst (AddVS src1 src2));
 5516   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5517   ins_encode %{
 5518     int vlen_enc = vector_length_encoding(this);
 5519     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5520   %}
 5521   ins_pipe( pipe_slow );
 5522 %}
 5523 
 5524 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5525   predicate((UseAVX > 0) &&
 5526             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5527   match(Set dst (AddVS src (LoadVector mem)));
 5528   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5529   ins_encode %{
 5530     int vlen_enc = vector_length_encoding(this);
 5531     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5532   %}
 5533   ins_pipe( pipe_slow );
 5534 %}
 5535 
 5536 // Integers vector add
 5537 instruct vaddI(vec dst, vec src) %{
 5538   predicate(UseAVX == 0);
 5539   match(Set dst (AddVI dst src));
 5540   format %{ "paddd   $dst,$src\t! add packedI" %}
 5541   ins_encode %{
 5542     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5543   %}
 5544   ins_pipe( pipe_slow );
 5545 %}
 5546 
 5547 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5548   predicate(UseAVX > 0);
 5549   match(Set dst (AddVI src1 src2));
 5550   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5551   ins_encode %{
 5552     int vlen_enc = vector_length_encoding(this);
 5553     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5554   %}
 5555   ins_pipe( pipe_slow );
 5556 %}
 5557 
 5558 
 5559 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5560   predicate((UseAVX > 0) &&
 5561             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5562   match(Set dst (AddVI src (LoadVector mem)));
 5563   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5564   ins_encode %{
 5565     int vlen_enc = vector_length_encoding(this);
 5566     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5567   %}
 5568   ins_pipe( pipe_slow );
 5569 %}
 5570 
 5571 // Longs vector add
 5572 instruct vaddL(vec dst, vec src) %{
 5573   predicate(UseAVX == 0);
 5574   match(Set dst (AddVL dst src));
 5575   format %{ "paddq   $dst,$src\t! add packedL" %}
 5576   ins_encode %{
 5577     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5578   %}
 5579   ins_pipe( pipe_slow );
 5580 %}
 5581 
 5582 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5583   predicate(UseAVX > 0);
 5584   match(Set dst (AddVL src1 src2));
 5585   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5586   ins_encode %{
 5587     int vlen_enc = vector_length_encoding(this);
 5588     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5589   %}
 5590   ins_pipe( pipe_slow );
 5591 %}
 5592 
 5593 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5594   predicate((UseAVX > 0) &&
 5595             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5596   match(Set dst (AddVL src (LoadVector mem)));
 5597   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5598   ins_encode %{
 5599     int vlen_enc = vector_length_encoding(this);
 5600     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5601   %}
 5602   ins_pipe( pipe_slow );
 5603 %}
 5604 
 5605 // Floats vector add
 5606 instruct vaddF(vec dst, vec src) %{
 5607   predicate(UseAVX == 0);
 5608   match(Set dst (AddVF dst src));
 5609   format %{ "addps   $dst,$src\t! add packedF" %}
 5610   ins_encode %{
 5611     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5612   %}
 5613   ins_pipe( pipe_slow );
 5614 %}
 5615 
 5616 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5617   predicate(UseAVX > 0);
 5618   match(Set dst (AddVF src1 src2));
 5619   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5620   ins_encode %{
 5621     int vlen_enc = vector_length_encoding(this);
 5622     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5623   %}
 5624   ins_pipe( pipe_slow );
 5625 %}
 5626 
 5627 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5628   predicate((UseAVX > 0) &&
 5629             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5630   match(Set dst (AddVF src (LoadVector mem)));
 5631   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5632   ins_encode %{
 5633     int vlen_enc = vector_length_encoding(this);
 5634     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5635   %}
 5636   ins_pipe( pipe_slow );
 5637 %}
 5638 
 5639 // Doubles vector add
 5640 instruct vaddD(vec dst, vec src) %{
 5641   predicate(UseAVX == 0);
 5642   match(Set dst (AddVD dst src));
 5643   format %{ "addpd   $dst,$src\t! add packedD" %}
 5644   ins_encode %{
 5645     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5646   %}
 5647   ins_pipe( pipe_slow );
 5648 %}
 5649 
 5650 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5651   predicate(UseAVX > 0);
 5652   match(Set dst (AddVD src1 src2));
 5653   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5654   ins_encode %{
 5655     int vlen_enc = vector_length_encoding(this);
 5656     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5657   %}
 5658   ins_pipe( pipe_slow );
 5659 %}
 5660 
 5661 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5662   predicate((UseAVX > 0) &&
 5663             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5664   match(Set dst (AddVD src (LoadVector mem)));
 5665   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5666   ins_encode %{
 5667     int vlen_enc = vector_length_encoding(this);
 5668     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5669   %}
 5670   ins_pipe( pipe_slow );
 5671 %}
 5672 
 5673 // --------------------------------- SUB --------------------------------------
 5674 
 5675 // Bytes vector sub
 5676 instruct vsubB(vec dst, vec src) %{
 5677   predicate(UseAVX == 0);
 5678   match(Set dst (SubVB dst src));
 5679   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5680   ins_encode %{
 5681     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5682   %}
 5683   ins_pipe( pipe_slow );
 5684 %}
 5685 
 5686 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5687   predicate(UseAVX > 0);
 5688   match(Set dst (SubVB src1 src2));
 5689   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5690   ins_encode %{
 5691     int vlen_enc = vector_length_encoding(this);
 5692     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5693   %}
 5694   ins_pipe( pipe_slow );
 5695 %}
 5696 
 5697 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5698   predicate((UseAVX > 0) &&
 5699             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5700   match(Set dst (SubVB src (LoadVector mem)));
 5701   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5702   ins_encode %{
 5703     int vlen_enc = vector_length_encoding(this);
 5704     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5705   %}
 5706   ins_pipe( pipe_slow );
 5707 %}
 5708 
 5709 // Shorts/Chars vector sub
 5710 instruct vsubS(vec dst, vec src) %{
 5711   predicate(UseAVX == 0);
 5712   match(Set dst (SubVS dst src));
 5713   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5714   ins_encode %{
 5715     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5716   %}
 5717   ins_pipe( pipe_slow );
 5718 %}
 5719 
 5720 
 5721 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5722   predicate(UseAVX > 0);
 5723   match(Set dst (SubVS src1 src2));
 5724   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5725   ins_encode %{
 5726     int vlen_enc = vector_length_encoding(this);
 5727     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5728   %}
 5729   ins_pipe( pipe_slow );
 5730 %}
 5731 
 5732 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5733   predicate((UseAVX > 0) &&
 5734             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5735   match(Set dst (SubVS src (LoadVector mem)));
 5736   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5737   ins_encode %{
 5738     int vlen_enc = vector_length_encoding(this);
 5739     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5740   %}
 5741   ins_pipe( pipe_slow );
 5742 %}
 5743 
 5744 // Integers vector sub
 5745 instruct vsubI(vec dst, vec src) %{
 5746   predicate(UseAVX == 0);
 5747   match(Set dst (SubVI dst src));
 5748   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5749   ins_encode %{
 5750     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5751   %}
 5752   ins_pipe( pipe_slow );
 5753 %}
 5754 
 5755 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5756   predicate(UseAVX > 0);
 5757   match(Set dst (SubVI src1 src2));
 5758   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5759   ins_encode %{
 5760     int vlen_enc = vector_length_encoding(this);
 5761     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5762   %}
 5763   ins_pipe( pipe_slow );
 5764 %}
 5765 
 5766 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5767   predicate((UseAVX > 0) &&
 5768             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5769   match(Set dst (SubVI src (LoadVector mem)));
 5770   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5771   ins_encode %{
 5772     int vlen_enc = vector_length_encoding(this);
 5773     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5774   %}
 5775   ins_pipe( pipe_slow );
 5776 %}
 5777 
 5778 // Longs vector sub
 5779 instruct vsubL(vec dst, vec src) %{
 5780   predicate(UseAVX == 0);
 5781   match(Set dst (SubVL dst src));
 5782   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5783   ins_encode %{
 5784     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5785   %}
 5786   ins_pipe( pipe_slow );
 5787 %}
 5788 
 5789 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5790   predicate(UseAVX > 0);
 5791   match(Set dst (SubVL src1 src2));
 5792   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5793   ins_encode %{
 5794     int vlen_enc = vector_length_encoding(this);
 5795     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5796   %}
 5797   ins_pipe( pipe_slow );
 5798 %}
 5799 
 5800 
 5801 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5802   predicate((UseAVX > 0) &&
 5803             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5804   match(Set dst (SubVL src (LoadVector mem)));
 5805   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5806   ins_encode %{
 5807     int vlen_enc = vector_length_encoding(this);
 5808     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5809   %}
 5810   ins_pipe( pipe_slow );
 5811 %}
 5812 
 5813 // Floats vector sub
 5814 instruct vsubF(vec dst, vec src) %{
 5815   predicate(UseAVX == 0);
 5816   match(Set dst (SubVF dst src));
 5817   format %{ "subps   $dst,$src\t! sub packedF" %}
 5818   ins_encode %{
 5819     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5820   %}
 5821   ins_pipe( pipe_slow );
 5822 %}
 5823 
 5824 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5825   predicate(UseAVX > 0);
 5826   match(Set dst (SubVF src1 src2));
 5827   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5828   ins_encode %{
 5829     int vlen_enc = vector_length_encoding(this);
 5830     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5831   %}
 5832   ins_pipe( pipe_slow );
 5833 %}
 5834 
 5835 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5836   predicate((UseAVX > 0) &&
 5837             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5838   match(Set dst (SubVF src (LoadVector mem)));
 5839   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5840   ins_encode %{
 5841     int vlen_enc = vector_length_encoding(this);
 5842     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5843   %}
 5844   ins_pipe( pipe_slow );
 5845 %}
 5846 
 5847 // Doubles vector sub
 5848 instruct vsubD(vec dst, vec src) %{
 5849   predicate(UseAVX == 0);
 5850   match(Set dst (SubVD dst src));
 5851   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5852   ins_encode %{
 5853     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5854   %}
 5855   ins_pipe( pipe_slow );
 5856 %}
 5857 
 5858 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5859   predicate(UseAVX > 0);
 5860   match(Set dst (SubVD src1 src2));
 5861   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5862   ins_encode %{
 5863     int vlen_enc = vector_length_encoding(this);
 5864     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5865   %}
 5866   ins_pipe( pipe_slow );
 5867 %}
 5868 
 5869 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5870   predicate((UseAVX > 0) &&
 5871             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5872   match(Set dst (SubVD src (LoadVector mem)));
 5873   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5874   ins_encode %{
 5875     int vlen_enc = vector_length_encoding(this);
 5876     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5877   %}
 5878   ins_pipe( pipe_slow );
 5879 %}
 5880 
 5881 // --------------------------------- MUL --------------------------------------
 5882 
 5883 // Byte vector mul
 5884 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5885   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5886   match(Set dst (MulVB src1 src2));
 5887   effect(TEMP dst, TEMP xtmp);
 5888   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5889   ins_encode %{
 5890     assert(UseSSE > 3, "required");
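    // x86 has no byte-wise multiply, so both operands are widened to 16-bit
    // lanes, multiplied with pmullw, truncated back to the low byte and
    // repacked with packuswb.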
 5891     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5892     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5893     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5894     __ psllw($dst$$XMMRegister, 8);
 5895     __ psrlw($dst$$XMMRegister, 8);
 5896     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5897   %}
 5898   ins_pipe( pipe_slow );
 5899 %}
 5900 
 5901 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5902   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5903   match(Set dst (MulVB src1 src2));
 5904   effect(TEMP dst, TEMP xtmp);
 5905   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5906   ins_encode %{
 5907     assert(UseSSE > 3, "required");
 5908     // Odd-index elements
 5909     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5910     __ psrlw($dst$$XMMRegister, 8);
 5911     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5912     __ psrlw($xtmp$$XMMRegister, 8);
 5913     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5914     __ psllw($dst$$XMMRegister, 8);
 5915     // Even-index elements
 5916     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5917     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5918     __ psllw($xtmp$$XMMRegister, 8);
 5919     __ psrlw($xtmp$$XMMRegister, 8);
 5920     // Combine
 5921     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5922   %}
 5923   ins_pipe( pipe_slow );
 5924 %}
 5925 
 5926 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5927   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5928   match(Set dst (MulVB src1 src2));
 5929   effect(TEMP xtmp1, TEMP xtmp2);
 5930   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5931   ins_encode %{
 5932     int vlen_enc = vector_length_encoding(this);
 5933     // Odd-index elements
 5934     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5935     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5936     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5937     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5938     // Even-index elements
 5939     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5940     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5941     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5942     // Combine
 5943     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5944   %}
 5945   ins_pipe( pipe_slow );
 5946 %}
 5947 
 5948 // Shorts/Chars vector mul
 5949 instruct vmulS(vec dst, vec src) %{
 5950   predicate(UseAVX == 0);
 5951   match(Set dst (MulVS dst src));
 5952   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5953   ins_encode %{
 5954     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5955   %}
 5956   ins_pipe( pipe_slow );
 5957 %}
 5958 
 5959 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5960   predicate(UseAVX > 0);
 5961   match(Set dst (MulVS src1 src2));
 5962   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5963   ins_encode %{
 5964     int vlen_enc = vector_length_encoding(this);
 5965     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5966   %}
 5967   ins_pipe( pipe_slow );
 5968 %}
 5969 
 5970 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5971   predicate((UseAVX > 0) &&
 5972             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5973   match(Set dst (MulVS src (LoadVector mem)));
 5974   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5975   ins_encode %{
 5976     int vlen_enc = vector_length_encoding(this);
 5977     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5978   %}
 5979   ins_pipe( pipe_slow );
 5980 %}
 5981 
 5982 // Integers vector mul
 5983 instruct vmulI(vec dst, vec src) %{
 5984   predicate(UseAVX == 0);
 5985   match(Set dst (MulVI dst src));
 5986   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5987   ins_encode %{
 5988     assert(UseSSE > 3, "required");
 5989     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5990   %}
 5991   ins_pipe( pipe_slow );
 5992 %}
 5993 
 5994 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5995   predicate(UseAVX > 0);
 5996   match(Set dst (MulVI src1 src2));
 5997   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5998   ins_encode %{
 5999     int vlen_enc = vector_length_encoding(this);
 6000     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6001   %}
 6002   ins_pipe( pipe_slow );
 6003 %}
 6004 
 6005 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6006   predicate((UseAVX > 0) &&
 6007             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6008   match(Set dst (MulVI src (LoadVector mem)));
 6009   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6010   ins_encode %{
 6011     int vlen_enc = vector_length_encoding(this);
 6012     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6013   %}
 6014   ins_pipe( pipe_slow );
 6015 %}
 6016 
 6017 // Longs vector mul
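// Three strategies: a single evpmullq when AVX512DQ is available (plus VL for
// sub-512-bit vectors), a single vpmuludq/vpmuldq when both inputs are known
// to be zero-/sign-extended 32-bit values, and otherwise an emulation built
// from 32x32-bit partial products.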
 6018 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6019   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6020              VM_Version::supports_avx512dq()) ||
 6021             VM_Version::supports_avx512vldq());
 6022   match(Set dst (MulVL src1 src2));
 6023   ins_cost(500);
 6024   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6025   ins_encode %{
 6026     assert(UseAVX > 2, "required");
 6027     int vlen_enc = vector_length_encoding(this);
 6028     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6029   %}
 6030   ins_pipe( pipe_slow );
 6031 %}
 6032 
 6033 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6034   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6035              VM_Version::supports_avx512dq()) ||
 6036             (Matcher::vector_length_in_bytes(n) > 8 &&
 6037              VM_Version::supports_avx512vldq()));
 6038   match(Set dst (MulVL src (LoadVector mem)));
 6039   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6040   ins_cost(500);
 6041   ins_encode %{
 6042     assert(UseAVX > 2, "required");
 6043     int vlen_enc = vector_length_encoding(this);
 6044     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6045   %}
 6046   ins_pipe( pipe_slow );
 6047 %}
 6048 
 6049 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6050   predicate(UseAVX == 0);
 6051   match(Set dst (MulVL src1 src2));
 6052   ins_cost(500);
 6053   effect(TEMP dst, TEMP xtmp);
 6054   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6055   ins_encode %{
 6056     assert(VM_Version::supports_sse4_1(), "required");
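    // 64x64-bit multiply from 32-bit pieces:
    //   a*b mod 2^64 == lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)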
    // Get the lo*hi and hi*lo cross products; only their lower 32 bits are needed
 6058     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6059     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6060     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6061     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6062     __ psllq($dst$$XMMRegister, 32);
 6063     // Get the lo-lo products
 6064     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6065     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6066     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6067   %}
 6068   ins_pipe( pipe_slow );
 6069 %}
 6070 
 6071 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6072   predicate(UseAVX > 0 &&
 6073             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6074               !VM_Version::supports_avx512dq()) ||
 6075              (Matcher::vector_length_in_bytes(n) < 64 &&
 6076               !VM_Version::supports_avx512vldq())));
 6077   match(Set dst (MulVL src1 src2));
 6078   effect(TEMP xtmp1, TEMP xtmp2);
 6079   ins_cost(500);
 6080   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6081   ins_encode %{
 6082     int vlen_enc = vector_length_encoding(this);
    // Get the lo*hi and hi*lo cross products; only their lower 32 bits are needed
 6084     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6085     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6086     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6087     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6088     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6089     // Get the lo-lo products
 6090     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6091     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6092   %}
 6093   ins_pipe( pipe_slow );
 6094 %}
 6095 
 6096 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
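  // Both inputs have zero upper halves (has_uint_inputs()), so the unsigned
  // 32x32->64-bit multiply of the low dwords already gives the exact product.
  // vmuldq_reg below is the analogous signed (sign-extended inputs) case.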
 6097   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6098   match(Set dst (MulVL src1 src2));
 6099   ins_cost(100);
 6100   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6101   ins_encode %{
 6102     int vlen_enc = vector_length_encoding(this);
 6103     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6104   %}
 6105   ins_pipe( pipe_slow );
 6106 %}
 6107 
 6108 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6109   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6110   match(Set dst (MulVL src1 src2));
 6111   ins_cost(100);
 6112   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6113   ins_encode %{
 6114     int vlen_enc = vector_length_encoding(this);
 6115     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6116   %}
 6117   ins_pipe( pipe_slow );
 6118 %}
 6119 
 6120 // Floats vector mul
 6121 instruct vmulF(vec dst, vec src) %{
 6122   predicate(UseAVX == 0);
 6123   match(Set dst (MulVF dst src));
 6124   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6125   ins_encode %{
 6126     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6127   %}
 6128   ins_pipe( pipe_slow );
 6129 %}
 6130 
 6131 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6132   predicate(UseAVX > 0);
 6133   match(Set dst (MulVF src1 src2));
 6134   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6135   ins_encode %{
 6136     int vlen_enc = vector_length_encoding(this);
 6137     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6138   %}
 6139   ins_pipe( pipe_slow );
 6140 %}
 6141 
 6142 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6143   predicate((UseAVX > 0) &&
 6144             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6145   match(Set dst (MulVF src (LoadVector mem)));
 6146   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6147   ins_encode %{
 6148     int vlen_enc = vector_length_encoding(this);
 6149     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6150   %}
 6151   ins_pipe( pipe_slow );
 6152 %}
 6153 
 6154 // Doubles vector mul
 6155 instruct vmulD(vec dst, vec src) %{
 6156   predicate(UseAVX == 0);
 6157   match(Set dst (MulVD dst src));
 6158   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6159   ins_encode %{
 6160     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6161   %}
 6162   ins_pipe( pipe_slow );
 6163 %}
 6164 
 6165 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6166   predicate(UseAVX > 0);
 6167   match(Set dst (MulVD src1 src2));
 6168   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6169   ins_encode %{
 6170     int vlen_enc = vector_length_encoding(this);
 6171     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6172   %}
 6173   ins_pipe( pipe_slow );
 6174 %}
 6175 
 6176 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6177   predicate((UseAVX > 0) &&
 6178             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6179   match(Set dst (MulVD src (LoadVector mem)));
 6180   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6181   ins_encode %{
 6182     int vlen_enc = vector_length_encoding(this);
 6183     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6184   %}
 6185   ins_pipe( pipe_slow );
 6186 %}
 6187 
 6188 // --------------------------------- DIV --------------------------------------
 6189 
 6190 // Floats vector div
 6191 instruct vdivF(vec dst, vec src) %{
 6192   predicate(UseAVX == 0);
 6193   match(Set dst (DivVF dst src));
 6194   format %{ "divps   $dst,$src\t! div packedF" %}
 6195   ins_encode %{
 6196     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6197   %}
 6198   ins_pipe( pipe_slow );
 6199 %}
 6200 
 6201 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6202   predicate(UseAVX > 0);
 6203   match(Set dst (DivVF src1 src2));
 6204   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6205   ins_encode %{
 6206     int vlen_enc = vector_length_encoding(this);
 6207     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6208   %}
 6209   ins_pipe( pipe_slow );
 6210 %}
 6211 
 6212 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6213   predicate((UseAVX > 0) &&
 6214             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6215   match(Set dst (DivVF src (LoadVector mem)));
 6216   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6217   ins_encode %{
 6218     int vlen_enc = vector_length_encoding(this);
 6219     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6220   %}
 6221   ins_pipe( pipe_slow );
 6222 %}
 6223 
 6224 // Doubles vector div
 6225 instruct vdivD(vec dst, vec src) %{
 6226   predicate(UseAVX == 0);
 6227   match(Set dst (DivVD dst src));
 6228   format %{ "divpd   $dst,$src\t! div packedD" %}
 6229   ins_encode %{
 6230     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6231   %}
 6232   ins_pipe( pipe_slow );
 6233 %}
 6234 
 6235 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6236   predicate(UseAVX > 0);
 6237   match(Set dst (DivVD src1 src2));
 6238   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6239   ins_encode %{
 6240     int vlen_enc = vector_length_encoding(this);
 6241     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6242   %}
 6243   ins_pipe( pipe_slow );
 6244 %}
 6245 
 6246 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6247   predicate((UseAVX > 0) &&
 6248             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6249   match(Set dst (DivVD src (LoadVector mem)));
 6250   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6251   ins_encode %{
 6252     int vlen_enc = vector_length_encoding(this);
 6253     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6254   %}
 6255   ins_pipe( pipe_slow );
 6256 %}
 6257 
 6258 // ------------------------------ MinMax ---------------------------------------
 6259 
 6260 // Byte, Short, Int vector Min/Max
 6261 instruct minmax_reg_sse(vec dst, vec src) %{
 6262   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6263             UseAVX == 0);
 6264   match(Set dst (MinV dst src));
 6265   match(Set dst (MaxV dst src));
 6266   format %{ "vector_minmax  $dst,$src\t!  " %}
 6267   ins_encode %{
 6268     assert(UseSSE >= 4, "required");
 6269 
 6270     int opcode = this->ideal_Opcode();
 6271     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6272     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6273   %}
 6274   ins_pipe( pipe_slow );
 6275 %}
 6276 
 6277 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6278   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6279             UseAVX > 0);
 6280   match(Set dst (MinV src1 src2));
 6281   match(Set dst (MaxV src1 src2));
 6282   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6283   ins_encode %{
 6284     int opcode = this->ideal_Opcode();
 6285     int vlen_enc = vector_length_encoding(this);
 6286     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6287 
 6288     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6289   %}
 6290   ins_pipe( pipe_slow );
 6291 %}
 6292 
 6293 // Long vector Min/Max
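      // SSE has no packed-long min/max instruction, so pminmax() emulates it with a compare
      // and a blend; the SSE blend instructions take their mask implicitly in xmm0, which is
      // why the temporary below is pinned to rxmm0.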
 6294 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6295   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6296             UseAVX == 0);
 6297   match(Set dst (MinV dst src));
 6298   match(Set dst (MaxV src dst));
 6299   effect(TEMP dst, TEMP tmp);
 6300   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6301   ins_encode %{
 6302     assert(UseSSE >= 4, "required");
 6303 
 6304     int opcode = this->ideal_Opcode();
 6305     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6306     assert(elem_bt == T_LONG, "sanity");
 6307 
 6308     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6309   %}
 6310   ins_pipe( pipe_slow );
 6311 %}
 6312 
 6313 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6314   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6315             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6316   match(Set dst (MinV src1 src2));
 6317   match(Set dst (MaxV src1 src2));
 6318   effect(TEMP dst);
 6319   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6320   ins_encode %{
 6321     int vlen_enc = vector_length_encoding(this);
 6322     int opcode = this->ideal_Opcode();
 6323     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6324     assert(elem_bt == T_LONG, "sanity");
 6325 
 6326     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6327   %}
 6328   ins_pipe( pipe_slow );
 6329 %}
 6330 
 6331 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6332   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6333             Matcher::vector_element_basic_type(n) == T_LONG);
 6334   match(Set dst (MinV src1 src2));
 6335   match(Set dst (MaxV src1 src2));
 6336   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6337   ins_encode %{
 6338     assert(UseAVX > 2, "required");
 6339 
 6340     int vlen_enc = vector_length_encoding(this);
 6341     int opcode = this->ideal_Opcode();
 6342     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6343     assert(elem_bt == T_LONG, "sanity");
 6344 
 6345     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6346   %}
 6347   ins_pipe( pipe_slow );
 6348 %}
 6349 
 6350 // Float/Double vector Min/Max
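      // Java Math.min/max semantics differ from the raw x86 minps/maxps behaviour: NaN must
      // propagate and -0.0 must order below +0.0.  vminmax_fp()/evminmax_fp() therefore add
      // compare/blend fix-ups, which is what the extra temporaries (and the opmask register
      // on EVEX) are for.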
 6351 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6352   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6353             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6354             UseAVX > 0);
 6355   match(Set dst (MinV a b));
 6356   match(Set dst (MaxV a b));
 6357   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6358   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6359   ins_encode %{
 6360     assert(UseAVX > 0, "required");
 6361 
 6362     int opcode = this->ideal_Opcode();
 6363     int vlen_enc = vector_length_encoding(this);
 6364     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6365 
 6366     __ vminmax_fp(opcode, elem_bt,
 6367                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6368                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6369   %}
 6370   ins_pipe( pipe_slow );
 6371 %}
 6372 
 6373 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6374   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6375             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6376   match(Set dst (MinV a b));
 6377   match(Set dst (MaxV a b));
 6378   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6379   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6380   ins_encode %{
 6381     assert(UseAVX > 2, "required");
 6382 
 6383     int opcode = this->ideal_Opcode();
 6384     int vlen_enc = vector_length_encoding(this);
 6385     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6386 
 6387     __ evminmax_fp(opcode, elem_bt,
 6388                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6389                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6390   %}
 6391   ins_pipe( pipe_slow );
 6392 %}
 6393 
 6394 // ------------------------------ Unsigned vector Min/Max ----------------------
 6395 
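      // Byte/short/int unsigned min/max map directly onto the vpminu*/vpmaxu* instructions.
      // The 64-bit forms (vpminuq/vpmaxuq) need AVX-512VL, so without it the long case below
      // falls back to an emulation that uses the two XMM temporaries.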
 6396 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6397   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6398   match(Set dst (UMinV a b));
 6399   match(Set dst (UMaxV a b));
 6400   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6401   ins_encode %{
 6402     int opcode = this->ideal_Opcode();
 6403     int vlen_enc = vector_length_encoding(this);
 6404     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6405     assert(is_integral_type(elem_bt), "");
 6406     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6407   %}
 6408   ins_pipe( pipe_slow );
 6409 %}
 6410 
 6411 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6412   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6413   match(Set dst (UMinV a (LoadVector b)));
 6414   match(Set dst (UMaxV a (LoadVector b)));
 6415   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6416   ins_encode %{
 6417     int opcode = this->ideal_Opcode();
 6418     int vlen_enc = vector_length_encoding(this);
 6419     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6420     assert(is_integral_type(elem_bt), "");
 6421     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6422   %}
 6423   ins_pipe( pipe_slow );
 6424 %}
 6425 
 6426 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6427   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6428   match(Set dst (UMinV a b));
 6429   match(Set dst (UMaxV a b));
 6430   effect(TEMP xtmp1, TEMP xtmp2);
 6431   format %{ "vector_uminmaxq $dst,$a,$b\t! using xtmp1 and xtmp2 as TEMP" %}
 6432   ins_encode %{
 6433     int opcode = this->ideal_Opcode();
 6434     int vlen_enc = vector_length_encoding(this);
 6435     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6436   %}
 6437   ins_pipe( pipe_slow );
 6438 %}
 6439 
 6440 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6441   match(Set dst (UMinV (Binary dst src2) mask));
 6442   match(Set dst (UMaxV (Binary dst src2) mask));
 6443   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6444   ins_encode %{
 6445     int vlen_enc = vector_length_encoding(this);
 6446     BasicType bt = Matcher::vector_element_basic_type(this);
 6447     int opc = this->ideal_Opcode();
 6448     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6449                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6450   %}
 6451   ins_pipe( pipe_slow );
 6452 %}
 6453 
 6454 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6455   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6456   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6457   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6458   ins_encode %{
 6459     int vlen_enc = vector_length_encoding(this);
 6460     BasicType bt = Matcher::vector_element_basic_type(this);
 6461     int opc = this->ideal_Opcode();
 6462     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6463                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6464   %}
 6465   ins_pipe( pipe_slow );
 6466 %}
 6467 
 6468 // --------------------------------- Signum/CopySign ---------------------------
 6469 
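      // Math.signum returns the argument itself for zero and NaN, and +/-1.0 with the
      // argument's sign otherwise; the 0.0 and 1.0 constants are passed in as operands for
      // signum_fp() to build those results from.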
 6470 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6471   match(Set dst (SignumF dst (Binary zero one)));
 6472   effect(KILL cr);
 6473   format %{ "signumF $dst, $dst" %}
 6474   ins_encode %{
 6475     int opcode = this->ideal_Opcode();
 6476     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6477   %}
 6478   ins_pipe( pipe_slow );
 6479 %}
 6480 
 6481 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6482   match(Set dst (SignumD dst (Binary zero one)));
 6483   effect(KILL cr);
 6484   format %{ "signumD $dst, $dst" %}
 6485   ins_encode %{
 6486     int opcode = this->ideal_Opcode();
 6487     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6488   %}
 6489   ins_pipe( pipe_slow );
 6490 %}
 6491 
 6492 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6493   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6494   match(Set dst (SignumVF src (Binary zero one)));
 6495   match(Set dst (SignumVD src (Binary zero one)));
 6496   effect(TEMP dst, TEMP xtmp1);
 6497   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6498   ins_encode %{
 6499     int opcode = this->ideal_Opcode();
 6500     int vec_enc = vector_length_encoding(this);
 6501     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6502                          $xtmp1$$XMMRegister, vec_enc);
 6503   %}
 6504   ins_pipe( pipe_slow );
 6505 %}
 6506 
 6507 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6508   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6509   match(Set dst (SignumVF src (Binary zero one)));
 6510   match(Set dst (SignumVD src (Binary zero one)));
 6511   effect(TEMP dst, TEMP ktmp1);
 6512   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6513   ins_encode %{
 6514     int opcode = this->ideal_Opcode();
 6515     int vec_enc = vector_length_encoding(this);
 6516     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6517                           $ktmp1$$KRegister, vec_enc);
 6518   %}
 6519   ins_pipe( pipe_slow );
 6520 %}
 6521 
 6522 // ---------------------------------------
 6523 // For copySign use 0xE4 as the truth-table immediate for vpternlog
 6524 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6525 // C (xmm2) is set to 0x7FFFFFFF
 6526 // Wherever xmm2 is 0, we want to pick from B (sign)
 6527 // Wherever xmm2 is 1, we want to pick from A (src)
 6528 //
 6529 // A B C Result
 6530 // 0 0 0 0
 6531 // 0 0 1 0
 6532 // 0 1 0 1
 6533 // 0 1 1 0
 6534 // 1 0 0 0
 6535 // 1 0 1 1
 6536 // 1 1 0 1
 6537 // 1 1 1 1
 6538 //
 6539 // Result going from high bit to low bit is 0b11100100 = 0xe4
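      // (vpternlog forms each result bit by using the three input bits A:B:C as an index into
      // the immediate: imm8 bit (A<<2 | B<<1 | C) is the result.  E.g. the row A=1, B=0, C=1
      // indexes bit 5 of 0xE4 = 0b11100100, which is 1, matching the table above.)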
 6540 // ---------------------------------------
 6541 
 6542 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6543   match(Set dst (CopySignF dst src));
 6544   effect(TEMP tmp1, TEMP tmp2);
 6545   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6546   ins_encode %{
 6547     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6548     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6549     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6550   %}
 6551   ins_pipe( pipe_slow );
 6552 %}
 6553 
 6554 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6555   match(Set dst (CopySignD dst (Binary src zero)));
 6556   ins_cost(100);
 6557   effect(TEMP tmp1, TEMP tmp2);
 6558   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6559   ins_encode %{
 6560     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6561     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6562     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6563   %}
 6564   ins_pipe( pipe_slow );
 6565 %}
 6566 
 6567 //----------------------------- CompressBits/ExpandBits ------------------------
 6568 
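      // CompressBits and ExpandBits map directly onto the BMI2 pext/pdep instructions
      // (parallel bit extract/deposit), in both register-mask and memory-mask forms.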
 6569 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6570   predicate(n->bottom_type()->isa_int());
 6571   match(Set dst (CompressBits src mask));
 6572   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6573   ins_encode %{
 6574     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6575   %}
 6576   ins_pipe( pipe_slow );
 6577 %}
 6578 
 6579 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6580   predicate(n->bottom_type()->isa_int());
 6581   match(Set dst (ExpandBits src mask));
 6582   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6583   ins_encode %{
 6584     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6585   %}
 6586   ins_pipe( pipe_slow );
 6587 %}
 6588 
 6589 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6590   predicate(n->bottom_type()->isa_int());
 6591   match(Set dst (CompressBits src (LoadI mask)));
 6592   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6593   ins_encode %{
 6594     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6595   %}
 6596   ins_pipe( pipe_slow );
 6597 %}
 6598 
 6599 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6600   predicate(n->bottom_type()->isa_int());
 6601   match(Set dst (ExpandBits src (LoadI mask)));
 6602   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6603   ins_encode %{
 6604     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6605   %}
 6606   ins_pipe( pipe_slow );
 6607 %}
 6608 
 6609 // --------------------------------- Sqrt --------------------------------------
 6610 
 6611 instruct vsqrtF_reg(vec dst, vec src) %{
 6612   match(Set dst (SqrtVF src));
 6613   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6614   ins_encode %{
 6615     assert(UseAVX > 0, "required");
 6616     int vlen_enc = vector_length_encoding(this);
 6617     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6618   %}
 6619   ins_pipe( pipe_slow );
 6620 %}
 6621 
 6622 instruct vsqrtF_mem(vec dst, memory mem) %{
 6623   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6624   match(Set dst (SqrtVF (LoadVector mem)));
 6625   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6626   ins_encode %{
 6627     assert(UseAVX > 0, "required");
 6628     int vlen_enc = vector_length_encoding(this);
 6629     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6630   %}
 6631   ins_pipe( pipe_slow );
 6632 %}
 6633 
 6634 // Doubles vector sqrt
 6635 instruct vsqrtD_reg(vec dst, vec src) %{
 6636   match(Set dst (SqrtVD src));
 6637   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6638   ins_encode %{
 6639     assert(UseAVX > 0, "required");
 6640     int vlen_enc = vector_length_encoding(this);
 6641     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6642   %}
 6643   ins_pipe( pipe_slow );
 6644 %}
 6645 
 6646 instruct vsqrtD_mem(vec dst, memory mem) %{
 6647   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6648   match(Set dst (SqrtVD (LoadVector mem)));
 6649   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6650   ins_encode %{
 6651     assert(UseAVX > 0, "required");
 6652     int vlen_enc = vector_length_encoding(this);
 6653     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6654   %}
 6655   ins_pipe( pipe_slow );
 6656 %}
 6657 
 6658 // ------------------------------ Shift ---------------------------------------
 6659 
 6660 // Left and right shift count vectors are the same on x86
 6661 // (only lowest bits of xmm reg are used for count).
 6662 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6663   match(Set dst (LShiftCntV cnt));
 6664   match(Set dst (RShiftCntV cnt));
 6665   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6666   ins_encode %{
 6667     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6668   %}
 6669   ins_pipe( pipe_slow );
 6670 %}
 6671 
 6672 // Byte vector shift
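      // x86 has no byte-granular shift instructions, so byte vectors are widened to shorts,
      // shifted with the word shift, masked back into byte range and re-packed with packuswb.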
 6673 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6674   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6675   match(Set dst ( LShiftVB src shift));
 6676   match(Set dst ( RShiftVB src shift));
 6677   match(Set dst (URShiftVB src shift));
 6678   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6679   format %{"vector_byte_shift $dst,$src,$shift" %}
 6680   ins_encode %{
 6681     assert(UseSSE > 3, "required");
 6682     int opcode = this->ideal_Opcode();
 6683     bool sign = (opcode != Op_URShiftVB);
 6684     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6685     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6686     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6687     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6688     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6689   %}
 6690   ins_pipe( pipe_slow );
 6691 %}
 6692 
 6693 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6694   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6695             UseAVX <= 1);
 6696   match(Set dst ( LShiftVB src shift));
 6697   match(Set dst ( RShiftVB src shift));
 6698   match(Set dst (URShiftVB src shift));
 6699   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6700   format %{"vector_byte_shift $dst,$src,$shift" %}
 6701   ins_encode %{
 6702     assert(UseSSE > 3, "required");
 6703     int opcode = this->ideal_Opcode();
 6704     bool sign = (opcode != Op_URShiftVB);
 6705     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6706     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6707     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6708     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6709     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6710     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6711     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6712     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6713     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6714   %}
 6715   ins_pipe( pipe_slow );
 6716 %}
 6717 
 6718 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6719   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6720             UseAVX > 1);
 6721   match(Set dst ( LShiftVB src shift));
 6722   match(Set dst ( RShiftVB src shift));
 6723   match(Set dst (URShiftVB src shift));
 6724   effect(TEMP dst, TEMP tmp);
 6725   format %{"vector_byte_shift $dst,$src,$shift" %}
 6726   ins_encode %{
 6727     int opcode = this->ideal_Opcode();
 6728     bool sign = (opcode != Op_URShiftVB);
 6729     int vlen_enc = Assembler::AVX_256bit;
 6730     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6731     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6732     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6733     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6734     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6735   %}
 6736   ins_pipe( pipe_slow );
 6737 %}
 6738 
 6739 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6740   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6741   match(Set dst ( LShiftVB src shift));
 6742   match(Set dst ( RShiftVB src shift));
 6743   match(Set dst (URShiftVB src shift));
 6744   effect(TEMP dst, TEMP tmp);
 6745   format %{"vector_byte_shift $dst,$src,$shift" %}
 6746   ins_encode %{
 6747     assert(UseAVX > 1, "required");
 6748     int opcode = this->ideal_Opcode();
 6749     bool sign = (opcode != Op_URShiftVB);
 6750     int vlen_enc = Assembler::AVX_256bit;
 6751     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6752     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6753     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6754     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6755     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6756     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6757     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6758     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6759     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6760   %}
 6761   ins_pipe( pipe_slow );
 6762 %}
 6763 
 6764 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6765   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6766   match(Set dst ( LShiftVB src shift));
 6767   match(Set dst ( RShiftVB src shift));
 6768   match(Set dst (URShiftVB src shift));
 6769   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6770   format %{"vector_byte_shift $dst,$src,$shift" %}
 6771   ins_encode %{
 6772     assert(UseAVX > 2, "required");
 6773     int opcode = this->ideal_Opcode();
 6774     bool sign = (opcode != Op_URShiftVB);
 6775     int vlen_enc = Assembler::AVX_512bit;
 6776     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6777     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6778     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6779     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6780     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6781     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6782     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6783     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6784     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6785     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6786     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6787     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6788   %}
 6789   ins_pipe( pipe_slow );
 6790 %}
 6791 
 6792 // Shorts vector logical right shift produces an incorrect Java result
 6793 // for negative data because Java code converts the short value into an int
 6794 // with sign extension before the shift. But char vectors are fine since
 6795 // chars are unsigned values.
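      // For example, with short s = -1 and a shift of 8, Java computes
      // (short)(((int)s) >>> 8) = (short)0x00FFFFFF = -1, while a 16-bit lane logical shift
      // would produce 0xFFFF >>> 8 = 0x00FF = 255.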
 6796 // Shorts/Chars vector shift
 6797 instruct vshiftS(vec dst, vec src, vec shift) %{
 6798   predicate(!n->as_ShiftV()->is_var_shift());
 6799   match(Set dst ( LShiftVS src shift));
 6800   match(Set dst ( RShiftVS src shift));
 6801   match(Set dst (URShiftVS src shift));
 6802   effect(TEMP dst, USE src, USE shift);
 6803   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6804   ins_encode %{
 6805     int opcode = this->ideal_Opcode();
 6806     if (UseAVX > 0) {
 6807       int vlen_enc = vector_length_encoding(this);
 6808       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6809     } else {
 6810       int vlen = Matcher::vector_length(this);
 6811       if (vlen == 2) {
 6812         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6813         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6814       } else if (vlen == 4) {
 6815         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6816         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6817       } else {
 6818         assert (vlen == 8, "sanity");
 6819         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6820         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6821       }
 6822     }
 6823   %}
 6824   ins_pipe( pipe_slow );
 6825 %}
 6826 
 6827 // Integers vector shift
 6828 instruct vshiftI(vec dst, vec src, vec shift) %{
 6829   predicate(!n->as_ShiftV()->is_var_shift());
 6830   match(Set dst ( LShiftVI src shift));
 6831   match(Set dst ( RShiftVI src shift));
 6832   match(Set dst (URShiftVI src shift));
 6833   effect(TEMP dst, USE src, USE shift);
 6834   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6835   ins_encode %{
 6836     int opcode = this->ideal_Opcode();
 6837     if (UseAVX > 0) {
 6838       int vlen_enc = vector_length_encoding(this);
 6839       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6840     } else {
 6841       int vlen = Matcher::vector_length(this);
 6842       if (vlen == 2) {
 6843         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6844         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6845       } else {
 6846         assert(vlen == 4, "sanity");
 6847         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6848         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6849       }
 6850     }
 6851   %}
 6852   ins_pipe( pipe_slow );
 6853 %}
 6854 
 6855 // Integers vector constant shift
 6856 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6857   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6858   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6859   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6860   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6861   ins_encode %{
 6862     int opcode = this->ideal_Opcode();
 6863     if (UseAVX > 0) {
 6864       int vector_len = vector_length_encoding(this);
 6865       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6866     } else {
 6867       int vlen = Matcher::vector_length(this);
 6868       if (vlen == 2) {
 6869         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6870         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6871       } else {
 6872         assert(vlen == 4, "sanity");
 6873         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6874         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6875       }
 6876     }
 6877   %}
 6878   ins_pipe( pipe_slow );
 6879 %}
 6880 
 6881 // Longs vector shift
 6882 instruct vshiftL(vec dst, vec src, vec shift) %{
 6883   predicate(!n->as_ShiftV()->is_var_shift());
 6884   match(Set dst ( LShiftVL src shift));
 6885   match(Set dst (URShiftVL src shift));
 6886   effect(TEMP dst, USE src, USE shift);
 6887   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6888   ins_encode %{
 6889     int opcode = this->ideal_Opcode();
 6890     if (UseAVX > 0) {
 6891       int vlen_enc = vector_length_encoding(this);
 6892       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6893     } else {
 6894       assert(Matcher::vector_length(this) == 2, "");
 6895       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6896       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6897     }
 6898   %}
 6899   ins_pipe( pipe_slow );
 6900 %}
 6901 
 6902 // Longs vector constant shift
 6903 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6904   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6905   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6906   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6907   ins_encode %{
 6908     int opcode = this->ideal_Opcode();
 6909     if (UseAVX > 0) {
 6910       int vector_len = vector_length_encoding(this);
 6911       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6912     } else {
 6913       assert(Matcher::vector_length(this) == 2, "");
 6914       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6915       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6916     }
 6917   %}
 6918   ins_pipe( pipe_slow );
 6919 %}
 6920 
 6921 // ------------------- ArithmeticRightShift -----------------------------------
 6922 // Long vector arithmetic right shift
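      // Before AVX-512 there is no packed 64-bit arithmetic right shift, so it is emulated as
      // ((x >>> n) ^ m) - m with m = 0x8000000000000000 >>> n, i.e. shift logically and then
      // use the shifted sign mask to restore the sign bits.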
 6923 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6924   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6925   match(Set dst (RShiftVL src shift));
 6926   effect(TEMP dst, TEMP tmp);
 6927   format %{ "vshiftq $dst,$src,$shift" %}
 6928   ins_encode %{
 6929     uint vlen = Matcher::vector_length(this);
 6930     if (vlen == 2) {
 6931       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6932       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6933       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6934       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6935       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6936       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6937     } else {
 6938       assert(vlen == 4, "sanity");
 6939       assert(UseAVX > 1, "required");
 6940       int vlen_enc = Assembler::AVX_256bit;
 6941       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6942       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6943       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6944       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6945       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6946     }
 6947   %}
 6948   ins_pipe( pipe_slow );
 6949 %}
 6950 
 6951 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6952   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6953   match(Set dst (RShiftVL src shift));
 6954   format %{ "vshiftq $dst,$src,$shift" %}
 6955   ins_encode %{
 6956     int vlen_enc = vector_length_encoding(this);
 6957     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6958   %}
 6959   ins_pipe( pipe_slow );
 6960 %}
 6961 
 6962 // ------------------- Variable Shift -----------------------------
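      // Per-element (variable) shift counts need the AVX2 vpsllv*/vpsrlv*/vpsrav* family.
      // There is no byte variant, and the word variant only exists with AVX-512BW, so byte and
      // short vectors are widened, shifted at a wider element size and narrowed back.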
 6963 // Byte variable shift
 6964 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6965   predicate(Matcher::vector_length(n) <= 8 &&
 6966             n->as_ShiftV()->is_var_shift() &&
 6967             !VM_Version::supports_avx512bw());
 6968   match(Set dst ( LShiftVB src shift));
 6969   match(Set dst ( RShiftVB src shift));
 6970   match(Set dst (URShiftVB src shift));
 6971   effect(TEMP dst, TEMP vtmp);
 6972   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6973   ins_encode %{
 6974     assert(UseAVX >= 2, "required");
 6975 
 6976     int opcode = this->ideal_Opcode();
 6977     int vlen_enc = Assembler::AVX_128bit;
 6978     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6979     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6980   %}
 6981   ins_pipe( pipe_slow );
 6982 %}
 6983 
 6984 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6985   predicate(Matcher::vector_length(n) == 16 &&
 6986             n->as_ShiftV()->is_var_shift() &&
 6987             !VM_Version::supports_avx512bw());
 6988   match(Set dst ( LShiftVB src shift));
 6989   match(Set dst ( RShiftVB src shift));
 6990   match(Set dst (URShiftVB src shift));
 6991   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6992   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6993   ins_encode %{
 6994     assert(UseAVX >= 2, "required");
 6995 
 6996     int opcode = this->ideal_Opcode();
 6997     int vlen_enc = Assembler::AVX_128bit;
 6998     // Shift lower half and get word result in dst
 6999     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7000 
 7001     // Shift upper half and get word result in vtmp1
 7002     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7003     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7004     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7005 
 7006     // Merge and down convert the two word results to byte in dst
 7007     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7008   %}
 7009   ins_pipe( pipe_slow );
 7010 %}
 7011 
 7012 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7013   predicate(Matcher::vector_length(n) == 32 &&
 7014             n->as_ShiftV()->is_var_shift() &&
 7015             !VM_Version::supports_avx512bw());
 7016   match(Set dst ( LShiftVB src shift));
 7017   match(Set dst ( RShiftVB src shift));
 7018   match(Set dst (URShiftVB src shift));
 7019   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7020   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7021   ins_encode %{
 7022     assert(UseAVX >= 2, "required");
 7023 
 7024     int opcode = this->ideal_Opcode();
 7025     int vlen_enc = Assembler::AVX_128bit;
 7026     // Process lower 128 bits and get result in dst
 7027     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7028     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7029     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7030     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7031     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7032 
 7033     // Process higher 128 bits and get result in vtmp3
 7034     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7035     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7036     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7037     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7038     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7039     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7040     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7041 
 7042     // Merge the two results in dst
 7043     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7044   %}
 7045   ins_pipe( pipe_slow );
 7046 %}
 7047 
 7048 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7049   predicate(Matcher::vector_length(n) <= 32 &&
 7050             n->as_ShiftV()->is_var_shift() &&
 7051             VM_Version::supports_avx512bw());
 7052   match(Set dst ( LShiftVB src shift));
 7053   match(Set dst ( RShiftVB src shift));
 7054   match(Set dst (URShiftVB src shift));
 7055   effect(TEMP dst, TEMP vtmp);
 7056   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7057   ins_encode %{
 7058     assert(UseAVX > 2, "required");
 7059 
 7060     int opcode = this->ideal_Opcode();
 7061     int vlen_enc = vector_length_encoding(this);
 7062     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7063   %}
 7064   ins_pipe( pipe_slow );
 7065 %}
 7066 
 7067 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7068   predicate(Matcher::vector_length(n) == 64 &&
 7069             n->as_ShiftV()->is_var_shift() &&
 7070             VM_Version::supports_avx512bw());
 7071   match(Set dst ( LShiftVB src shift));
 7072   match(Set dst ( RShiftVB src shift));
 7073   match(Set dst (URShiftVB src shift));
 7074   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7075   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7076   ins_encode %{
 7077     assert(UseAVX > 2, "required");
 7078 
 7079     int opcode = this->ideal_Opcode();
 7080     int vlen_enc = Assembler::AVX_256bit;
 7081     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7082     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7083     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7084     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7085     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7086   %}
 7087   ins_pipe( pipe_slow );
 7088 %}
 7089 
 7090 // Short variable shift
 7091 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7092   predicate(Matcher::vector_length(n) <= 8 &&
 7093             n->as_ShiftV()->is_var_shift() &&
 7094             !VM_Version::supports_avx512bw());
 7095   match(Set dst ( LShiftVS src shift));
 7096   match(Set dst ( RShiftVS src shift));
 7097   match(Set dst (URShiftVS src shift));
 7098   effect(TEMP dst, TEMP vtmp);
 7099   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7100   ins_encode %{
 7101     assert(UseAVX >= 2, "required");
 7102 
 7103     int opcode = this->ideal_Opcode();
 7104     bool sign = (opcode != Op_URShiftVS);
 7105     int vlen_enc = Assembler::AVX_256bit;
 7106     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7107     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7108     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7109     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7110     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7111     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7112   %}
 7113   ins_pipe( pipe_slow );
 7114 %}
 7115 
 7116 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7117   predicate(Matcher::vector_length(n) == 16 &&
 7118             n->as_ShiftV()->is_var_shift() &&
 7119             !VM_Version::supports_avx512bw());
 7120   match(Set dst ( LShiftVS src shift));
 7121   match(Set dst ( RShiftVS src shift));
 7122   match(Set dst (URShiftVS src shift));
 7123   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7124   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7125   ins_encode %{
 7126     assert(UseAVX >= 2, "required");
 7127 
 7128     int opcode = this->ideal_Opcode();
 7129     bool sign = (opcode != Op_URShiftVS);
 7130     int vlen_enc = Assembler::AVX_256bit;
 7131     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7132     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7133     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7134     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7135     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7136 
 7137     // Shift upper half, with result in dst using vtmp1 as TEMP
 7138     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7139     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7140     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7141     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7142     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7143     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7144 
 7145     // Merge lower and upper half result into dst
 7146     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7147     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7148   %}
 7149   ins_pipe( pipe_slow );
 7150 %}
 7151 
 7152 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7153   predicate(n->as_ShiftV()->is_var_shift() &&
 7154             VM_Version::supports_avx512bw());
 7155   match(Set dst ( LShiftVS src shift));
 7156   match(Set dst ( RShiftVS src shift));
 7157   match(Set dst (URShiftVS src shift));
 7158   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7159   ins_encode %{
 7160     assert(UseAVX > 2, "required");
 7161 
 7162     int opcode = this->ideal_Opcode();
 7163     int vlen_enc = vector_length_encoding(this);
 7164     if (!VM_Version::supports_avx512vl()) {
 7165       vlen_enc = Assembler::AVX_512bit;
 7166     }
 7167     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7168   %}
 7169   ins_pipe( pipe_slow );
 7170 %}
 7171 
 7172 // Integer variable shift
 7173 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7174   predicate(n->as_ShiftV()->is_var_shift());
 7175   match(Set dst ( LShiftVI src shift));
 7176   match(Set dst ( RShiftVI src shift));
 7177   match(Set dst (URShiftVI src shift));
 7178   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7179   ins_encode %{
 7180     assert(UseAVX >= 2, "required");
 7181 
 7182     int opcode = this->ideal_Opcode();
 7183     int vlen_enc = vector_length_encoding(this);
 7184     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7185   %}
 7186   ins_pipe( pipe_slow );
 7187 %}
 7188 
 7189 // Long variable shift
 7190 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7191   predicate(n->as_ShiftV()->is_var_shift());
 7192   match(Set dst ( LShiftVL src shift));
 7193   match(Set dst (URShiftVL src shift));
 7194   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7195   ins_encode %{
 7196     assert(UseAVX >= 2, "required");
 7197 
 7198     int opcode = this->ideal_Opcode();
 7199     int vlen_enc = vector_length_encoding(this);
 7200     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7201   %}
 7202   ins_pipe( pipe_slow );
 7203 %}
 7204 
 7205 // Long variable arithmetic right shift
 7206 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7207   predicate(Matcher::vector_length(n) <= 4 &&
 7208             n->as_ShiftV()->is_var_shift() &&
 7209             UseAVX == 2);
 7210   match(Set dst (RShiftVL src shift));
 7211   effect(TEMP dst, TEMP vtmp);
 7212   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7213   ins_encode %{
 7214     int opcode = this->ideal_Opcode();
 7215     int vlen_enc = vector_length_encoding(this);
 7216     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7217                  $vtmp$$XMMRegister);
 7218   %}
 7219   ins_pipe( pipe_slow );
 7220 %}
 7221 
 7222 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7223   predicate(n->as_ShiftV()->is_var_shift() &&
 7224             UseAVX > 2);
 7225   match(Set dst (RShiftVL src shift));
 7226   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 7227   ins_encode %{
 7228     int opcode = this->ideal_Opcode();
 7229     int vlen_enc = vector_length_encoding(this);
 7230     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7231   %}
 7232   ins_pipe( pipe_slow );
 7233 %}
 7234 
 7235 // --------------------------------- AND --------------------------------------
 7236 
 7237 instruct vand(vec dst, vec src) %{
 7238   predicate(UseAVX == 0);
 7239   match(Set dst (AndV dst src));
 7240   format %{ "pand    $dst,$src\t! and vectors" %}
 7241   ins_encode %{
 7242     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7243   %}
 7244   ins_pipe( pipe_slow );
 7245 %}
 7246 
 7247 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7248   predicate(UseAVX > 0);
 7249   match(Set dst (AndV src1 src2));
 7250   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7251   ins_encode %{
 7252     int vlen_enc = vector_length_encoding(this);
 7253     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7254   %}
 7255   ins_pipe( pipe_slow );
 7256 %}
 7257 
 7258 instruct vand_mem(vec dst, vec src, memory mem) %{
 7259   predicate((UseAVX > 0) &&
 7260             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7261   match(Set dst (AndV src (LoadVector mem)));
 7262   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7263   ins_encode %{
 7264     int vlen_enc = vector_length_encoding(this);
 7265     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7266   %}
 7267   ins_pipe( pipe_slow );
 7268 %}
 7269 
 7270 // --------------------------------- OR ---------------------------------------
 7271 
 7272 instruct vor(vec dst, vec src) %{
 7273   predicate(UseAVX == 0);
 7274   match(Set dst (OrV dst src));
 7275   format %{ "por     $dst,$src\t! or vectors" %}
 7276   ins_encode %{
 7277     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7278   %}
 7279   ins_pipe( pipe_slow );
 7280 %}
 7281 
 7282 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7283   predicate(UseAVX > 0);
 7284   match(Set dst (OrV src1 src2));
 7285   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7286   ins_encode %{
 7287     int vlen_enc = vector_length_encoding(this);
 7288     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7289   %}
 7290   ins_pipe( pipe_slow );
 7291 %}
 7292 
 7293 instruct vor_mem(vec dst, vec src, memory mem) %{
 7294   predicate((UseAVX > 0) &&
 7295             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7296   match(Set dst (OrV src (LoadVector mem)));
 7297   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7298   ins_encode %{
 7299     int vlen_enc = vector_length_encoding(this);
 7300     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7301   %}
 7302   ins_pipe( pipe_slow );
 7303 %}
 7304 
 7305 // --------------------------------- XOR --------------------------------------
 7306 
 7307 instruct vxor(vec dst, vec src) %{
 7308   predicate(UseAVX == 0);
 7309   match(Set dst (XorV dst src));
 7310   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7311   ins_encode %{
 7312     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7313   %}
 7314   ins_pipe( pipe_slow );
 7315 %}
 7316 
 7317 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7318   predicate(UseAVX > 0);
 7319   match(Set dst (XorV src1 src2));
 7320   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7321   ins_encode %{
 7322     int vlen_enc = vector_length_encoding(this);
 7323     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7324   %}
 7325   ins_pipe( pipe_slow );
 7326 %}
 7327 
 7328 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7329   predicate((UseAVX > 0) &&
 7330             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7331   match(Set dst (XorV src (LoadVector mem)));
 7332   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7333   ins_encode %{
 7334     int vlen_enc = vector_length_encoding(this);
 7335     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7336   %}
 7337   ins_pipe( pipe_slow );
 7338 %}
 7339 
 7340 // --------------------------------- VectorCast --------------------------------------
 7341 
 7342 instruct vcastBtoX(vec dst, vec src) %{
 7343   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7344   match(Set dst (VectorCastB2X src));
 7345   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7346   ins_encode %{
 7347     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7348     int vlen_enc = vector_length_encoding(this);
 7349     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7350   %}
 7351   ins_pipe( pipe_slow );
 7352 %}
 7353 
 7354 instruct vcastBtoD(legVec dst, legVec src) %{
 7355   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7356   match(Set dst (VectorCastB2X src));
 7357   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7358   ins_encode %{
 7359     int vlen_enc = vector_length_encoding(this);
 7360     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7361   %}
 7362   ins_pipe( pipe_slow );
 7363 %}
 7364 
 7365 instruct castStoX(vec dst, vec src) %{
 7366   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7367             Matcher::vector_length(n->in(1)) <= 8 && // src
 7368             Matcher::vector_element_basic_type(n) == T_BYTE);
 7369   match(Set dst (VectorCastS2X src));
 7370   format %{ "vector_cast_s2x $dst,$src" %}
 7371   ins_encode %{
 7372     assert(UseAVX > 0, "required");
 7373 
 7374     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7375     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7376   %}
 7377   ins_pipe( pipe_slow );
 7378 %}
 7379 
 7380 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7381   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7382             Matcher::vector_length(n->in(1)) == 16 && // src
 7383             Matcher::vector_element_basic_type(n) == T_BYTE);
 7384   effect(TEMP dst, TEMP vtmp);
 7385   match(Set dst (VectorCastS2X src));
 7386   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7387   ins_encode %{
 7388     assert(UseAVX > 0, "required");
 7389 
 7390     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7391     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7392     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7393     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7394   %}
 7395   ins_pipe( pipe_slow );
 7396 %}
 7397 
 7398 instruct vcastStoX_evex(vec dst, vec src) %{
 7399   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7400             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7401   match(Set dst (VectorCastS2X src));
 7402   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7403   ins_encode %{
 7404     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7405     int src_vlen_enc = vector_length_encoding(this, $src);
 7406     int vlen_enc = vector_length_encoding(this);
 7407     switch (to_elem_bt) {
 7408       case T_BYTE:
 7409         if (!VM_Version::supports_avx512vl()) {
 7410           vlen_enc = Assembler::AVX_512bit;
 7411         }
 7412         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7413         break;
 7414       case T_INT:
 7415         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7416         break;
 7417       case T_FLOAT:
 7418         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7419         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7420         break;
 7421       case T_LONG:
 7422         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7423         break;
 7424       case T_DOUBLE: {
 7425         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7426         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7427         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7428         break;
 7429       }
 7430       default:
 7431         ShouldNotReachHere();
 7432     }
 7433   %}
 7434   ins_pipe( pipe_slow );
 7435 %}
 7436 
 7437 instruct castItoX(vec dst, vec src) %{
 7438   predicate(UseAVX <= 2 &&
 7439             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7440             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7441   match(Set dst (VectorCastI2X src));
 7442   format %{ "vector_cast_i2x $dst,$src" %}
 7443   ins_encode %{
 7444     assert(UseAVX > 0, "required");
 7445 
 7446     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7447     int vlen_enc = vector_length_encoding(this, $src);
 7448 
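    // Narrow ints by clearing the bits above the target element size, then packing with
    // unsigned saturation; since the register is packed with itself, only the low lanes
    // of $dst hold the result.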
 7449     if (to_elem_bt == T_BYTE) {
 7450       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7451       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7452       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7453     } else {
 7454       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7455       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7456       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7457     }
 7458   %}
 7459   ins_pipe( pipe_slow );
 7460 %}
 7461 
 7462 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7463   predicate(UseAVX <= 2 &&
 7464             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7465             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7466   match(Set dst (VectorCastI2X src));
 7467   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7468   effect(TEMP dst, TEMP vtmp);
 7469   ins_encode %{
 7470     assert(UseAVX > 0, "required");
 7471 
 7472     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7473     int vlen_enc = vector_length_encoding(this, $src);
 7474 
 7475     if (to_elem_bt == T_BYTE) {
 7476       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7477       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7478       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7479       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7480     } else {
 7481       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7482       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7483       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7484       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7485     }
 7486   %}
 7487   ins_pipe( pipe_slow );
 7488 %}
 7489 
 7490 instruct vcastItoX_evex(vec dst, vec src) %{
 7491   predicate(UseAVX > 2 ||
 7492             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7493   match(Set dst (VectorCastI2X src));
 7494   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7495   ins_encode %{
 7496     assert(UseAVX > 0, "required");
 7497 
 7498     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7499     int src_vlen_enc = vector_length_encoding(this, $src);
 7500     int dst_vlen_enc = vector_length_encoding(this);
 7501     switch (dst_elem_bt) {
 7502       case T_BYTE:
 7503         if (!VM_Version::supports_avx512vl()) {
 7504           src_vlen_enc = Assembler::AVX_512bit;
 7505         }
 7506         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7507         break;
 7508       case T_SHORT:
 7509         if (!VM_Version::supports_avx512vl()) {
 7510           src_vlen_enc = Assembler::AVX_512bit;
 7511         }
 7512         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7513         break;
 7514       case T_FLOAT:
 7515         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7516         break;
 7517       case T_LONG:
 7518         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7519         break;
 7520       case T_DOUBLE:
 7521         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7522         break;
 7523       default:
 7524         ShouldNotReachHere();
 7525     }
 7526   %}
 7527   ins_pipe( pipe_slow );
 7528 %}
 7529 
 7530 instruct vcastLtoBS(vec dst, vec src) %{
 7531   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7532             UseAVX <= 2);
 7533   match(Set dst (VectorCastL2X src));
 7534   format %{ "vector_cast_l2x  $dst,$src" %}
 7535   ins_encode %{
 7536     assert(UseAVX > 0, "required");
 7537 
 7538     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7539     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7540     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7541                                                       : ExternalAddress(vector_int_to_short_mask());
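    // Collect the low 32 bits of every long into the bottom lanes (vpshufd for 16-byte
    // vectors, vpermilps + vpermpd for 32-byte ones), then narrow those ints with the
    // usual mask-and-pack sequence.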
 7542     if (vlen <= 16) {
 7543       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7544       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7545       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7546     } else {
 7547       assert(vlen <= 32, "required");
 7548       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7549       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7550       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7551       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7552     }
 7553     if (to_elem_bt == T_BYTE) {
 7554       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7555     }
 7556   %}
 7557   ins_pipe( pipe_slow );
 7558 %}
 7559 
 7560 instruct vcastLtoX_evex(vec dst, vec src) %{
 7561   predicate(UseAVX > 2 ||
 7562             (Matcher::vector_element_basic_type(n) == T_INT ||
 7563              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7564              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7565   match(Set dst (VectorCastL2X src));
 7566   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7567   ins_encode %{
 7568     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7569     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7570     int vlen_enc = vector_length_encoding(this, $src);
 7571     switch (to_elem_bt) {
 7572       case T_BYTE:
 7573         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7574           vlen_enc = Assembler::AVX_512bit;
 7575         }
 7576         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7577         break;
 7578       case T_SHORT:
 7579         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7580           vlen_enc = Assembler::AVX_512bit;
 7581         }
 7582         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7583         break;
 7584       case T_INT:
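        // Long -> int keeps the low dword of every long; the sequence used depends on
        // how many lanes have to be compacted.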
 7585         if (vlen == 8) {
 7586           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7587             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7588           }
 7589         } else if (vlen == 16) {
 7590           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7591         } else if (vlen == 32) {
 7592           if (UseAVX > 2) {
 7593             if (!VM_Version::supports_avx512vl()) {
 7594               vlen_enc = Assembler::AVX_512bit;
 7595             }
 7596             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7597           } else {
 7598             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7599             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7600           }
 7601         } else { // vlen == 64
 7602           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7603         }
 7604         break;
 7605       case T_FLOAT:
 7606         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7607         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7608         break;
 7609       case T_DOUBLE:
 7610         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7611         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7612         break;
 7613 
 7614       default: assert(false, "%s", type2name(to_elem_bt));
 7615     }
 7616   %}
 7617   ins_pipe( pipe_slow );
 7618 %}
 7619 
 7620 instruct vcastFtoD_reg(vec dst, vec src) %{
 7621   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7622   match(Set dst (VectorCastF2X src));
 7623   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7624   ins_encode %{
 7625     int vlen_enc = vector_length_encoding(this);
 7626     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7627   %}
 7628   ins_pipe( pipe_slow );
 7629 %}
 7630 
 7631 
 7632 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7633   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7634             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7635   match(Set dst (VectorCastF2X src));
 7636   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7637   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7638   ins_encode %{
 7639     int vlen_enc = vector_length_encoding(this, $src);
 7640     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register when loading addresses
    // that are more than 32 bits away in register-indirect addressing mode, since stub
    // constants live in the code cache and ReservedCodeCacheSize is currently capped at 2G.
    // Targets are free to raise that limit, but a code cache larger than 2G is unrealistic
    // in practice; on the upside, the cap saves a temporary register allocation, which in
    // the limiting case can prevent spilling in blocks with high register pressure.
 7648     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7649                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7650                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7651   %}
 7652   ins_pipe( pipe_slow );
 7653 %}
 7654 
 7655 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7656   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7657             is_integral_type(Matcher::vector_element_basic_type(n)));
 7658   match(Set dst (VectorCastF2X src));
 7659   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7660   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7661   ins_encode %{
 7662     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7663     if (to_elem_bt == T_LONG) {
 7664       int vlen_enc = vector_length_encoding(this);
 7665       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7666                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7667                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7668     } else {
 7669       int vlen_enc = vector_length_encoding(this, $src);
 7670       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7671                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7672                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7673     }
 7674   %}
 7675   ins_pipe( pipe_slow );
 7676 %}
 7677 
 7678 instruct vcastDtoF_reg(vec dst, vec src) %{
 7679   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7680   match(Set dst (VectorCastD2X src));
 7681   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7682   ins_encode %{
 7683     int vlen_enc = vector_length_encoding(this, $src);
 7684     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7685   %}
 7686   ins_pipe( pipe_slow );
 7687 %}
 7688 
 7689 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7690   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7691             is_integral_type(Matcher::vector_element_basic_type(n)));
 7692   match(Set dst (VectorCastD2X src));
 7693   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7694   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7695   ins_encode %{
 7696     int vlen_enc = vector_length_encoding(this, $src);
 7697     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7698     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7699                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7700                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7701   %}
 7702   ins_pipe( pipe_slow );
 7703 %}
 7704 
 7705 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7706   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7707             is_integral_type(Matcher::vector_element_basic_type(n)));
 7708   match(Set dst (VectorCastD2X src));
 7709   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7710   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7711   ins_encode %{
 7712     int vlen_enc = vector_length_encoding(this, $src);
 7713     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7714     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7715                               ExternalAddress(vector_float_signflip());
 7716     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7717                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7718   %}
 7719   ins_pipe( pipe_slow );
 7720 %}
 7721 
 7722 instruct vucast(vec dst, vec src) %{
 7723   match(Set dst (VectorUCastB2X src));
 7724   match(Set dst (VectorUCastS2X src));
 7725   match(Set dst (VectorUCastI2X src));
 7726   format %{ "vector_ucast $dst,$src\t!" %}
 7727   ins_encode %{
 7728     assert(UseAVX > 0, "required");
 7729 
 7730     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7731     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7732     int vlen_enc = vector_length_encoding(this);
 7733     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7734   %}
 7735   ins_pipe( pipe_slow );
 7736 %}
 7737 
 7738 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7739   predicate(!VM_Version::supports_avx512vl() &&
 7740             Matcher::vector_length_in_bytes(n) < 64 &&
 7741             Matcher::vector_element_basic_type(n) == T_INT);
 7742   match(Set dst (RoundVF src));
 7743   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7744   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7745   ins_encode %{
 7746     int vlen_enc = vector_length_encoding(this);
 7747     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7748     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7749                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7750                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7751   %}
 7752   ins_pipe( pipe_slow );
 7753 %}
 7754 
 7755 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7756   predicate((VM_Version::supports_avx512vl() ||
 7757              Matcher::vector_length_in_bytes(n) == 64) &&
 7758              Matcher::vector_element_basic_type(n) == T_INT);
 7759   match(Set dst (RoundVF src));
 7760   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7761   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7762   ins_encode %{
 7763     int vlen_enc = vector_length_encoding(this);
 7764     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7765     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7766                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7767                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7768   %}
 7769   ins_pipe( pipe_slow );
 7770 %}
 7771 
 7772 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7773   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7774   match(Set dst (RoundVD src));
 7775   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7776   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7777   ins_encode %{
 7778     int vlen_enc = vector_length_encoding(this);
 7779     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7780     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7781                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7782                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7783   %}
 7784   ins_pipe( pipe_slow );
 7785 %}
 7786 
 7787 // --------------------------------- VectorMaskCmp --------------------------------------
 7788 
 7789 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7790   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7791             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7792             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7793             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7794   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7795   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7796   ins_encode %{
 7797     int vlen_enc = vector_length_encoding(this, $src1);
 7798     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7799     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7800       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7801     } else {
 7802       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7803     }
 7804   %}
 7805   ins_pipe( pipe_slow );
 7806 %}
 7807 
 7808 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7809   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7810             n->bottom_type()->isa_vectmask() == nullptr &&
 7811             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7812   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7813   effect(TEMP ktmp);
 7814   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7815   ins_encode %{
 7816     int vlen_enc = Assembler::AVX_512bit;
 7817     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7818     KRegister mask = k0; // The comparison itself is not being masked.
 7819     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7820       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7821       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7822     } else {
 7823       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7824       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7825     }
 7826   %}
 7827   ins_pipe( pipe_slow );
 7828 %}
 7829 
 7830 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7831   predicate(n->bottom_type()->isa_vectmask() &&
 7832             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7833   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7834   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7835   ins_encode %{
 7836     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7837     int vlen_enc = vector_length_encoding(this, $src1);
 7838     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7839     KRegister mask = k0; // The comparison itself is not being masked.
 7840     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7841       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7842     } else {
 7843       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7844     }
 7845   %}
 7846   ins_pipe( pipe_slow );
 7847 %}
 7848 
 7849 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7850   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7851             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7852             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7853             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7854             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7855             (n->in(2)->get_int() == BoolTest::eq ||
 7856              n->in(2)->get_int() == BoolTest::lt ||
 7857              n->in(2)->get_int() == BoolTest::gt)); // cond
 7858   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7859   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7860   ins_encode %{
 7861     int vlen_enc = vector_length_encoding(this, $src1);
 7862     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7863     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7864     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7865   %}
 7866   ins_pipe( pipe_slow );
 7867 %}
 7868 
 7869 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7870   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7871             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7872             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7873             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7874             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7875             (n->in(2)->get_int() == BoolTest::ne ||
 7876              n->in(2)->get_int() == BoolTest::le ||
 7877              n->in(2)->get_int() == BoolTest::ge)); // cond
 7878   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7879   effect(TEMP dst, TEMP xtmp);
 7880   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7881   ins_encode %{
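    // ne/le/ge have no direct packed-compare encoding; the complementary predicate is
    // computed and the result is negated, which is why $xtmp is needed as scratch
    // (vcmp_direct above passes xnoreg instead).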
 7882     int vlen_enc = vector_length_encoding(this, $src1);
 7883     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7884     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7885     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7886   %}
 7887   ins_pipe( pipe_slow );
 7888 %}
 7889 
 7890 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7891   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7892             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7893             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7894             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7895             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7896   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7897   effect(TEMP dst, TEMP xtmp);
 7898   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7899   ins_encode %{
 7900     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7901     int vlen_enc = vector_length_encoding(this, $src1);
 7902     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7903     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7904 
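    // There is no unsigned packed compare before AVX-512, so flip the sign bit of every
    // element in both operands and do a signed compare instead:
    //   x <u y  <=>  (x ^ SIGN_BIT) <s (y ^ SIGN_BIT)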
 7905     if (vlen_enc == Assembler::AVX_128bit) {
 7906       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7907     } else {
 7908       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7909     }
 7910     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7911     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7912     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7913   %}
 7914   ins_pipe( pipe_slow );
 7915 %}
 7916 
 7917 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7918   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7919              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7920              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7921   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7922   effect(TEMP ktmp);
 7923   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7924   ins_encode %{
 7925     assert(UseAVX > 2, "required");
 7926 
 7927     int vlen_enc = vector_length_encoding(this, $src1);
 7928     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7929     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7930     KRegister mask = k0; // The comparison itself is not being masked.
 7931     bool merge = false;
 7932     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7933 
 7934     switch (src1_elem_bt) {
 7935       case T_INT: {
 7936         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7937         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7938         break;
 7939       }
 7940       case T_LONG: {
 7941         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7942         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7943         break;
 7944       }
 7945       default: assert(false, "%s", type2name(src1_elem_bt));
 7946     }
 7947   %}
 7948   ins_pipe( pipe_slow );
 7949 %}
 7950 
 7951 
 7952 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7953   predicate(n->bottom_type()->isa_vectmask() &&
 7954             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7955   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7956   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7957   ins_encode %{
 7958     assert(UseAVX > 2, "required");
 7959     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7960 
 7961     int vlen_enc = vector_length_encoding(this, $src1);
 7962     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7963     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7964     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7965 
    // The comparison itself is not being masked (k0 is used below).
 7967     switch (src1_elem_bt) {
 7968       case T_BYTE: {
 7969         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7970         break;
 7971       }
 7972       case T_SHORT: {
 7973         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7974         break;
 7975       }
 7976       case T_INT: {
 7977         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7978         break;
 7979       }
 7980       case T_LONG: {
 7981         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7982         break;
 7983       }
 7984       default: assert(false, "%s", type2name(src1_elem_bt));
 7985     }
 7986   %}
 7987   ins_pipe( pipe_slow );
 7988 %}
 7989 
 7990 // Extract
 7991 
 7992 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7993   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7994   match(Set dst (ExtractI src idx));
 7995   match(Set dst (ExtractS src idx));
 7996   match(Set dst (ExtractB src idx));
 7997   format %{ "extractI $dst,$src,$idx\t!" %}
 7998   ins_encode %{
 7999     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8000 
 8001     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8002     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8003   %}
 8004   ins_pipe( pipe_slow );
 8005 %}
 8006 
 8007 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8008   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8009             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8010   match(Set dst (ExtractI src idx));
 8011   match(Set dst (ExtractS src idx));
 8012   match(Set dst (ExtractB src idx));
 8013   effect(TEMP vtmp);
 8014   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8015   ins_encode %{
 8016     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8017 
 8018     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8019     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8020     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8021   %}
 8022   ins_pipe( pipe_slow );
 8023 %}
 8024 
 8025 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8026   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8027   match(Set dst (ExtractL src idx));
 8028   format %{ "extractL $dst,$src,$idx\t!" %}
 8029   ins_encode %{
 8030     assert(UseSSE >= 4, "required");
 8031     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8032 
 8033     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8034   %}
 8035   ins_pipe( pipe_slow );
 8036 %}
 8037 
 8038 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8039   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8040             Matcher::vector_length(n->in(1)) == 8);  // src
 8041   match(Set dst (ExtractL src idx));
 8042   effect(TEMP vtmp);
 8043   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8044   ins_encode %{
 8045     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8046 
 8047     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8048     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8049   %}
 8050   ins_pipe( pipe_slow );
 8051 %}
 8052 
 8053 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8054   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8055   match(Set dst (ExtractF src idx));
 8056   effect(TEMP dst, TEMP vtmp);
 8057   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8058   ins_encode %{
 8059     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8060 
 8061     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8062   %}
 8063   ins_pipe( pipe_slow );
 8064 %}
 8065 
 8066 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8067   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8068             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8069   match(Set dst (ExtractF src idx));
 8070   effect(TEMP vtmp);
 8071   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8072   ins_encode %{
 8073     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8074 
 8075     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8076     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8077   %}
 8078   ins_pipe( pipe_slow );
 8079 %}
 8080 
 8081 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8082   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8083   match(Set dst (ExtractD src idx));
 8084   format %{ "extractD $dst,$src,$idx\t!" %}
 8085   ins_encode %{
 8086     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8087 
 8088     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8089   %}
 8090   ins_pipe( pipe_slow );
 8091 %}
 8092 
 8093 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8094   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8095             Matcher::vector_length(n->in(1)) == 8);  // src
 8096   match(Set dst (ExtractD src idx));
 8097   effect(TEMP vtmp);
 8098   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8099   ins_encode %{
 8100     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8101 
 8102     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8103     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8104   %}
 8105   ins_pipe( pipe_slow );
 8106 %}
 8107 
 8108 // --------------------------------- Vector Blend --------------------------------------
 8109 
 8110 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8111   predicate(UseAVX == 0);
 8112   match(Set dst (VectorBlend (Binary dst src) mask));
 8113   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8114   effect(TEMP tmp);
 8115   ins_encode %{
 8116     assert(UseSSE >= 4, "required");
 8117 
 8118     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8119       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8120     }
 8121     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8122   %}
 8123   ins_pipe( pipe_slow );
 8124 %}
 8125 
 8126 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8127   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8128             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8129             Matcher::vector_length_in_bytes(n) <= 32 &&
 8130             is_integral_type(Matcher::vector_element_basic_type(n)));
 8131   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8132   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8133   ins_encode %{
 8134     int vlen_enc = vector_length_encoding(this);
 8135     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8136   %}
 8137   ins_pipe( pipe_slow );
 8138 %}
 8139 
 8140 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8141   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8142             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8143             Matcher::vector_length_in_bytes(n) <= 32 &&
 8144             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8145   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8146   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8147   ins_encode %{
 8148     int vlen_enc = vector_length_encoding(this);
 8149     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8150   %}
 8151   ins_pipe( pipe_slow );
 8152 %}
 8153 
 8154 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8155   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8156             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8157             Matcher::vector_length_in_bytes(n) <= 32);
 8158   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8159   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8160   effect(TEMP vtmp, TEMP dst);
 8161   ins_encode %{
 8162     int vlen_enc = vector_length_encoding(this);
 8163     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8164     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8165     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8166   %}
 8167   ins_pipe( pipe_slow );
 8168 %}
 8169 
 8170 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8171   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8172             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8173   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8175   effect(TEMP ktmp);
 8176   ins_encode %{
 8177      int vlen_enc = Assembler::AVX_512bit;
 8178      BasicType elem_bt = Matcher::vector_element_basic_type(this);
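    // Turn the vector mask (0 / all-ones per lane) into a k-register by comparing it
    // against the all-bits-set constant, then use a merging blend to select between
    // $src1 and $src2 under that mask.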
 8179     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8180     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8181   %}
 8182   ins_pipe( pipe_slow );
 8183 %}
 8184 
 8185 
 8186 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8187   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8188             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8189              VM_Version::supports_avx512bw()));
 8190   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8192   ins_encode %{
 8193     int vlen_enc = vector_length_encoding(this);
 8194     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8195     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8196   %}
 8197   ins_pipe( pipe_slow );
 8198 %}
 8199 
 8200 // --------------------------------- ABS --------------------------------------
 8201 // a = |a|
 8202 instruct vabsB_reg(vec dst, vec src) %{
 8203   match(Set dst (AbsVB  src));
 8204   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8205   ins_encode %{
 8206     uint vlen = Matcher::vector_length(this);
 8207     if (vlen <= 16) {
 8208       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8209     } else {
 8210       int vlen_enc = vector_length_encoding(this);
 8211       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8212     }
 8213   %}
 8214   ins_pipe( pipe_slow );
 8215 %}
 8216 
 8217 instruct vabsS_reg(vec dst, vec src) %{
 8218   match(Set dst (AbsVS  src));
 8219   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8220   ins_encode %{
 8221     uint vlen = Matcher::vector_length(this);
 8222     if (vlen <= 8) {
 8223       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8224     } else {
 8225       int vlen_enc = vector_length_encoding(this);
 8226       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8227     }
 8228   %}
 8229   ins_pipe( pipe_slow );
 8230 %}
 8231 
 8232 instruct vabsI_reg(vec dst, vec src) %{
 8233   match(Set dst (AbsVI  src));
 8234   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8235   ins_encode %{
 8236     uint vlen = Matcher::vector_length(this);
 8237     if (vlen <= 4) {
 8238       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8239     } else {
 8240       int vlen_enc = vector_length_encoding(this);
 8241       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8242     }
 8243   %}
 8244   ins_pipe( pipe_slow );
 8245 %}
 8246 
 8247 instruct vabsL_reg(vec dst, vec src) %{
 8248   match(Set dst (AbsVL  src));
 8249   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8250   ins_encode %{
 8251     assert(UseAVX > 2, "required");
 8252     int vlen_enc = vector_length_encoding(this);
 8253     if (!VM_Version::supports_avx512vl()) {
 8254       vlen_enc = Assembler::AVX_512bit;
 8255     }
 8256     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8257   %}
 8258   ins_pipe( pipe_slow );
 8259 %}
 8260 
 8261 // --------------------------------- ABSNEG --------------------------------------
 8262 
 8263 instruct vabsnegF(vec dst, vec src) %{
 8264   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8265   match(Set dst (AbsVF src));
 8266   match(Set dst (NegVF src));
 8267   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8268   ins_cost(150);
 8269   ins_encode %{
 8270     int opcode = this->ideal_Opcode();
 8271     int vlen = Matcher::vector_length(this);
 8272     if (vlen == 2) {
 8273       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8274     } else {
 8275       assert(vlen == 8 || vlen == 16, "required");
 8276       int vlen_enc = vector_length_encoding(this);
 8277       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8278     }
 8279   %}
 8280   ins_pipe( pipe_slow );
 8281 %}
 8282 
 8283 instruct vabsneg4F(vec dst) %{
 8284   predicate(Matcher::vector_length(n) == 4);
 8285   match(Set dst (AbsVF dst));
 8286   match(Set dst (NegVF dst));
 8287   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8288   ins_cost(150);
 8289   ins_encode %{
 8290     int opcode = this->ideal_Opcode();
 8291     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8292   %}
 8293   ins_pipe( pipe_slow );
 8294 %}
 8295 
 8296 instruct vabsnegD(vec dst, vec src) %{
 8297   match(Set dst (AbsVD  src));
 8298   match(Set dst (NegVD  src));
 8299   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8300   ins_encode %{
 8301     int opcode = this->ideal_Opcode();
 8302     uint vlen = Matcher::vector_length(this);
 8303     if (vlen == 2) {
 8304       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8305     } else {
 8306       int vlen_enc = vector_length_encoding(this);
 8307       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8308     }
 8309   %}
 8310   ins_pipe( pipe_slow );
 8311 %}
 8312 
 8313 //------------------------------------- VectorTest --------------------------------------------
 8314 
 8315 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8316   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8317   match(Set cr (VectorTest src1 src2));
 8318   effect(TEMP vtmp);
 8319   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8320   ins_encode %{
 8321     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8322     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8323     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8324   %}
 8325   ins_pipe( pipe_slow );
 8326 %}
 8327 
 8328 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8329   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8330   match(Set cr (VectorTest src1 src2));
 8331   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8332   ins_encode %{
 8333     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8334     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8335     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8336   %}
 8337   ins_pipe( pipe_slow );
 8338 %}
 8339 
 8340 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8341   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8342              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8343             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8344   match(Set cr (VectorTest src1 src2));
 8345   effect(TEMP tmp);
 8346   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8347   ins_encode %{
 8348     uint masklen = Matcher::vector_length(this, $src1);
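    // All-true test: move the mask bits to a GPR, discard bits beyond masklen, and
    // compare against the all-ones pattern so the flags reflect whether every lane is set.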
 8349     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8350     __ andl($tmp$$Register, (1 << masklen) - 1);
 8351     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8352   %}
 8353   ins_pipe( pipe_slow );
 8354 %}
 8355 
 8356 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8357   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8358              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8359             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8360   match(Set cr (VectorTest src1 src2));
 8361   effect(TEMP tmp);
 8362   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8363   ins_encode %{
 8364     uint masklen = Matcher::vector_length(this, $src1);
 8365     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8366     __ andl($tmp$$Register, (1 << masklen) - 1);
 8367   %}
 8368   ins_pipe( pipe_slow );
 8369 %}
 8370 
 8371 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8372   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8373             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8374   match(Set cr (VectorTest src1 src2));
 8375   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8376   ins_encode %{
 8377     uint masklen = Matcher::vector_length(this, $src1);
 8378     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8379   %}
 8380   ins_pipe( pipe_slow );
 8381 %}
 8382 
 8383 //------------------------------------- LoadMask --------------------------------------------
 8384 
 8385 instruct loadMask(legVec dst, legVec src) %{
 8386   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8387   match(Set dst (VectorLoadMask src));
 8388   effect(TEMP dst);
 8389   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8390   ins_encode %{
 8391     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8392     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8393     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8394   %}
 8395   ins_pipe( pipe_slow );
 8396 %}
 8397 
 8398 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8399   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8400   match(Set dst (VectorLoadMask src));
 8401   effect(TEMP xtmp);
 8402   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8403   ins_encode %{
 8404     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8405                         true, Assembler::AVX_512bit);
 8406   %}
 8407   ins_pipe( pipe_slow );
 8408 %}
 8409 
 8410 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8411   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8412   match(Set dst (VectorLoadMask src));
 8413   effect(TEMP xtmp);
 8414   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8415   ins_encode %{
 8416     int vlen_enc = vector_length_encoding(in(1));
 8417     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8418                         false, vlen_enc);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8423 //------------------------------------- StoreMask --------------------------------------------
 8424 
 8425 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8426   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8427   match(Set dst (VectorStoreMask src size));
 8428   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8429   ins_encode %{
 8430     int vlen = Matcher::vector_length(this);
 8431     if (vlen <= 16 && UseAVX <= 2) {
 8432       assert(UseSSE >= 3, "required");
 8433       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8434     } else {
 8435       assert(UseAVX > 0, "required");
 8436       int src_vlen_enc = vector_length_encoding(this, $src);
 8437       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8438     }
 8439   %}
 8440   ins_pipe( pipe_slow );
 8441 %}
 8442 
 8443 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8444   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8445   match(Set dst (VectorStoreMask src size));
 8446   effect(TEMP_DEF dst, TEMP xtmp);
 8447   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8448   ins_encode %{
 8449     int vlen_enc = Assembler::AVX_128bit;
 8450     int vlen = Matcher::vector_length(this);
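    // The incoming mask lanes are 0 or -1; pabsw/vpabsb canonicalize them to the 0/1
    // bytes that VectorStoreMask is expected to produce once the lanes are narrowed.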
 8451     if (vlen <= 8) {
 8452       assert(UseSSE >= 3, "required");
 8453       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8454       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8455       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8456     } else {
 8457       assert(UseAVX > 0, "required");
 8458       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8459       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8460       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8461     }
 8462   %}
 8463   ins_pipe( pipe_slow );
 8464 %}
 8465 
 8466 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8467   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8468   match(Set dst (VectorStoreMask src size));
 8469   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8470   effect(TEMP_DEF dst, TEMP xtmp);
 8471   ins_encode %{
 8472     int vlen_enc = Assembler::AVX_128bit;
 8473     int vlen = Matcher::vector_length(this);
 8474     if (vlen <= 4) {
 8475       assert(UseSSE >= 3, "required");
 8476       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8477       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8478       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8479       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8480     } else {
 8481       assert(UseAVX > 0, "required");
 8482       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8483       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8484       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8485       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8486       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8487     }
 8488   %}
 8489   ins_pipe( pipe_slow );
 8490 %}
 8491 
 8492 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8493   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8494   match(Set dst (VectorStoreMask src size));
 8495   effect(TEMP_DEF dst, TEMP xtmp);
 8496   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8497   ins_encode %{
 8498     assert(UseSSE >= 3, "required");
 8499     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8500     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8501     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8502     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8503     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8504   %}
 8505   ins_pipe( pipe_slow );
 8506 %}
 8507 
 8508 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8509   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8510   match(Set dst (VectorStoreMask src size));
 8511   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8512   effect(TEMP_DEF dst, TEMP vtmp);
 8513   ins_encode %{
 8514     int vlen_enc = Assembler::AVX_128bit;
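    // Gather the low dword of each of the four longs into the bottom 128 bits
    // (shuffle, extract the upper lane, blend), then narrow to bytes with saturating
    // packs and canonicalize the -1 lanes to 1 with vpabsb.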
 8515     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8516     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8517     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8518     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8519     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8520     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8521     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8522   %}
 8523   ins_pipe( pipe_slow );
 8524 %}
 8525 
 8526 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8527   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8528   match(Set dst (VectorStoreMask src size));
 8529   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8530   ins_encode %{
 8531     int src_vlen_enc = vector_length_encoding(this, $src);
 8532     int dst_vlen_enc = vector_length_encoding(this);
 8533     if (!VM_Version::supports_avx512vl()) {
 8534       src_vlen_enc = Assembler::AVX_512bit;
 8535     }
 8536     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8537     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8538   %}
 8539   ins_pipe( pipe_slow );
 8540 %}
 8541 
 8542 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8543   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8544   match(Set dst (VectorStoreMask src size));
 8545   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8546   ins_encode %{
 8547     int src_vlen_enc = vector_length_encoding(this, $src);
 8548     int dst_vlen_enc = vector_length_encoding(this);
 8549     if (!VM_Version::supports_avx512vl()) {
 8550       src_vlen_enc = Assembler::AVX_512bit;
 8551     }
 8552     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8553     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8554   %}
 8555   ins_pipe( pipe_slow );
 8556 %}
 8557 
 8558 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8559   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8560   match(Set dst (VectorStoreMask mask size));
 8561   effect(TEMP_DEF dst);
 8562   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8563   ins_encode %{
 8564     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8565     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8566                  false, Assembler::AVX_512bit, noreg);
 8567     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8568   %}
 8569   ins_pipe( pipe_slow );
 8570 %}
 8571 
 8572 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8573   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8574   match(Set dst (VectorStoreMask mask size));
 8575   effect(TEMP_DEF dst);
 8576   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8577   ins_encode %{
 8578     int dst_vlen_enc = vector_length_encoding(this);
 8579     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8580     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8581   %}
 8582   ins_pipe( pipe_slow );
 8583 %}
 8584 
 8585 instruct vmaskcast_evex(kReg dst) %{
 8586   match(Set dst (VectorMaskCast dst));
 8587   ins_cost(0);
 8588   format %{ "vector_mask_cast $dst" %}
 8589   ins_encode %{
 8590     // empty
 8591   %}
 8592   ins_pipe(empty);
 8593 %}
 8594 
 8595 instruct vmaskcast(vec dst) %{
 8596   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8597   match(Set dst (VectorMaskCast dst));
 8598   ins_cost(0);
 8599   format %{ "vector_mask_cast $dst" %}
 8600   ins_encode %{
 8601     // empty
 8602   %}
 8603   ins_pipe(empty);
 8604 %}
 8605 
 8606 instruct vmaskcast_avx(vec dst, vec src) %{
 8607   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8608   match(Set dst (VectorMaskCast src));
 8609   format %{ "vector_mask_cast $dst, $src" %}
 8610   ins_encode %{
 8611     int vlen = Matcher::vector_length(this);
 8612     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8613     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8614     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8615   %}
 8616   ins_pipe(pipe_slow);
 8617 %}
 8618 
 8619 //-------------------------------- Load Iota Indices ----------------------------------
 8620 
 8621 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8622   match(Set dst (VectorLoadConst src));
 8623   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8624   ins_encode %{
 8625      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8626      BasicType bt = Matcher::vector_element_basic_type(this);
 8627      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8628   %}
 8629   ins_pipe( pipe_slow );
 8630 %}
 8631 
 8632 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8633   match(Set dst (PopulateIndex src1 src2));
 8634   effect(TEMP dst, TEMP vtmp);
 8635   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8636   ins_encode %{
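     // PopulateIndex computes dst[i] = src1 + i * src2; only the stride-1 form is
     // generated (see the assert below), so it reduces to broadcast(src1) + iota.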
 8637      assert($src2$$constant == 1, "required");
 8638      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8639      int vlen_enc = vector_length_encoding(this);
 8640      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8641      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8642      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8643      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8644   %}
 8645   ins_pipe( pipe_slow );
 8646 %}
 8647 
 8648 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8649   match(Set dst (PopulateIndex src1 src2));
 8650   effect(TEMP dst, TEMP vtmp);
 8651   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8652   ins_encode %{
 8653      assert($src2$$constant == 1, "required");
 8654      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8655      int vlen_enc = vector_length_encoding(this);
 8656      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8657      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8658      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8659      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8660   %}
 8661   ins_pipe( pipe_slow );
 8662 %}
 8663 
 8664 //-------------------------------- Rearrange ----------------------------------
 8665 
 8666 // LoadShuffle/Rearrange for Byte
 8667 instruct rearrangeB(vec dst, vec shuffle) %{
 8668   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8669             Matcher::vector_length(n) < 32);
 8670   match(Set dst (VectorRearrange dst shuffle));
 8671   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8672   ins_encode %{
 8673     assert(UseSSE >= 4, "required");
 8674     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8675   %}
 8676   ins_pipe( pipe_slow );
 8677 %}
 8678 
 8679 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8680   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8681             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8682   match(Set dst (VectorRearrange src shuffle));
 8683   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8684   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8685   ins_encode %{
 8686     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8688     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 8690     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 8692     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of shuffle entries that select from the other lane
 8694     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8695     // Perform the blend
 8696     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8697   %}
 8698   ins_pipe( pipe_slow );
 8699 %}
 8700 
 8701 
 8702 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8703   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8704             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8705   match(Set dst (VectorRearrange src shuffle));
 8706   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8708   ins_encode %{
 8709     int vlen_enc = vector_length_encoding(this);
 8710     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8711                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8712                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8713   %}
 8714   ins_pipe( pipe_slow );
 8715 %}
 8716 
 8717 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8718   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8719             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8720   match(Set dst (VectorRearrange src shuffle));
 8721   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8722   ins_encode %{
 8723     int vlen_enc = vector_length_encoding(this);
 8724     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8725   %}
 8726   ins_pipe( pipe_slow );
 8727 %}
 8728 
 8729 // LoadShuffle/Rearrange for Short
 8730 
 8731 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8732   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8733             !VM_Version::supports_avx512bw());
 8734   match(Set dst (VectorLoadShuffle src));
 8735   effect(TEMP dst, TEMP vtmp);
 8736   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8737   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8740     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8741     if (UseAVX == 0) {
 8742       assert(vlen_in_bytes <= 16, "required");
 8743       // Multiply each shuffle by two to get byte index
 8744       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8745       __ psllw($vtmp$$XMMRegister, 1);
 8746 
 8747       // Duplicate to create 2 copies of byte index
 8748       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8749       __ psllw($dst$$XMMRegister, 8);
 8750       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8751 
 8752       // Add one to get alternate byte index
 8753       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8754       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8755     } else {
 8756       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8757       int vlen_enc = vector_length_encoding(this);
 8758       // Multiply each shuffle by two to get byte index
 8759       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8760 
 8761       // Duplicate to create 2 copies of byte index
 8762       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8763       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8764 
 8765       // Add one to get alternate byte index
 8766       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8767     }
 8768   %}
 8769   ins_pipe( pipe_slow );
 8770 %}
 8771 
 8772 instruct rearrangeS(vec dst, vec shuffle) %{
 8773   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8774             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8775   match(Set dst (VectorRearrange dst shuffle));
 8776   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8777   ins_encode %{
 8778     assert(UseSSE >= 4, "required");
 8779     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8780   %}
 8781   ins_pipe( pipe_slow );
 8782 %}
 8783 
 8784 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8785   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8786             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8787   match(Set dst (VectorRearrange src shuffle));
 8788   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8789   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8790   ins_encode %{
 8791     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8793     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 8795     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 8797     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of shuffle entries that select from the other lane
 8799     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8800     // Perform the blend
 8801     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8802   %}
 8803   ins_pipe( pipe_slow );
 8804 %}
 8805 
 8806 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8807   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8808             VM_Version::supports_avx512bw());
 8809   match(Set dst (VectorRearrange src shuffle));
 8810   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8811   ins_encode %{
 8812     int vlen_enc = vector_length_encoding(this);
 8813     if (!VM_Version::supports_avx512vl()) {
 8814       vlen_enc = Assembler::AVX_512bit;
 8815     }
 8816     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8817   %}
 8818   ins_pipe( pipe_slow );
 8819 %}
 8820 
 8821 // LoadShuffle/Rearrange for Integer and Float
 8822 
 8823 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8824   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8825             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8826   match(Set dst (VectorLoadShuffle src));
 8827   effect(TEMP dst, TEMP vtmp);
 8828   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8829   ins_encode %{
 8830     assert(UseSSE >= 4, "required");
 8831 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8834 
 8835     // Duplicate and multiply each shuffle by 4
 8836     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8837     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8838     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8839     __ psllw($vtmp$$XMMRegister, 2);
 8840 
 8841     // Duplicate again to create 4 copies of byte index
 8842     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8843     __ psllw($dst$$XMMRegister, 8);
 8844     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8845 
 8846     // Add 3,2,1,0 to get alternate byte index
 8847     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8848     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8849   %}
 8850   ins_pipe( pipe_slow );
 8851 %}
 8852 
 8853 instruct rearrangeI(vec dst, vec shuffle) %{
 8854   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8855             UseAVX == 0);
 8856   match(Set dst (VectorRearrange dst shuffle));
 8857   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8858   ins_encode %{
 8859     assert(UseSSE >= 4, "required");
 8860     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8861   %}
 8862   ins_pipe( pipe_slow );
 8863 %}
 8864 
 8865 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8866   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8867             UseAVX > 0);
 8868   match(Set dst (VectorRearrange src shuffle));
 8869   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8870   ins_encode %{
 8871     int vlen_enc = vector_length_encoding(this);
 8872     BasicType bt = Matcher::vector_element_basic_type(this);
 8873     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8874   %}
 8875   ins_pipe( pipe_slow );
 8876 %}
 8877 
 8878 // LoadShuffle/Rearrange for Long and Double
 8879 
 8880 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8881   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8882             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8883   match(Set dst (VectorLoadShuffle src));
 8884   effect(TEMP dst, TEMP vtmp);
 8885   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8886   ins_encode %{
 8887     assert(UseAVX >= 2, "required");
 8888 
 8889     int vlen_enc = vector_length_encoding(this);
    // Create a double-word shuffle mask from the long shuffle mask;
    // only a double-word shuffle instruction is available on these platforms.
 8892 
 8893     // Multiply each shuffle by two to get double word index
 8894     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8895 
 8896     // Duplicate each double word shuffle
 8897     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8898     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8899 
 8900     // Add one to get alternate double word index
 8901     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8902   %}
 8903   ins_pipe( pipe_slow );
 8904 %}
 8905 
 8906 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8907   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8908             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8909   match(Set dst (VectorRearrange src shuffle));
 8910   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8911   ins_encode %{
 8912     assert(UseAVX >= 2, "required");
 8913 
 8914     int vlen_enc = vector_length_encoding(this);
 8915     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8916   %}
 8917   ins_pipe( pipe_slow );
 8918 %}
 8919 
 8920 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8921   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8922             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8923   match(Set dst (VectorRearrange src shuffle));
 8924   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8925   ins_encode %{
 8926     assert(UseAVX > 2, "required");
 8927 
 8928     int vlen_enc = vector_length_encoding(this);
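    // vpermq with a vector index operand has no 128-bit form, so a 128-bit operation
    // is widened to 256 bits; only the low 128 bits of the result are meaningful.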
 8929     if (vlen_enc == Assembler::AVX_128bit) {
 8930       vlen_enc = Assembler::AVX_256bit;
 8931     }
 8932     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8933   %}
 8934   ins_pipe( pipe_slow );
 8935 %}
 8936 
 8937 // --------------------------------- FMA --------------------------------------
 8938 // a * b + c
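// These patterns are matched only when UseFMA is set; the fused form performs the
// multiply and the add with a single rounding step.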
 8939 
 8940 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8941   match(Set c (FmaVF  c (Binary a b)));
 8942   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8943   ins_cost(150);
 8944   ins_encode %{
 8945     assert(UseFMA, "not enabled");
 8946     int vlen_enc = vector_length_encoding(this);
 8947     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8948   %}
 8949   ins_pipe( pipe_slow );
 8950 %}
 8951 
 8952 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8953   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8954   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8955   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8956   ins_cost(150);
 8957   ins_encode %{
 8958     assert(UseFMA, "not enabled");
 8959     int vlen_enc = vector_length_encoding(this);
 8960     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8961   %}
 8962   ins_pipe( pipe_slow );
 8963 %}
 8964 
 8965 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8966   match(Set c (FmaVD  c (Binary a b)));
 8967   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8968   ins_cost(150);
 8969   ins_encode %{
 8970     assert(UseFMA, "not enabled");
 8971     int vlen_enc = vector_length_encoding(this);
 8972     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8973   %}
 8974   ins_pipe( pipe_slow );
 8975 %}
 8976 
 8977 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8978   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8979   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8980   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8981   ins_cost(150);
 8982   ins_encode %{
 8983     assert(UseFMA, "not enabled");
 8984     int vlen_enc = vector_length_encoding(this);
 8985     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8986   %}
 8987   ins_pipe( pipe_slow );
 8988 %}
 8989 
 8990 // --------------------------------- Vector Multiply Add --------------------------------------
 8991 
 8992 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8993   predicate(UseAVX == 0);
 8994   match(Set dst (MulAddVS2VI dst src1));
 8995   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8996   ins_encode %{
 8997     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8998   %}
 8999   ins_pipe( pipe_slow );
 9000 %}
 9001 
 9002 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9003   predicate(UseAVX > 0);
 9004   match(Set dst (MulAddVS2VI src1 src2));
 9005   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9006   ins_encode %{
 9007     int vlen_enc = vector_length_encoding(this);
 9008     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9009   %}
 9010   ins_pipe( pipe_slow );
 9011 %}
 9012 
 9013 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9014 
 9015 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9016   predicate(VM_Version::supports_avx512_vnni());
 9017   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9018   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9019   ins_encode %{
 9020     assert(UseAVX > 2, "required");
 9021     int vlen_enc = vector_length_encoding(this);
 9022     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9023   %}
 9024   ins_pipe( pipe_slow );
 9025   ins_cost(10);
 9026 %}
 9027 
 9028 // --------------------------------- PopCount --------------------------------------
 9029 
 9030 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9031   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9032   match(Set dst (PopCountVI src));
 9033   match(Set dst (PopCountVL src));
 9034   format %{ "vector_popcount_integral $dst, $src" %}
 9035   ins_encode %{
 9036     int opcode = this->ideal_Opcode();
 9037     int vlen_enc = vector_length_encoding(this, $src);
 9038     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9039     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9040   %}
 9041   ins_pipe( pipe_slow );
 9042 %}
 9043 
 9044 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9045   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9046   match(Set dst (PopCountVI src mask));
 9047   match(Set dst (PopCountVL src mask));
 9048   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9049   ins_encode %{
 9050     int vlen_enc = vector_length_encoding(this, $src);
 9051     BasicType bt = Matcher::vector_element_basic_type(this, $src);
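    // Preload dst with src so that, under merge masking, lanes cleared in $mask keep
    // their original source values.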
 9052     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9053     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9054   %}
 9055   ins_pipe( pipe_slow );
 9056 %}
 9057 
 9058 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9059   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9060   match(Set dst (PopCountVI src));
 9061   match(Set dst (PopCountVL src));
 9062   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9063   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9064   ins_encode %{
 9065     int opcode = this->ideal_Opcode();
 9066     int vlen_enc = vector_length_encoding(this, $src);
 9067     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9068     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9069                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9070   %}
 9071   ins_pipe( pipe_slow );
 9072 %}
 9073 
 9074 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9075 
 9076 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9077   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9078                                               Matcher::vector_length_in_bytes(n->in(1))));
 9079   match(Set dst (CountTrailingZerosV src));
 9080   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9081   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9083   ins_encode %{
 9084     int vlen_enc = vector_length_encoding(this, $src);
 9085     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9086     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9087                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9088   %}
 9089   ins_pipe( pipe_slow );
 9090 %}
 9091 
 9092 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9093   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9094             VM_Version::supports_avx512cd() &&
 9095             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9096   match(Set dst (CountTrailingZerosV src));
 9097   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9098   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9100   ins_encode %{
 9101     int vlen_enc = vector_length_encoding(this, $src);
 9102     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9103     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9104                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9105   %}
 9106   ins_pipe( pipe_slow );
 9107 %}
 9108 
 9109 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9110   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9111   match(Set dst (CountTrailingZerosV src));
 9112   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9113   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9115   ins_encode %{
 9116     int vlen_enc = vector_length_encoding(this, $src);
 9117     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9118     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9119                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9120                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9126   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9127   match(Set dst (CountTrailingZerosV src));
 9128   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9129   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9130   ins_encode %{
 9131     int vlen_enc = vector_length_encoding(this, $src);
 9132     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9133     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9134                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9135   %}
 9136   ins_pipe( pipe_slow );
 9137 %}
 9138 
 9139 
 9140 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9141 
 9142 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9143   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9144   effect(TEMP dst);
 9145   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9146   ins_encode %{
 9147     int vector_len = vector_length_encoding(this);
 9148     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9149   %}
 9150   ins_pipe( pipe_slow );
 9151 %}
 9152 
 9153 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9154   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9155   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9156   effect(TEMP dst);
 9157   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9158   ins_encode %{
 9159     int vector_len = vector_length_encoding(this);
 9160     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9161   %}
 9162   ins_pipe( pipe_slow );
 9163 %}
 9164 
 9165 // --------------------------------- Rotation Operations ----------------------------------
 9166 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9167   match(Set dst (RotateLeftV src shift));
 9168   match(Set dst (RotateRightV src shift));
 9169   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9170   ins_encode %{
 9171     int opcode      = this->ideal_Opcode();
 9172     int vector_len  = vector_length_encoding(this);
 9173     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9174     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9175   %}
 9176   ins_pipe( pipe_slow );
 9177 %}
 9178 
 9179 instruct vprorate(vec dst, vec src, vec shift) %{
 9180   match(Set dst (RotateLeftV src shift));
 9181   match(Set dst (RotateRightV src shift));
 9182   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9183   ins_encode %{
 9184     int opcode      = this->ideal_Opcode();
 9185     int vector_len  = vector_length_encoding(this);
 9186     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9187     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 // ---------------------------------- Masked Operations ------------------------------------
 9193 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9194   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9195   match(Set dst (LoadVectorMasked mem mask));
 9196   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9197   ins_encode %{
 9198     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9199     int vlen_enc = vector_length_encoding(this);
 9200     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9201   %}
 9202   ins_pipe( pipe_slow );
 9203 %}
 9204 
 9205 
 9206 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9207   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9208   match(Set dst (LoadVectorMasked mem mask));
 9209   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9210   ins_encode %{
 9211     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9212     int vector_len = vector_length_encoding(this);
 9213     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9214   %}
 9215   ins_pipe( pipe_slow );
 9216 %}
 9217 
 9218 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9219   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9220   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9221   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9222   ins_encode %{
 9223     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9224     int vlen_enc = vector_length_encoding(src_node);
 9225     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9226     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9227   %}
 9228   ins_pipe( pipe_slow );
 9229 %}
 9230 
 9231 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9232   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9233   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9234   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9235   ins_encode %{
 9236     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9237     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9238     int vlen_enc = vector_length_encoding(src_node);
 9239     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9240   %}
 9241   ins_pipe( pipe_slow );
 9242 %}
 9243 
 9244 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9245   match(Set addr (VerifyVectorAlignment addr mask));
 9246   effect(KILL cr);
 9247   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9248   ins_encode %{
 9249     Label Lskip;
 9250     // check if masked bits of addr are zero
 9251     __ testq($addr$$Register, $mask$$constant);
 9252     __ jccb(Assembler::equal, Lskip);
 9253     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9254     __ bind(Lskip);
 9255   %}
 9256   ins_pipe(pipe_slow);
 9257 %}
 9258 
 9259 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9260   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9261   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9262   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9263   ins_encode %{
 9264     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9265     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9266 
 9267     Label DONE;
 9268     int vlen_enc = vector_length_encoding(this, $src1);
 9269     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
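    // dst is first set to -1, meaning all masked lanes compare equal. ktmp2 = ~mask and
    // ktmp1 = (src1 == src2) under mask; kortest sets the carry flag when their OR is
    // all ones, i.e. there is no mismatch inside the mask. Otherwise the index of the
    // first differing lane is recovered with not + tzcnt.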
 9270 
 9271     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9272     __ mov64($dst$$Register, -1L);
 9273     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9274     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9275     __ jccb(Assembler::carrySet, DONE);
 9276     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9277     __ notq($dst$$Register);
 9278     __ tzcntq($dst$$Register, $dst$$Register);
 9279     __ bind(DONE);
 9280   %}
 9281   ins_pipe( pipe_slow );
 9282 %}
 9283 
 9284 
 9285 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9286   match(Set dst (VectorMaskGen len));
 9287   effect(TEMP temp, KILL cr);
 9288   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9289   ins_encode %{
 9290     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9291   %}
 9292   ins_pipe( pipe_slow );
 9293 %}
 9294 
 9295 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9296   match(Set dst (VectorMaskGen len));
  format %{ "vector_mask_gen $dst, $len \t! vector mask generator" %}
 9298   effect(TEMP temp);
 9299   ins_encode %{
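    // Materialize a mask register with the low $len bits set ($len is a compile-time constant).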
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9301     __ kmovql($dst$$KRegister, $temp$$Register);
 9302   %}
 9303   ins_pipe( pipe_slow );
 9304 %}
 9305 
 9306 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9307   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9308   match(Set dst (VectorMaskToLong mask));
 9309   effect(TEMP dst, KILL cr);
 9310   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9311   ins_encode %{
 9312     int opcode = this->ideal_Opcode();
 9313     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9314     int mask_len = Matcher::vector_length(this, $mask);
 9315     int mask_size = mask_len * type2aelembytes(mbt);
 9316     int vlen_enc = vector_length_encoding(this, $mask);
 9317     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9318                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9319   %}
 9320   ins_pipe( pipe_slow );
 9321 %}
 9322 
 9323 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9324   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9325   match(Set dst (VectorMaskToLong mask));
 9326   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9327   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9328   ins_encode %{
 9329     int opcode = this->ideal_Opcode();
 9330     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9331     int mask_len = Matcher::vector_length(this, $mask);
 9332     int vlen_enc = vector_length_encoding(this, $mask);
 9333     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9334                              $dst$$Register, mask_len, mbt, vlen_enc);
 9335   %}
 9336   ins_pipe( pipe_slow );
 9337 %}
 9338 
 9339 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9340   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9341   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9342   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9343   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9344   ins_encode %{
 9345     int opcode = this->ideal_Opcode();
 9346     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9347     int mask_len = Matcher::vector_length(this, $mask);
 9348     int vlen_enc = vector_length_encoding(this, $mask);
 9349     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9350                              $dst$$Register, mask_len, mbt, vlen_enc);
 9351   %}
 9352   ins_pipe( pipe_slow );
 9353 %}
 9354 
 9355 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9356   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9357   match(Set dst (VectorMaskTrueCount mask));
 9358   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9359   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9360   ins_encode %{
 9361     int opcode = this->ideal_Opcode();
 9362     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9363     int mask_len = Matcher::vector_length(this, $mask);
 9364     int mask_size = mask_len * type2aelembytes(mbt);
 9365     int vlen_enc = vector_length_encoding(this, $mask);
 9366     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9367                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9368   %}
 9369   ins_pipe( pipe_slow );
 9370 %}
 9371 
 9372 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9373   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9374   match(Set dst (VectorMaskTrueCount mask));
 9375   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9376   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9377   ins_encode %{
 9378     int opcode = this->ideal_Opcode();
 9379     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9380     int mask_len = Matcher::vector_length(this, $mask);
 9381     int vlen_enc = vector_length_encoding(this, $mask);
 9382     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9383                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9384   %}
 9385   ins_pipe( pipe_slow );
 9386 %}
 9387 
 9388 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9389   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9390   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9391   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9392   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9393   ins_encode %{
 9394     int opcode = this->ideal_Opcode();
 9395     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9396     int mask_len = Matcher::vector_length(this, $mask);
 9397     int vlen_enc = vector_length_encoding(this, $mask);
 9398     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9399                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9400   %}
 9401   ins_pipe( pipe_slow );
 9402 %}
 9403 
 9404 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9405   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9406   match(Set dst (VectorMaskFirstTrue mask));
 9407   match(Set dst (VectorMaskLastTrue mask));
 9408   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9409   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9410   ins_encode %{
 9411     int opcode = this->ideal_Opcode();
 9412     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9413     int mask_len = Matcher::vector_length(this, $mask);
 9414     int mask_size = mask_len * type2aelembytes(mbt);
 9415     int vlen_enc = vector_length_encoding(this, $mask);
 9416     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9417                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9418   %}
 9419   ins_pipe( pipe_slow );
 9420 %}
 9421 
 9422 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9423   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9424   match(Set dst (VectorMaskFirstTrue mask));
 9425   match(Set dst (VectorMaskLastTrue mask));
 9426   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9427   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9428   ins_encode %{
 9429     int opcode = this->ideal_Opcode();
 9430     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9431     int mask_len = Matcher::vector_length(this, $mask);
 9432     int vlen_enc = vector_length_encoding(this, $mask);
 9433     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9434                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9435   %}
 9436   ins_pipe( pipe_slow );
 9437 %}
 9438 
 9439 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9440   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9441   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9442   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9443   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9444   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9445   ins_encode %{
 9446     int opcode = this->ideal_Opcode();
 9447     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9448     int mask_len = Matcher::vector_length(this, $mask);
 9449     int vlen_enc = vector_length_encoding(this, $mask);
 9450     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9451                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9452   %}
 9453   ins_pipe( pipe_slow );
 9454 %}
 9455 
 9456 // --------------------------------- Compress/Expand Operations ---------------------------
 9457 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9458   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9459   match(Set dst (CompressV src mask));
 9460   match(Set dst (ExpandV src mask));
 9461   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9463   ins_encode %{
 9464     int opcode = this->ideal_Opcode();
 9465     int vlen_enc = vector_length_encoding(this);
 9466     BasicType bt  = Matcher::vector_element_basic_type(this);
 9467     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9468                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9469   %}
 9470   ins_pipe( pipe_slow );
 9471 %}
 9472 
 9473 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9474   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9475   match(Set dst (CompressV src mask));
 9476   match(Set dst (ExpandV src mask));
 9477   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9478   ins_encode %{
 9479     int opcode = this->ideal_Opcode();
 9480     int vector_len = vector_length_encoding(this);
 9481     BasicType bt  = Matcher::vector_element_basic_type(this);
 9482     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9483   %}
 9484   ins_pipe( pipe_slow );
 9485 %}
 9486 
 9487 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9488   match(Set dst (CompressM mask));
 9489   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9490   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9491   ins_encode %{
    assert(this->in(1)->bottom_type()->isa_vectmask(), "required");
 9493     int mask_len = Matcher::vector_length(this);
 9494     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9495   %}
 9496   ins_pipe( pipe_slow );
 9497 %}
 9498 
 9499 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9500 
 9501 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9502   predicate(!VM_Version::supports_gfni());
 9503   match(Set dst (ReverseV src));
 9504   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9506   ins_encode %{
 9507     int vec_enc = vector_length_encoding(this);
 9508     BasicType bt = Matcher::vector_element_basic_type(this);
 9509     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9510                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9511   %}
 9512   ins_pipe( pipe_slow );
 9513 %}
 9514 
 9515 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9516   predicate(VM_Version::supports_gfni());
 9517   match(Set dst (ReverseV src));
 9518   effect(TEMP dst, TEMP xtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9520   ins_encode %{
 9521     int vec_enc = vector_length_encoding(this);
 9522     BasicType bt  = Matcher::vector_element_basic_type(this);
 9523     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9524     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9525                                $xtmp$$XMMRegister);
 9526   %}
 9527   ins_pipe( pipe_slow );
 9528 %}
 9529 
 9530 instruct vreverse_byte_reg(vec dst, vec src) %{
 9531   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9532   match(Set dst (ReverseBytesV src));
 9533   effect(TEMP dst);
 9534   format %{ "vector_reverse_byte $dst, $src" %}
 9535   ins_encode %{
 9536     int vec_enc = vector_length_encoding(this);
 9537     BasicType bt = Matcher::vector_element_basic_type(this);
 9538     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9539   %}
 9540   ins_pipe( pipe_slow );
 9541 %}
 9542 
 9543 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9544   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9545   match(Set dst (ReverseBytesV src));
 9546   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9548   ins_encode %{
 9549     int vec_enc = vector_length_encoding(this);
 9550     BasicType bt = Matcher::vector_element_basic_type(this);
 9551     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9552                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9553   %}
 9554   ins_pipe( pipe_slow );
 9555 %}
 9556 
 9557 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9558 
 9559 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9560   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9561                                               Matcher::vector_length_in_bytes(n->in(1))));
 9562   match(Set dst (CountLeadingZerosV src));
 9563   format %{ "vector_count_leading_zeros $dst, $src" %}
 9564   ins_encode %{
 9565      int vlen_enc = vector_length_encoding(this, $src);
 9566      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9567      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9568                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9569   %}
 9570   ins_pipe( pipe_slow );
 9571 %}
 9572 
 9573 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9574   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9575                                               Matcher::vector_length_in_bytes(n->in(1))));
 9576   match(Set dst (CountLeadingZerosV src mask));
 9577   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9578   ins_encode %{
 9579     int vlen_enc = vector_length_encoding(this, $src);
 9580     BasicType bt = Matcher::vector_element_basic_type(this, $src);
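    // Preload dst with src so that, under merge masking, lanes cleared in $mask keep
    // their original source values.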
 9581     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9582     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9583                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9584   %}
 9585   ins_pipe( pipe_slow );
 9586 %}
 9587 
 9588 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9589   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9590             VM_Version::supports_avx512cd() &&
 9591             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9592   match(Set dst (CountLeadingZerosV src));
 9593   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9595   ins_encode %{
 9596     int vlen_enc = vector_length_encoding(this, $src);
 9597     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9598     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9599                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9600   %}
 9601   ins_pipe( pipe_slow );
 9602 %}
 9603 
 9604 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9605   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9606   match(Set dst (CountLeadingZerosV src));
 9607   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9609   ins_encode %{
 9610     int vlen_enc = vector_length_encoding(this, $src);
 9611     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9612     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9613                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9614                                        $rtmp$$Register, true, vlen_enc);
 9615   %}
 9616   ins_pipe( pipe_slow );
 9617 %}
 9618 
 9619 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9620   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9621             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9622   match(Set dst (CountLeadingZerosV src));
 9623   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9624   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9625   ins_encode %{
 9626     int vlen_enc = vector_length_encoding(this, $src);
 9627     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9628     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9629                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9630   %}
 9631   ins_pipe( pipe_slow );
 9632 %}
 9633 
 9634 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9635   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9636             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9637   match(Set dst (CountLeadingZerosV src));
 9638   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9639   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9640   ins_encode %{
 9641     int vlen_enc = vector_length_encoding(this, $src);
 9642     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9643     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9644                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 // ---------------------------------- Vector Masked Operations ------------------------------------
 9650 
 9651 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9652   match(Set dst (AddVB (Binary dst src2) mask));
 9653   match(Set dst (AddVS (Binary dst src2) mask));
 9654   match(Set dst (AddVI (Binary dst src2) mask));
 9655   match(Set dst (AddVL (Binary dst src2) mask));
 9656   match(Set dst (AddVF (Binary dst src2) mask));
 9657   match(Set dst (AddVD (Binary dst src2) mask));
 9658   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9659   ins_encode %{
 9660     int vlen_enc = vector_length_encoding(this);
 9661     BasicType bt = Matcher::vector_element_basic_type(this);
 9662     int opc = this->ideal_Opcode();
 9663     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9664                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9665   %}
 9666   ins_pipe( pipe_slow );
 9667 %}
 9668 
 9669 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9670   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9671   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9672   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9673   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9674   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9675   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9676   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9677   ins_encode %{
 9678     int vlen_enc = vector_length_encoding(this);
 9679     BasicType bt = Matcher::vector_element_basic_type(this);
 9680     int opc = this->ideal_Opcode();
 9681     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9682                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9683   %}
 9684   ins_pipe( pipe_slow );
 9685 %}
 9686 
 9687 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9688   match(Set dst (XorV (Binary dst src2) mask));
 9689   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9690   ins_encode %{
 9691     int vlen_enc = vector_length_encoding(this);
 9692     BasicType bt = Matcher::vector_element_basic_type(this);
 9693     int opc = this->ideal_Opcode();
 9694     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9695                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9696   %}
 9697   ins_pipe( pipe_slow );
 9698 %}
 9699 
 9700 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9701   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9702   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9703   ins_encode %{
 9704     int vlen_enc = vector_length_encoding(this);
 9705     BasicType bt = Matcher::vector_element_basic_type(this);
 9706     int opc = this->ideal_Opcode();
 9707     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9708                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9709   %}
 9710   ins_pipe( pipe_slow );
 9711 %}
 9712 
 9713 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9714   match(Set dst (OrV (Binary dst src2) mask));
 9715   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9716   ins_encode %{
 9717     int vlen_enc = vector_length_encoding(this);
 9718     BasicType bt = Matcher::vector_element_basic_type(this);
 9719     int opc = this->ideal_Opcode();
 9720     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9721                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9722   %}
 9723   ins_pipe( pipe_slow );
 9724 %}
 9725 
 9726 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9727   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9728   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9729   ins_encode %{
 9730     int vlen_enc = vector_length_encoding(this);
 9731     BasicType bt = Matcher::vector_element_basic_type(this);
 9732     int opc = this->ideal_Opcode();
 9733     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9734                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9735   %}
 9736   ins_pipe( pipe_slow );
 9737 %}
 9738 
 9739 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9740   match(Set dst (AndV (Binary dst src2) mask));
 9741   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9742   ins_encode %{
 9743     int vlen_enc = vector_length_encoding(this);
 9744     BasicType bt = Matcher::vector_element_basic_type(this);
 9745     int opc = this->ideal_Opcode();
 9746     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9747                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9748   %}
 9749   ins_pipe( pipe_slow );
 9750 %}
 9751 
 9752 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9753   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9754   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9755   ins_encode %{
 9756     int vlen_enc = vector_length_encoding(this);
 9757     BasicType bt = Matcher::vector_element_basic_type(this);
 9758     int opc = this->ideal_Opcode();
 9759     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9760                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9761   %}
 9762   ins_pipe( pipe_slow );
 9763 %}
 9764 
 9765 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9766   match(Set dst (SubVB (Binary dst src2) mask));
 9767   match(Set dst (SubVS (Binary dst src2) mask));
 9768   match(Set dst (SubVI (Binary dst src2) mask));
 9769   match(Set dst (SubVL (Binary dst src2) mask));
 9770   match(Set dst (SubVF (Binary dst src2) mask));
 9771   match(Set dst (SubVD (Binary dst src2) mask));
 9772   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9773   ins_encode %{
 9774     int vlen_enc = vector_length_encoding(this);
 9775     BasicType bt = Matcher::vector_element_basic_type(this);
 9776     int opc = this->ideal_Opcode();
 9777     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9778                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9779   %}
 9780   ins_pipe( pipe_slow );
 9781 %}
 9782 
 9783 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9784   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9785   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9786   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9787   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9788   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9789   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9790   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9791   ins_encode %{
 9792     int vlen_enc = vector_length_encoding(this);
 9793     BasicType bt = Matcher::vector_element_basic_type(this);
 9794     int opc = this->ideal_Opcode();
 9795     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9796                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9797   %}
 9798   ins_pipe( pipe_slow );
 9799 %}
 9800 
 9801 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9802   match(Set dst (MulVS (Binary dst src2) mask));
 9803   match(Set dst (MulVI (Binary dst src2) mask));
 9804   match(Set dst (MulVL (Binary dst src2) mask));
 9805   match(Set dst (MulVF (Binary dst src2) mask));
 9806   match(Set dst (MulVD (Binary dst src2) mask));
 9807   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9808   ins_encode %{
 9809     int vlen_enc = vector_length_encoding(this);
 9810     BasicType bt = Matcher::vector_element_basic_type(this);
 9811     int opc = this->ideal_Opcode();
 9812     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9813                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9814   %}
 9815   ins_pipe( pipe_slow );
 9816 %}
 9817 
 9818 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9819   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9820   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9821   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9822   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9823   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9824   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9825   ins_encode %{
 9826     int vlen_enc = vector_length_encoding(this);
 9827     BasicType bt = Matcher::vector_element_basic_type(this);
 9828     int opc = this->ideal_Opcode();
 9829     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9830                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9831   %}
 9832   ins_pipe( pipe_slow );
 9833 %}
 9834 
 9835 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9836   match(Set dst (SqrtVF dst mask));
 9837   match(Set dst (SqrtVD dst mask));
 9838   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9839   ins_encode %{
 9840     int vlen_enc = vector_length_encoding(this);
 9841     BasicType bt = Matcher::vector_element_basic_type(this);
 9842     int opc = this->ideal_Opcode();
 9843     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9844                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9845   %}
 9846   ins_pipe( pipe_slow );
 9847 %}
 9848 
 9849 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9850   match(Set dst (DivVF (Binary dst src2) mask));
 9851   match(Set dst (DivVD (Binary dst src2) mask));
 9852   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9853   ins_encode %{
 9854     int vlen_enc = vector_length_encoding(this);
 9855     BasicType bt = Matcher::vector_element_basic_type(this);
 9856     int opc = this->ideal_Opcode();
 9857     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9858                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9859   %}
 9860   ins_pipe( pipe_slow );
 9861 %}
 9862 
 9863 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9864   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9865   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9866   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9867   ins_encode %{
 9868     int vlen_enc = vector_length_encoding(this);
 9869     BasicType bt = Matcher::vector_element_basic_type(this);
 9870     int opc = this->ideal_Opcode();
 9871     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9872                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9873   %}
 9874   ins_pipe( pipe_slow );
 9875 %}
 9876 
 9877 
 9878 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9879   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9880   match(Set dst (RotateRightV (Binary dst shift) mask));
 9881   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9882   ins_encode %{
 9883     int vlen_enc = vector_length_encoding(this);
 9884     BasicType bt = Matcher::vector_element_basic_type(this);
 9885     int opc = this->ideal_Opcode();
 9886     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9887                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9888   %}
 9889   ins_pipe( pipe_slow );
 9890 %}
 9891 
 9892 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9893   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9894   match(Set dst (RotateRightV (Binary dst src2) mask));
 9895   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9896   ins_encode %{
 9897     int vlen_enc = vector_length_encoding(this);
 9898     BasicType bt = Matcher::vector_element_basic_type(this);
 9899     int opc = this->ideal_Opcode();
 9900     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9901                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9902   %}
 9903   ins_pipe( pipe_slow );
 9904 %}
 9905 
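// Masked shifts come in three flavors: an immediate count (matched through
// LShiftCntV/RShiftCntV), a uniform count held in a vector register
// (!is_var_shift()), and a per-element variable count (is_var_shift()).  The
// trailing boolean passed to evmasked_op in the register forms distinguishes
// the uniform (false) from the variable (true) case.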
 9906 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9907   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9908   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9909   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9910   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9911   ins_encode %{
 9912     int vlen_enc = vector_length_encoding(this);
 9913     BasicType bt = Matcher::vector_element_basic_type(this);
 9914     int opc = this->ideal_Opcode();
 9915     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9916                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9917   %}
 9918   ins_pipe( pipe_slow );
 9919 %}
 9920 
 9921 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9922   predicate(!n->as_ShiftV()->is_var_shift());
 9923   match(Set dst (LShiftVS (Binary dst src2) mask));
 9924   match(Set dst (LShiftVI (Binary dst src2) mask));
 9925   match(Set dst (LShiftVL (Binary dst src2) mask));
 9926   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9927   ins_encode %{
 9928     int vlen_enc = vector_length_encoding(this);
 9929     BasicType bt = Matcher::vector_element_basic_type(this);
 9930     int opc = this->ideal_Opcode();
 9931     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9932                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9933   %}
 9934   ins_pipe( pipe_slow );
 9935 %}
 9936 
 9937 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9938   predicate(n->as_ShiftV()->is_var_shift());
 9939   match(Set dst (LShiftVS (Binary dst src2) mask));
 9940   match(Set dst (LShiftVI (Binary dst src2) mask));
 9941   match(Set dst (LShiftVL (Binary dst src2) mask));
 9942   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9943   ins_encode %{
 9944     int vlen_enc = vector_length_encoding(this);
 9945     BasicType bt = Matcher::vector_element_basic_type(this);
 9946     int opc = this->ideal_Opcode();
 9947     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9948                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9949   %}
 9950   ins_pipe( pipe_slow );
 9951 %}
 9952 
 9953 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9954   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9955   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9956   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9957   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9958   ins_encode %{
 9959     int vlen_enc = vector_length_encoding(this);
 9960     BasicType bt = Matcher::vector_element_basic_type(this);
 9961     int opc = this->ideal_Opcode();
 9962     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9963                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9964   %}
 9965   ins_pipe( pipe_slow );
 9966 %}
 9967 
 9968 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9969   predicate(!n->as_ShiftV()->is_var_shift());
 9970   match(Set dst (RShiftVS (Binary dst src2) mask));
 9971   match(Set dst (RShiftVI (Binary dst src2) mask));
 9972   match(Set dst (RShiftVL (Binary dst src2) mask));
 9973   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9974   ins_encode %{
 9975     int vlen_enc = vector_length_encoding(this);
 9976     BasicType bt = Matcher::vector_element_basic_type(this);
 9977     int opc = this->ideal_Opcode();
 9978     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9979                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9980   %}
 9981   ins_pipe( pipe_slow );
 9982 %}
 9983 
 9984 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9985   predicate(n->as_ShiftV()->is_var_shift());
 9986   match(Set dst (RShiftVS (Binary dst src2) mask));
 9987   match(Set dst (RShiftVI (Binary dst src2) mask));
 9988   match(Set dst (RShiftVL (Binary dst src2) mask));
 9989   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9990   ins_encode %{
 9991     int vlen_enc = vector_length_encoding(this);
 9992     BasicType bt = Matcher::vector_element_basic_type(this);
 9993     int opc = this->ideal_Opcode();
 9994     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9995                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9996   %}
 9997   ins_pipe( pipe_slow );
 9998 %}
 9999 
10000 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10001   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10002   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10003   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10004   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10005   ins_encode %{
10006     int vlen_enc = vector_length_encoding(this);
10007     BasicType bt = Matcher::vector_element_basic_type(this);
10008     int opc = this->ideal_Opcode();
10009     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10010                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10011   %}
10012   ins_pipe( pipe_slow );
10013 %}
10014 
10015 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10016   predicate(!n->as_ShiftV()->is_var_shift());
10017   match(Set dst (URShiftVS (Binary dst src2) mask));
10018   match(Set dst (URShiftVI (Binary dst src2) mask));
10019   match(Set dst (URShiftVL (Binary dst src2) mask));
10020   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10021   ins_encode %{
10022     int vlen_enc = vector_length_encoding(this);
10023     BasicType bt = Matcher::vector_element_basic_type(this);
10024     int opc = this->ideal_Opcode();
10025     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10026                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10027   %}
10028   ins_pipe( pipe_slow );
10029 %}
10030 
10031 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10032   predicate(n->as_ShiftV()->is_var_shift());
10033   match(Set dst (URShiftVS (Binary dst src2) mask));
10034   match(Set dst (URShiftVI (Binary dst src2) mask));
10035   match(Set dst (URShiftVL (Binary dst src2) mask));
10036   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10037   ins_encode %{
10038     int vlen_enc = vector_length_encoding(this);
10039     BasicType bt = Matcher::vector_element_basic_type(this);
10040     int opc = this->ideal_Opcode();
10041     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10042                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10043   %}
10044   ins_pipe( pipe_slow );
10045 %}
10046 
10047 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10048   match(Set dst (MaxV (Binary dst src2) mask));
10049   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10050   ins_encode %{
10051     int vlen_enc = vector_length_encoding(this);
10052     BasicType bt = Matcher::vector_element_basic_type(this);
10053     int opc = this->ideal_Opcode();
10054     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10055                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10056   %}
10057   ins_pipe( pipe_slow );
10058 %}
10059 
10060 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10061   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10062   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10063   ins_encode %{
10064     int vlen_enc = vector_length_encoding(this);
10065     BasicType bt = Matcher::vector_element_basic_type(this);
10066     int opc = this->ideal_Opcode();
10067     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10068                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10069   %}
10070   ins_pipe( pipe_slow );
10071 %}
10072 
10073 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10074   match(Set dst (MinV (Binary dst src2) mask));
10075   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10076   ins_encode %{
10077     int vlen_enc = vector_length_encoding(this);
10078     BasicType bt = Matcher::vector_element_basic_type(this);
10079     int opc = this->ideal_Opcode();
10080     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10081                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10082   %}
10083   ins_pipe( pipe_slow );
10084 %}
10085 
10086 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10087   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10088   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10089   ins_encode %{
10090     int vlen_enc = vector_length_encoding(this);
10091     BasicType bt = Matcher::vector_element_basic_type(this);
10092     int opc = this->ideal_Opcode();
10093     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10094                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10095   %}
10096   ins_pipe( pipe_slow );
10097 %}
10098 
10099 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10100   match(Set dst (VectorRearrange (Binary dst src2) mask));
10101   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10102   ins_encode %{
10103     int vlen_enc = vector_length_encoding(this);
10104     BasicType bt = Matcher::vector_element_basic_type(this);
10105     int opc = this->ideal_Opcode();
10106     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10107                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10108   %}
10109   ins_pipe( pipe_slow );
10110 %}
10111 
10112 instruct vabs_masked(vec dst, kReg mask) %{
10113   match(Set dst (AbsVB dst mask));
10114   match(Set dst (AbsVS dst mask));
10115   match(Set dst (AbsVI dst mask));
10116   match(Set dst (AbsVL dst mask));
10117   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10118   ins_encode %{
10119     int vlen_enc = vector_length_encoding(this);
10120     BasicType bt = Matcher::vector_element_basic_type(this);
10121     int opc = this->ideal_Opcode();
10122     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10123                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
10128 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10129   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10130   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10131   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10132   ins_encode %{
10133     assert(UseFMA, "Needs FMA instruction support.");
10134     int vlen_enc = vector_length_encoding(this);
10135     BasicType bt = Matcher::vector_element_basic_type(this);
10136     int opc = this->ideal_Opcode();
10137     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10138                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10139   %}
10140   ins_pipe( pipe_slow );
10141 %}
10142 
10143 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10144   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10145   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10146   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10147   ins_encode %{
10148     assert(UseFMA, "Needs FMA instruction support.");
10149     int vlen_enc = vector_length_encoding(this);
10150     BasicType bt = Matcher::vector_element_basic_type(this);
10151     int opc = this->ideal_Opcode();
10152     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10153                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10154   %}
10155   ins_pipe( pipe_slow );
10156 %}
10157 
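// Masked vector compare producing an opmask result.  Integer element types
// derive both the signedness and the AVX-512 comparison predicate from the
// BoolTest condition constant; float/double element types use the FP
// comparison predicate instead.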
10158 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10159   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10160   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10161   ins_encode %{
10162     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10163     int vlen_enc = vector_length_encoding(this, $src1);
10164     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10165 
10166     // Dispatch the compare on the element type of the first source vector.
10167     switch (src1_elem_bt) {
10168       case T_BYTE: {
10169         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10170         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10171         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10172         break;
10173       }
10174       case T_SHORT: {
10175         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10176         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10177         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10178         break;
10179       }
10180       case T_INT: {
10181         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10182         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10183         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10184         break;
10185       }
10186       case T_LONG: {
10187         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10188         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10189         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10190         break;
10191       }
10192       case T_FLOAT: {
10193         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10194         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10195         break;
10196       }
10197       case T_DOUBLE: {
10198         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10199         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10200         break;
10201       }
10202       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10203     }
10204   %}
10205   ins_pipe( pipe_slow );
10206 %}
10207 
10208 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10209   predicate(Matcher::vector_length(n) <= 32);
10210   match(Set dst (MaskAll src));
10211   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10212   ins_encode %{
10213     int mask_len = Matcher::vector_length(this);
10214     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10215   %}
10216   ins_pipe( pipe_slow );
10217 %}
10218 
10219 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10220   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10221   match(Set dst (XorVMask src (MaskAll cnt)));
10222   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10223   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10224   ins_encode %{
10225     uint masklen = Matcher::vector_length(this);
10226     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10227   %}
10228   ins_pipe( pipe_slow );
10229 %}
10230 
10231 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10232   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10233             (Matcher::vector_length(n) == 16) ||
10234             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10235   match(Set dst (XorVMask src (MaskAll cnt)));
10236   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10237   ins_encode %{
10238     uint masklen = Matcher::vector_length(this);
10239     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10240   %}
10241   ins_pipe( pipe_slow );
10242 %}
10243 
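// VectorLongToMask: materialize a vector mask from the bits of a long value.
// Without a true predicate type (isa_vectmask() == nullptr) the bits are
// expanded into a vector-register mask; with EVEX opmasks a single kmov
// suffices.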
10244 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10245   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10246   match(Set dst (VectorLongToMask src));
10247   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10248   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10249   ins_encode %{
10250     int mask_len = Matcher::vector_length(this);
10251     int vec_enc  = vector_length_encoding(mask_len);
10252     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10253                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10254   %}
10255   ins_pipe( pipe_slow );
10256 %}
10257 
10258 
10259 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10260   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10261   match(Set dst (VectorLongToMask src));
10262   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10263   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10264   ins_encode %{
10265     int mask_len = Matcher::vector_length(this);
10266     assert(mask_len <= 32, "invalid mask length");
10267     int vec_enc  = vector_length_encoding(mask_len);
10268     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10269                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10270   %}
10271   ins_pipe( pipe_slow );
10272 %}
10273 
10274 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10275   predicate(n->bottom_type()->isa_vectmask());
10276   match(Set dst (VectorLongToMask src));
10277   format %{ "long_to_mask_evex $dst, $src\t!" %}
10278   ins_encode %{
10279     __ kmov($dst$$KRegister, $src$$Register);
10280   %}
10281   ins_pipe( pipe_slow );
10282 %}
10283 
10284 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10285   match(Set dst (AndVMask src1 src2));
10286   match(Set dst (OrVMask src1 src2));
10287   match(Set dst (XorVMask src1 src2));
10288   effect(TEMP kscratch);
10289   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10290   ins_encode %{
10291     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10292     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10293     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10294     uint masklen = Matcher::vector_length(this);
10295     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10296     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10297   %}
10298   ins_pipe( pipe_slow );
10299 %}
10300 
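// Masked three-input ternary logic (MacroLogicV): the 8-bit func immediate is
// the truth table consumed by vpternlog, applied under the opmask.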
10301 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10302   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10303   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10304   ins_encode %{
10305     int vlen_enc = vector_length_encoding(this);
10306     BasicType bt = Matcher::vector_element_basic_type(this);
10307     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10308                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10309   %}
10310   ins_pipe( pipe_slow );
10311 %}
10312 
10313 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10314   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10315   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10316   ins_encode %{
10317     int vlen_enc = vector_length_encoding(this);
10318     BasicType bt = Matcher::vector_element_basic_type(this);
10319     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10320                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10321   %}
10322   ins_pipe( pipe_slow );
10323 %}
10324 
10325 instruct castMM(kReg dst)
10326 %{
10327   match(Set dst (CastVV dst));
10328 
10329   size(0);
10330   format %{ "# castVV of $dst" %}
10331   ins_encode(/* empty encoding */);
10332   ins_cost(0);
10333   ins_pipe(empty);
10334 %}
10335 
10336 instruct castVV(vec dst)
10337 %{
10338   match(Set dst (CastVV dst));
10339 
10340   size(0);
10341   format %{ "# castVV of $dst" %}
10342   ins_encode(/* empty encoding */);
10343   ins_cost(0);
10344   ins_pipe(empty);
10345 %}
10346 
10347 instruct castVVLeg(legVec dst)
10348 %{
10349   match(Set dst (CastVV dst));
10350 
10351   size(0);
10352   format %{ "# castVV of $dst" %}
10353   ins_encode(/* empty encoding */);
10354   ins_cost(0);
10355   ins_pipe(empty);
10356 %}
10357 
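// IsInfinite via vfpclass: the 0x18 immediate selects the +Inf (0x08) and
// -Inf (0x10) classes; the resulting opmask bit is copied into the integer
// result register.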
10358 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10359 %{
10360   match(Set dst (IsInfiniteF src));
10361   effect(TEMP ktmp, KILL cr);
10362   format %{ "float_class_check $dst, $src" %}
10363   ins_encode %{
10364     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10365     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10366   %}
10367   ins_pipe(pipe_slow);
10368 %}
10369 
10370 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10371 %{
10372   match(Set dst (IsInfiniteD src));
10373   effect(TEMP ktmp, KILL cr);
10374   format %{ "double_class_check $dst, $src" %}
10375   ins_encode %{
10376     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10377     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10378   %}
10379   ins_pipe(pipe_slow);
10380 %}
10381 
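// Saturating vector add/sub.  Byte/short element types map directly onto the
// packed saturating instructions via vector_saturating_op; int/long element
// types are emulated, with an EVEX expansion (opmask temporaries) and an AVX
// expansion (xmm temporaries) selected by the predicates below.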
10382 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10383 %{
10384   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10385             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10386   match(Set dst (SaturatingAddV src1 src2));
10387   match(Set dst (SaturatingSubV src1 src2));
10388   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10389   ins_encode %{
10390     int vlen_enc = vector_length_encoding(this);
10391     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10392     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10393                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10394   %}
10395   ins_pipe(pipe_slow);
10396 %}
10397 
10398 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10399 %{
10400   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10401             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10402   match(Set dst (SaturatingAddV src1 src2));
10403   match(Set dst (SaturatingSubV src1 src2));
10404   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10405   ins_encode %{
10406     int vlen_enc = vector_length_encoding(this);
10407     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10408     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10409                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10410   %}
10411   ins_pipe(pipe_slow);
10412 %}
10413 
10414 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10415 %{
10416   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10417             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10418             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10419   match(Set dst (SaturatingAddV src1 src2));
10420   match(Set dst (SaturatingSubV src1 src2));
10421   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10422   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10423   ins_encode %{
10424     int vlen_enc = vector_length_encoding(this);
10425     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10426     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10427                                         $src1$$XMMRegister, $src2$$XMMRegister,
10428                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10429                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10430   %}
10431   ins_pipe(pipe_slow);
10432 %}
10433 
10434 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10435 %{
10436   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10437             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10438             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10439   match(Set dst (SaturatingAddV src1 src2));
10440   match(Set dst (SaturatingSubV src1 src2));
10441   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10442   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10443   ins_encode %{
10444     int vlen_enc = vector_length_encoding(this);
10445     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10446     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10447                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10448                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10449   %}
10450   ins_pipe(pipe_slow);
10451 %}
10452 
10453 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10454 %{
10455   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10456             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10457             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10458   match(Set dst (SaturatingAddV src1 src2));
10459   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10460   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10461   ins_encode %{
10462     int vlen_enc = vector_length_encoding(this);
10463     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10464     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10465                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10466   %}
10467   ins_pipe(pipe_slow);
10468 %}
10469 
10470 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10471 %{
10472   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10473             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10474             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10475   match(Set dst (SaturatingAddV src1 src2));
10476   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10477   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10478   ins_encode %{
10479     int vlen_enc = vector_length_encoding(this);
10480     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10481     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10482                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10483   %}
10484   ins_pipe(pipe_slow);
10485 %}
10486 
10487 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10488 %{
10489   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10490             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10491             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10492   match(Set dst (SaturatingSubV src1 src2));
10493   effect(TEMP ktmp);
10494   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10495   ins_encode %{
10496     int vlen_enc = vector_length_encoding(this);
10497     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10498     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10499                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10500   %}
10501   ins_pipe(pipe_slow);
10502 %}
10503 
10504 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10505 %{
10506   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10507             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10508             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10509   match(Set dst (SaturatingSubV src1 src2));
10510   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10511   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10512   ins_encode %{
10513     int vlen_enc = vector_length_encoding(this);
10514     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10515     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10516                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10517   %}
10518   ins_pipe(pipe_slow);
10519 %}
10520 
10521 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10522 %{
10523   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10524             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10525   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10526   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10527   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10528   ins_encode %{
10529     int vlen_enc = vector_length_encoding(this);
10530     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10531     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10532                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10533   %}
10534   ins_pipe(pipe_slow);
10535 %}
10536 
10537 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10538 %{
10539   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10540             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10541   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10542   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10543   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10544   ins_encode %{
10545     int vlen_enc = vector_length_encoding(this);
10546     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10547     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10548                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10549   %}
10550   ins_pipe(pipe_slow);
10551 %}
10552 
10553 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10554   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10555             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10556   match(Set dst (SaturatingAddV (Binary dst src) mask));
10557   match(Set dst (SaturatingSubV (Binary dst src) mask));
10558   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10559   ins_encode %{
10560     int vlen_enc = vector_length_encoding(this);
10561     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10562     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10563                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10564   %}
10565   ins_pipe( pipe_slow );
10566 %}
10567 
10568 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10569   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10570             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10571   match(Set dst (SaturatingAddV (Binary dst src) mask));
10572   match(Set dst (SaturatingSubV (Binary dst src) mask));
10573   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10574   ins_encode %{
10575     int vlen_enc = vector_length_encoding(this);
10576     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10577     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10578                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10579   %}
10580   ins_pipe( pipe_slow );
10581 %}
10582 
10583 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10584   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10585             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10586   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10587   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10588   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10589   ins_encode %{
10590     int vlen_enc = vector_length_encoding(this);
10591     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10592     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10593                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10594   %}
10595   ins_pipe( pipe_slow );
10596 %}
10597 
10598 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10599   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10600             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10601   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10602   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10603   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10604   ins_encode %{
10605     int vlen_enc = vector_length_encoding(this);
10606     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10607     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10608                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10609   %}
10610   ins_pipe( pipe_slow );
10611 %}
10612 
10613 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10614 %{
10615   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10616   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10617   ins_encode %{
10618     int vlen_enc = vector_length_encoding(this);
10619     BasicType bt = Matcher::vector_element_basic_type(this);
10620     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10621   %}
10622   ins_pipe(pipe_slow);
10623 %}
10624 
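// Scalar and vector half-precision (Float16) operations.  ReinterpretS2HF/HF2S
// move the 16-bit payload between a GPR and an XMM register with vmovw, the
// ConvF2HF/ConvHF2F fusions round-trip through vcvtps2ph/vcvtph2ps, and the
// remaining patterns use the FP16 forms of the usual arithmetic instructions.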
10625 instruct reinterpretS2HF(regF dst, rRegI src)
10626 %{
10627   match(Set dst (ReinterpretS2HF src));
10628   format %{ "vmovw $dst, $src" %}
10629   ins_encode %{
10630     __ vmovw($dst$$XMMRegister, $src$$Register);
10631   %}
10632   ins_pipe(pipe_slow);
10633 %}
10634 
10635 instruct reinterpretHF2S(rRegI dst, regF src)
10636 %{
10637   match(Set dst (ReinterpretHF2S src));
10638   format %{ "vmovw $dst, $src" %}
10639   ins_encode %{
10640     __ vmovw($dst$$Register, $src$$XMMRegister);
10641   %}
10642   ins_pipe(pipe_slow);
10643 %}
10644 
10645 instruct convF2HFAndS2HF(regF dst, regF src)
10646 %{
10647   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10648   format %{ "convF2HFAndS2HF $dst, $src" %}
10649   ins_encode %{
10650     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10651   %}
10652   ins_pipe(pipe_slow);
10653 %}
10654 
10655 instruct convHF2SAndHF2F(regF dst, regF src)
10656 %{
10657   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10658   format %{ "convHF2SAndHF2F $dst, $src" %}
10659   ins_encode %{
10660     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10661   %}
10662   ins_pipe(pipe_slow);
10663 %}
10664 
10665 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10666 %{
10667   match(Set dst (SqrtHF src));
10668   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10669   ins_encode %{
10670     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10671   %}
10672   ins_pipe(pipe_slow);
10673 %}
10674 
10675 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10676 %{
10677   match(Set dst (AddHF src1 src2));
10678   match(Set dst (DivHF src1 src2));
10679   match(Set dst (MulHF src1 src2));
10680   match(Set dst (SubHF src1 src2));
10681   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10682   ins_encode %{
10683     int opcode = this->ideal_Opcode();
10684     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10685   %}
10686   ins_pipe(pipe_slow);
10687 %}
10688 
10689 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10690 %{
10691   match(Set dst (MaxHF src1 src2));
10692   match(Set dst (MinHF src1 src2));
10693   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10694   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10695   ins_encode %{
10696     int opcode = this->ideal_Opcode();
10697     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10698                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10699   %}
10700   ins_pipe( pipe_slow );
10701 %}
10702 
10703 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10704 %{
10705   match(Set dst (FmaHF  src2 (Binary dst src1)));
10706   effect(DEF dst);
10707   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2" %}
10708   ins_encode %{
10709     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10710   %}
10711   ins_pipe( pipe_slow );
10712 %}
10713 
10714 
10715 instruct vector_sqrt_HF_reg(vec dst, vec src)
10716 %{
10717   match(Set dst (SqrtVHF src));
10718   format %{ "vector_sqrt_fp16 $dst, $src" %}
10719   ins_encode %{
10720     int vlen_enc = vector_length_encoding(this);
10721     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10722   %}
10723   ins_pipe(pipe_slow);
10724 %}
10725 
10726 instruct vector_sqrt_HF_mem(vec dst, memory src)
10727 %{
10728   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10729   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10730   ins_encode %{
10731     int vlen_enc = vector_length_encoding(this);
10732     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10733   %}
10734   ins_pipe(pipe_slow);
10735 %}
10736 
10737 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10738 %{
10739   match(Set dst (AddVHF src1 src2));
10740   match(Set dst (DivVHF src1 src2));
10741   match(Set dst (MulVHF src1 src2));
10742   match(Set dst (SubVHF src1 src2));
10743   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10744   ins_encode %{
10745     int vlen_enc = vector_length_encoding(this);
10746     int opcode = this->ideal_Opcode();
10747     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10748   %}
10749   ins_pipe(pipe_slow);
10750 %}
10751 
10752 
10753 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10754 %{
10755   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10756   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10757   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10758   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10759   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10760   ins_encode %{
10761     int vlen_enc = vector_length_encoding(this);
10762     int opcode = this->ideal_Opcode();
10763     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10764   %}
10765   ins_pipe(pipe_slow);
10766 %}
10767 
10768 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10769 %{
10770   match(Set dst (FmaVHF src2 (Binary dst src1)));
10771   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10772   ins_encode %{
10773     int vlen_enc = vector_length_encoding(this);
10774     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10775   %}
10776   ins_pipe( pipe_slow );
10777 %}
10778 
10779 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10780 %{
10781   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10782   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10783   ins_encode %{
10784     int vlen_enc = vector_length_encoding(this);
10785     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10786   %}
10787   ins_pipe( pipe_slow );
10788 %}
10789 
10790 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10791 %{
10792   match(Set dst (MinVHF src1 src2));
10793   match(Set dst (MaxVHF src1 src2));
10794   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10795   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10796   ins_encode %{
10797     int vlen_enc = vector_length_encoding(this);
10798     int opcode = this->ideal_Opcode();
10799     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10800                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10801   %}
10802   ins_pipe( pipe_slow );
10803 %}