//
// Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
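//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as save-on-call for both the allocator and the C calling
// convention, spilled as a float (Op_RegF), with hardware encoding 0 and
// backed by the VMReg slot returned by xmm0->as_VMReg().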

// XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words a-b hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
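// Note: k0 is not listed below; the EVEX encoding treats opmask k0 as
// "no masking", so it is left out of the allocatable set.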
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

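// Dynamic register classes: each reg_class_dynamic below resolves to its
// first (EVEX) class when the guard expression holds, and to the second
// (legacy) class otherwise.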
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
 1195     // The exception handler starts out as a jump and can be patched to
 1196     // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // three 5 byte instructions plus one move for unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
 1210     // The exception handler starts out as a jump and can be patched to
 1211     // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
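      // Maps a vector size in bytes to the AVX/EVEX vector-length encoding; e.g. 32 bytes maps to
      // Assembler::AVX_256bit, while 4-, 8- and 16-byte operands all use Assembler::AVX_128bit.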
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   C2_MacroAssembler _masm(&cbuf);
 1314   address base = __ start_a_stub(size_exception_handler());
 1315   if (base == nullptr) {
 1316     ciEnv::current()->record_failure("CodeCache is full");
 1317     return 0;  // CodeBuffer::expand failed
 1318   }
 1319   int offset = __ offset();
 1320   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1321   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1322   __ end_a_stub();
 1323   return offset;
 1324 }
 1325 
 1326 // Emit deopt handler code.
 1327 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1328 
 1329   // Note that the code buffer's insts_mark is always relative to insts.
 1330   // That's why we must use the macroassembler to generate a handler.
 1331   C2_MacroAssembler _masm(&cbuf);
 1332   address base = __ start_a_stub(size_deopt_handler());
 1333   if (base == nullptr) {
 1334     ciEnv::current()->record_failure("CodeCache is full");
 1335     return 0;  // CodeBuffer::expand failed
 1336   }
 1337   int offset = __ offset();
 1338 
 1339 #ifdef _LP64
 1340   address the_pc = (address) __ pc();
 1341   Label next;
 1342   // Push "the_pc" on the stack without destroying any registers,
 1343   // as they all may be live.
 1344 
 1345   // push address of "next"
 1346   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1347   __ bind(next);
 1348   // adjust it so it matches "the_pc"
 1349   __ subptr(Address(rsp, 0), __ offset() - offset);
 1350 #else
 1351   InternalAddress here(__ pc());
 1352   __ pushptr(here.addr(), noreg);
 1353 #endif
 1354 
 1355   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1356   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1357   __ end_a_stub();
 1358   return offset;
 1359 }
 1360 
 1361 Assembler::Width widthForType(BasicType bt) {
 1362   if (bt == T_BYTE) {
 1363     return Assembler::B;
 1364   } else if (bt == T_SHORT) {
 1365     return Assembler::W;
 1366   } else if (bt == T_INT) {
 1367     return Assembler::D;
 1368   } else {
 1369     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1370     return Assembler::Q;
 1371   }
 1372 }
 1373 
 1374 //=============================================================================
 1375 
 1376   // Float masks come from different places depending on platform.
 1377 #ifdef _LP64
 1378   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1379   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1380   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1381   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1382 #else
 1383   static address float_signmask()  { return (address)float_signmask_pool; }
 1384   static address float_signflip()  { return (address)float_signflip_pool; }
 1385   static address double_signmask() { return (address)double_signmask_pool; }
 1386   static address double_signflip() { return (address)double_signflip_pool; }
 1387 #endif
 1388   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1389   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1390   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1391   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1392   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1393   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1394   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1395   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1396   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1397   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1398   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1399   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1400   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1401   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1402   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1403 
 1404 //=============================================================================
 1405 bool Matcher::match_rule_supported(int opcode) {
 1406   if (!has_match_rule(opcode)) {
 1407     return false; // no match rule present
 1408   }
 1409   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1410   switch (opcode) {
 1411     case Op_AbsVL:
 1412     case Op_StoreVectorScatter:
 1413       if (UseAVX < 3) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountI:
 1418     case Op_PopCountL:
 1419       if (!UsePopCountInstruction) {
 1420         return false;
 1421       }
 1422       break;
 1423     case Op_PopCountVI:
 1424       if (UseAVX < 2) {
 1425         return false;
 1426       }
 1427       break;
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
 1444       if (VM_Version::supports_avx512dq() == false) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       if (!VM_Version::supports_cx8()) {
 1514         return false;
 1515       }
 1516       break;
 1517     case Op_StrIndexOf:
 1518       if (!UseSSE42Intrinsics) {
 1519         return false;
 1520       }
 1521       break;
 1522     case Op_StrIndexOfChar:
 1523       if (!UseSSE42Intrinsics) {
 1524         return false;
 1525       }
 1526       break;
 1527     case Op_OnSpinWait:
 1528       if (VM_Version::supports_on_spin_wait() == false) {
 1529         return false;
 1530       }
 1531       break;
 1532     case Op_MulVB:
 1533     case Op_LShiftVB:
 1534     case Op_RShiftVB:
 1535     case Op_URShiftVB:
 1536     case Op_VectorInsert:
 1537     case Op_VectorLoadMask:
 1538     case Op_VectorStoreMask:
 1539     case Op_VectorBlend:
 1540       if (UseSSE < 4) {
 1541         return false;
 1542       }
 1543       break;
 1544 #ifdef _LP64
 1545     case Op_MaxD:
 1546     case Op_MaxF:
 1547     case Op_MinD:
 1548     case Op_MinF:
 1549       if (UseAVX < 1) { // enabled for AVX only
 1550         return false;
 1551       }
 1552       break;
 1553 #endif
 1554     case Op_CacheWB:
 1555     case Op_CacheWBPreSync:
 1556     case Op_CacheWBPostSync:
 1557       if (!VM_Version::supports_data_cache_line_flush()) {
 1558         return false;
 1559       }
 1560       break;
 1561     case Op_ExtractB:
 1562     case Op_ExtractL:
 1563     case Op_ExtractI:
 1564     case Op_RoundDoubleMode:
 1565       if (UseSSE < 4) {
 1566         return false;
 1567       }
 1568       break;
 1569     case Op_RoundDoubleModeV:
 1570       if (VM_Version::supports_avx() == false) {
 1571         return false; // 128bit vroundpd is not available
 1572       }
 1573       break;
 1574     case Op_LoadVectorGather:
 1575       if (UseAVX < 2) {
 1576         return false;
 1577       }
 1578       break;
 1579     case Op_FmaF:
 1580     case Op_FmaD:
 1581     case Op_FmaVD:
 1582     case Op_FmaVF:
 1583       if (!UseFMA) {
 1584         return false;
 1585       }
 1586       break;
 1587     case Op_MacroLogicV:
 1588       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1589         return false;
 1590       }
 1591       break;
 1592 
 1593     case Op_VectorCmpMasked:
 1594     case Op_VectorMaskGen:
 1595       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1596         return false;
 1597       }
 1598       break;
 1599     case Op_VectorMaskFirstTrue:
 1600     case Op_VectorMaskLastTrue:
 1601     case Op_VectorMaskTrueCount:
 1602     case Op_VectorMaskToLong:
 1603       if (!is_LP64 || UseAVX < 1) {
 1604          return false;
 1605       }
 1606       break;
 1607     case Op_RoundF:
 1608     case Op_RoundD:
 1609       if (!is_LP64) {
 1610         return false;
 1611       }
 1612       break;
 1613     case Op_CopySignD:
 1614     case Op_CopySignF:
 1615       if (UseAVX < 3 || !is_LP64)  {
 1616         return false;
 1617       }
 1618       if (!VM_Version::supports_avx512vl()) {
 1619         return false;
 1620       }
 1621       break;
 1622 #ifndef _LP64
 1623     case Op_AddReductionVF:
 1624     case Op_AddReductionVD:
 1625     case Op_MulReductionVF:
 1626     case Op_MulReductionVD:
 1627       if (UseSSE < 1) { // requires at least SSE
 1628         return false;
 1629       }
 1630       break;
 1631     case Op_MulAddVS2VI:
 1632     case Op_RShiftVL:
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if (UseSSE < 2) {
 1636         return false;
 1637       }
 1638       break;
 1639 #endif // !LP64
 1640     case Op_CompressBits:
 1641       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1642         return false;
 1643       }
 1644       break;
 1645     case Op_ExpandBits:
 1646       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_SignumF:
 1651       if (UseSSE < 1) {
 1652         return false;
 1653       }
 1654       break;
 1655     case Op_SignumD:
 1656       if (UseSSE < 2) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_CompressM:
 1661       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1662         return false;
 1663       }
 1664       break;
 1665     case Op_CompressV:
 1666     case Op_ExpandV:
 1667       if (!VM_Version::supports_avx512vl()) {
 1668         return false;
 1669       }
 1670       break;
 1671     case Op_SqrtF:
 1672       if (UseSSE < 1) {
 1673         return false;
 1674       }
 1675       break;
 1676     case Op_SqrtD:
 1677 #ifdef _LP64
 1678       if (UseSSE < 2) {
 1679         return false;
 1680       }
 1681 #else
 1682       // x86_32.ad has a special match rule for SqrtD.
 1683       // Together with common x86 rules, this handles all UseSSE cases.
 1684 #endif
 1685       break;
 1686     case Op_ConvF2HF:
 1687     case Op_ConvHF2F:
 1688       if (!VM_Version::supports_float16()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_VectorCastF2HF:
 1693     case Op_VectorCastHF2F:
 1694       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1695         return false;
 1696       }
 1697       break;
 1698   }
 1699   return true;  // Match rules are supported by default.
 1700 }
 1701 
 1702 //------------------------------------------------------------------------
 1703 
 1704 static inline bool is_pop_count_instr_target(BasicType bt) {
 1705   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1706          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1707 }
 1708 
 1709 bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1710   return match_rule_supported_vector(opcode, vlen, bt);
 1711 }
 1712 
 1713 // Identify extra cases in which we might want to provide match rules for vector nodes and
 1714 // other intrinsics, guarded by vector length (vlen) and element type (bt).
 1715 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1716   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1717   if (!match_rule_supported(opcode)) {
 1718     return false;
 1719   }
 1720   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1721   //   * SSE2 supports 128bit vectors for all types;
 1722   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1723   //   * AVX2 supports 256bit vectors for all types;
 1724   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1725   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1726   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1727   // And MaxVectorSize is taken into account as well.
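        // For example, a 256-bit vector of ints (bt == T_INT, vlen == 8) is rejected here on an
        // AVX1-only machine, since AVX1 provides 256-bit vectors only for FLOAT and DOUBLE.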
 1728   if (!vector_size_supported(bt, vlen)) {
 1729     return false;
 1730   }
 1731   // Special cases which depend on the vector length follow:
 1732   //   * implementation limitations
 1733   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1734   //   * 128bit vroundpd instruction is present only in AVX1
 1735   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1736   switch (opcode) {
 1737     case Op_AbsVF:
 1738     case Op_NegVF:
 1739       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1740         return false; // 512bit vandps and vxorps are not available
 1741       }
 1742       break;
 1743     case Op_AbsVD:
 1744     case Op_NegVD:
 1745       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1746         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1747       }
 1748       break;
 1749     case Op_RotateRightV:
 1750     case Op_RotateLeftV:
 1751       if (bt != T_INT && bt != T_LONG) {
 1752         return false;
 1753       } // fallthrough
 1754     case Op_MacroLogicV:
 1755       if (!VM_Version::supports_evex() ||
 1756           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1757         return false;
 1758       }
 1759       break;
 1760     case Op_ClearArray:
 1761     case Op_VectorMaskGen:
 1762     case Op_VectorCmpMasked:
 1763       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1764         return false;
 1765       }
 1766       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1767         return false;
 1768       }
 1769       break;
 1770     case Op_LoadVectorMasked:
 1771     case Op_StoreVectorMasked:
 1772       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1773         return false;
 1774       }
 1775       break;
 1776     case Op_MaxV:
 1777     case Op_MinV:
 1778       if (UseSSE < 4 && is_integral_type(bt)) {
 1779         return false;
 1780       }
 1781       if (bt == T_FLOAT || bt == T_DOUBLE) {
 1782         // Float/Double intrinsics are enabled for AVX family currently.
 1783         if (UseAVX == 0) {
 1784           return false;
 1785         }
 1786         if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1787           return false;
 1788         }
 1789       }
 1790       break;
 1791     case Op_CallLeafVector:
 1792       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1793         return false;
 1794       }
 1795       break;
 1796     case Op_AddReductionVI:
 1797       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1798         return false;
 1799       }
 1800       // fallthrough
 1801     case Op_AndReductionV:
 1802     case Op_OrReductionV:
 1803     case Op_XorReductionV:
 1804       if (is_subword_type(bt) && (UseSSE < 4)) {
 1805         return false;
 1806       }
 1807 #ifndef _LP64
 1808       if (bt == T_BYTE || bt == T_LONG) {
 1809         return false;
 1810       }
 1811 #endif
 1812       break;
 1813 #ifndef _LP64
 1814     case Op_VectorInsert:
 1815       if (bt == T_LONG || bt == T_DOUBLE) {
 1816         return false;
 1817       }
 1818       break;
 1819 #endif
 1820     case Op_MinReductionV:
 1821     case Op_MaxReductionV:
 1822       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1823         return false;
 1824       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1825         return false;
 1826       }
 1827       // Float/Double intrinsics enabled for AVX family.
 1828       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1829         return false;
 1830       }
 1831       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1832         return false;
 1833       }
 1834 #ifndef _LP64
 1835       if (bt == T_BYTE || bt == T_LONG) {
 1836         return false;
 1837       }
 1838 #endif
 1839       break;
 1840     case Op_VectorTest:
 1841       if (UseSSE < 4) {
 1842         return false; // Implementation limitation
 1843       } else if (size_in_bits < 32) {
 1844         return false; // Implementation limitation
 1845       }
 1846       break;
 1847     case Op_VectorLoadShuffle:
 1848     case Op_VectorRearrange:
 1849       if(vlen == 2) {
 1850         return false; // Implementation limitation due to how shuffle is loaded
 1851       } else if (size_in_bits == 256 && UseAVX < 2) {
 1852         return false; // Implementation limitation
 1853       }
 1854       break;
 1855     case Op_VectorLoadMask:
 1856     case Op_VectorMaskCast:
 1857       if (size_in_bits == 256 && UseAVX < 2) {
 1858         return false; // Implementation limitation
 1859       }
 1860       // fallthrough
 1861     case Op_VectorStoreMask:
 1862       if (vlen == 2) {
 1863         return false; // Implementation limitation
 1864       }
 1865       break;
 1866     case Op_PopulateIndex:
 1867       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_VectorCastB2X:
 1872     case Op_VectorCastS2X:
 1873     case Op_VectorCastI2X:
 1874       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1875         return false;
 1876       }
 1877       break;
 1878     case Op_VectorCastL2X:
 1879       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1880         return false;
 1881       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1882         return false;
 1883       }
 1884       break;
 1885     case Op_VectorCastF2X: {
 1886         // As per JLS section 5.1.3, a narrowing conversion to a sub-word type
 1887         // happens after an intermediate conversion to int, and the special handling
 1888         // code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1889         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1890         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1891           return false;
 1892         }
 1893       }
 1894       // fallthrough
 1895     case Op_VectorCastD2X:
 1896       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1897         return false;
 1898       }
 1899       break;
 1900     case Op_VectorCastF2HF:
 1901     case Op_VectorCastHF2F:
 1902       if (!VM_Version::supports_f16c() &&
 1903          ((!VM_Version::supports_evex() ||
 1904          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1905         return false;
 1906       }
 1907       break;
 1908     case Op_RoundVD:
 1909       if (!VM_Version::supports_avx512dq()) {
 1910         return false;
 1911       }
 1912       break;
 1913     case Op_MulReductionVI:
 1914       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1915         return false;
 1916       }
 1917       break;
 1918     case Op_LoadVectorGatherMasked:
 1919     case Op_StoreVectorScatterMasked:
 1920     case Op_StoreVectorScatter:
 1921       if (is_subword_type(bt)) {
 1922         return false;
 1923       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1924         return false;
 1925       }
 1926       // fallthrough
 1927     case Op_LoadVectorGather:
 1928       if (size_in_bits == 64) {
 1929         return false;
 1930       }
 1931       break;
 1932     case Op_MaskAll:
 1933       if (!VM_Version::supports_evex()) {
 1934         return false;
 1935       }
 1936       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1937         return false;
 1938       }
 1939       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1940         return false;
 1941       }
 1942       break;
 1943     case Op_VectorMaskCmp:
 1944       if (vlen < 2 || size_in_bits < 32) {
 1945         return false;
 1946       }
 1947       break;
 1948     case Op_CompressM:
 1949       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1950         return false;
 1951       }
 1952       break;
 1953     case Op_CompressV:
 1954     case Op_ExpandV:
 1955       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1956         return false;
 1957       }
 1958       if (size_in_bits < 128) {
 1959         return false;
 1960       }
 1961       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1962         return false;
 1963       }
 1964       break;
 1965     case Op_VectorLongToMask:
 1966       if (UseAVX < 1 || !is_LP64) {
 1967         return false;
 1968       }
 1969       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_SignumVD:
 1974     case Op_SignumVF:
 1975       if (UseAVX < 1) {
 1976         return false;
 1977       }
 1978       break;
 1979     case Op_PopCountVI:
 1980     case Op_PopCountVL: {
 1981         if (!is_pop_count_instr_target(bt) &&
 1982             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1983           return false;
 1984         }
 1985       }
 1986       break;
 1987     case Op_ReverseV:
 1988     case Op_ReverseBytesV:
 1989       if (UseAVX < 2) {
 1990         return false;
 1991       }
 1992       break;
 1993     case Op_CountTrailingZerosV:
 1994     case Op_CountLeadingZerosV:
 1995       if (UseAVX < 2) {
 1996         return false;
 1997       }
 1998       break;
 1999   }
 2000   return true;  // Per default match rules are supported.
 2001 }
 2002 
 2003 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 2004   // The ADLC-based match_rule_supported routine checks for the existence of a pattern based
 2005   // on the IR opcode. Most unary/binary/ternary masked operations share the IR nodes
 2006   // of their non-masked counterparts, with the mask edge being the differentiator.
 2007   // This routine does a strict check for the existence of masked operation patterns:
 2008   // it returns false by default for all opcodes apart from the
 2009   // ones whose masked instruction patterns are defined in this file.
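        // For example, a masked AddVI uses the same AddVI IR node as the unmasked form, with an
        // extra mask input; only the opcodes listed below have masked instruction patterns.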
 2010   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2011     return false;
 2012   }
 2013 
 2014   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2015   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2016   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2017     return false;
 2018   }
 2019   switch(opcode) {
 2020     // Unary masked operations
 2021     case Op_AbsVB:
 2022     case Op_AbsVS:
 2023       if(!VM_Version::supports_avx512bw()) {
 2024         return false;  // Implementation limitation
 2025       }
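            // fallthrough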
 2026     case Op_AbsVI:
 2027     case Op_AbsVL:
 2028       return true;
 2029 
 2030     // Ternary masked operations
 2031     case Op_FmaVF:
 2032     case Op_FmaVD:
 2033       return true;
 2034 
 2035     case Op_MacroLogicV:
 2036       if(bt != T_INT && bt != T_LONG) {
 2037         return false;
 2038       }
 2039       return true;
 2040 
 2041     // Binary masked operations
 2042     case Op_AddVB:
 2043     case Op_AddVS:
 2044     case Op_SubVB:
 2045     case Op_SubVS:
 2046     case Op_MulVS:
 2047     case Op_LShiftVS:
 2048     case Op_RShiftVS:
 2049     case Op_URShiftVS:
 2050       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2051       if (!VM_Version::supports_avx512bw()) {
 2052         return false;  // Implementation limitation
 2053       }
 2054       return true;
 2055 
 2056     case Op_MulVL:
 2057       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2058       if (!VM_Version::supports_avx512dq()) {
 2059         return false;  // Implementation limitation
 2060       }
 2061       return true;
 2062 
 2063     case Op_AndV:
 2064     case Op_OrV:
 2065     case Op_XorV:
 2066     case Op_RotateRightV:
 2067     case Op_RotateLeftV:
 2068       if (bt != T_INT && bt != T_LONG) {
 2069         return false; // Implementation limitation
 2070       }
 2071       return true;
 2072 
 2073     case Op_VectorLoadMask:
 2074       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2075       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2076         return false;
 2077       }
 2078       return true;
 2079 
 2080     case Op_AddVI:
 2081     case Op_AddVL:
 2082     case Op_AddVF:
 2083     case Op_AddVD:
 2084     case Op_SubVI:
 2085     case Op_SubVL:
 2086     case Op_SubVF:
 2087     case Op_SubVD:
 2088     case Op_MulVI:
 2089     case Op_MulVF:
 2090     case Op_MulVD:
 2091     case Op_DivVF:
 2092     case Op_DivVD:
 2093     case Op_SqrtVF:
 2094     case Op_SqrtVD:
 2095     case Op_LShiftVI:
 2096     case Op_LShiftVL:
 2097     case Op_RShiftVI:
 2098     case Op_RShiftVL:
 2099     case Op_URShiftVI:
 2100     case Op_URShiftVL:
 2101     case Op_LoadVectorMasked:
 2102     case Op_StoreVectorMasked:
 2103     case Op_LoadVectorGatherMasked:
 2104     case Op_StoreVectorScatterMasked:
 2105       return true;
 2106 
 2107     case Op_MaxV:
 2108     case Op_MinV:
 2109       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2110         return false; // Implementation limitation
 2111       }
 2112       if (is_floating_point_type(bt)) {
 2113         return false; // Implementation limitation
 2114       }
 2115       return true;
 2116 
 2117     case Op_VectorMaskCmp:
 2118       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2119         return false; // Implementation limitation
 2120       }
 2121       return true;
 2122 
 2123     case Op_VectorRearrange:
 2124       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2125         return false; // Implementation limitation
 2126       }
 2127       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2128         return false; // Implementation limitation
 2129       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2130         return false; // Implementation limitation
 2131       }
 2132       return true;
 2133 
 2134     // Binary Logical operations
 2135     case Op_AndVMask:
 2136     case Op_OrVMask:
 2137     case Op_XorVMask:
 2138       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2139         return false; // Implementation limitation
 2140       }
 2141       return true;
 2142 
 2143     case Op_PopCountVI:
 2144     case Op_PopCountVL:
 2145       if (!is_pop_count_instr_target(bt)) {
 2146         return false;
 2147       }
 2148       return true;
 2149 
 2150     case Op_MaskAll:
 2151       return true;
 2152 
 2153     case Op_CountLeadingZerosV:
 2154       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2155         return true;
 2156       }
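            // fallthrough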
 2157     default:
 2158       return false;
 2159   }
 2160 }
 2161 
 2162 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2163   return false;
 2164 }
 2165 
 2166 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2167   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2168   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2169   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2170       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2171     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2172     return new legVecZOper();
 2173   }
 2174   if (legacy) {
 2175     switch (ideal_reg) {
 2176       case Op_VecS: return new legVecSOper();
 2177       case Op_VecD: return new legVecDOper();
 2178       case Op_VecX: return new legVecXOper();
 2179       case Op_VecY: return new legVecYOper();
 2180       case Op_VecZ: return new legVecZOper();
 2181     }
 2182   } else {
 2183     switch (ideal_reg) {
 2184       case Op_VecS: return new vecSOper();
 2185       case Op_VecD: return new vecDOper();
 2186       case Op_VecX: return new vecXOper();
 2187       case Op_VecY: return new vecYOper();
 2188       case Op_VecZ: return new vecZOper();
 2189     }
 2190   }
 2191   ShouldNotReachHere();
 2192   return nullptr;
 2193 }
 2194 
 2195 bool Matcher::is_reg2reg_move(MachNode* m) {
 2196   switch (m->rule()) {
 2197     case MoveVec2Leg_rule:
 2198     case MoveLeg2Vec_rule:
 2199     case MoveF2VL_rule:
 2200     case MoveF2LEG_rule:
 2201     case MoveVL2F_rule:
 2202     case MoveLEG2F_rule:
 2203     case MoveD2VL_rule:
 2204     case MoveD2LEG_rule:
 2205     case MoveVL2D_rule:
 2206     case MoveLEG2D_rule:
 2207       return true;
 2208     default:
 2209       return false;
 2210   }
 2211 }
 2212 
 2213 bool Matcher::is_generic_vector(MachOper* opnd) {
 2214   switch (opnd->opcode()) {
 2215     case VEC:
 2216     case LEGVEC:
 2217       return true;
 2218     default:
 2219       return false;
 2220   }
 2221 }
 2222 
 2223 //------------------------------------------------------------------------
 2224 
 2225 const RegMask* Matcher::predicate_reg_mask(void) {
 2226   return &_VECTMASK_REG_mask;
 2227 }
 2228 
 2229 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2230   return new TypeVectMask(elemTy, length);
 2231 }
 2232 
 2233 // Max vector size in bytes. 0 if not supported.
 2234 int Matcher::vector_width_in_bytes(BasicType bt) {
 2235   assert(is_java_primitive(bt), "only primitive type vectors");
 2236   if (UseSSE < 2) return 0;
 2237   // SSE2 supports 128bit vectors for all types.
 2238   // AVX2 supports 256bit vectors for all types.
 2239   // EVEX (AVX-512) supports 512bit vectors for all types.
 2240   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
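        // e.g. UseAVX == 2 -> (1 << 2) * 8 = 32 bytes, UseAVX == 3 -> (1 << 3) * 8 = 64 bytes
        // (refined further below).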
 2241   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2242   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2243     size = (UseAVX > 2) ? 64 : 32;
 2244   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2245     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2246   // Use flag to limit vector size.
  size = MIN2(size, (int)MaxVectorSize);
  // Minimum of 2 elements per vector (4 for byte-sized elements), i.e. at least 4 bytes.
  switch (bt) {
  case T_DOUBLE:
  case T_LONG:
    if (size < 16) return 0;
    break;
  case T_FLOAT:
  case T_INT:
    if (size < 8) return 0;
    break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
 2270   default:
 2271     ShouldNotReachHere();
 2272   }
 2273   return size;
 2274 }
 2275 
 2276 // Limits on vector size (number of elements) loaded into vector.
 2277 int Matcher::max_vector_size(const BasicType bt) {
 2278   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2279 }
 2280 int Matcher::min_vector_size(const BasicType bt) {
 2281   int max_size = max_vector_size(bt);
  // Minimum vector is 4 bytes, i.e. 2 elements (4 for byte-sized elements).
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors to support calling svml double64 routines.
 2285   if (bt == T_DOUBLE) {
 2286     size = 1;
 2287   }
  return MIN2(size, max_size);
 2289 }
 2290 
 2291 int Matcher::superword_max_vector_size(const BasicType bt) {
 2292   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2293   // by default on Cascade Lake
 2294   if (VM_Version::is_default_intel_cascade_lake()) {
 2295     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2296   }
 2297   return Matcher::max_vector_size(bt);
 2298 }
 2299 
 2300 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2301   return -1;
 2302 }
 2303 
 2304 // Vector ideal reg corresponding to specified size in bytes
 2305 uint Matcher::vector_ideal_reg(int size) {
  assert(MaxVectorSize >= size, "vector size exceeds MaxVectorSize");
 2307   switch(size) {
 2308     case  4: return Op_VecS;
 2309     case  8: return Op_VecD;
 2310     case 16: return Op_VecX;
 2311     case 32: return Op_VecY;
 2312     case 64: return Op_VecZ;
 2313   }
 2314   ShouldNotReachHere();
 2315   return 0;
 2316 }
 2317 
 2318 // Check for shift by small constant as well
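// For example, with an address sub-expression (LShiftX idx con) where con <= 3, the shift
// can typically be folded into the x86 scaled-index addressing mode [base + idx*scale]
// rather than being computed into a separate register.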
 2319 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2320   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2321       shift->in(2)->get_int() <= 3 &&
 2322       // Are there other uses besides address expressions?
 2323       !matcher->is_visited(shift)) {
 2324     address_visited.set(shift->_idx); // Flag as address_visited
 2325     mstack.push(shift->in(2), Matcher::Visit);
 2326     Node *conv = shift->in(1);
 2327 #ifdef _LP64
    // Allow the Matcher to match rules that bypass the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
 2331     if (conv->Opcode() == Op_ConvI2L &&
 2332         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2333         // Are there other uses besides address expressions?
 2334         !matcher->is_visited(conv)) {
 2335       address_visited.set(conv->_idx); // Flag as address_visited
 2336       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2337     } else
 2338 #endif
 2339       mstack.push(conv, Matcher::Pre_Visit);
 2340     return true;
 2341   }
 2342   return false;
 2343 }
 2344 
// Identify sub-graphs in which a 'load' node is an input to two different nodes
// so that the pattern can be matched with BMI instructions like blsi, blsr, etc.
// For example, b = -a[i] & a[i] can be matched to blsi r32, m32.
 2349 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2350 // refers to the same node.
 2351 //
 2352 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2353 // This is a temporary solution until we make DAGs expressible in ADL.
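// ConType selects the constant type matched against _con_op, e.g. TypeInt with Op_ConI or
// TypeLong with Op_ConL (see is_bmi_pattern() below).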
 2354 template<typename ConType>
 2355 class FusedPatternMatcher {
 2356   Node* _op1_node;
 2357   Node* _mop_node;
 2358   int _con_op;
 2359 
 2360   static int match_next(Node* n, int next_op, int next_op_idx) {
 2361     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2362       return -1;
 2363     }
 2364 
 2365     if (next_op_idx == -1) { // n is commutative, try rotations
 2366       if (n->in(1)->Opcode() == next_op) {
 2367         return 1;
 2368       } else if (n->in(2)->Opcode() == next_op) {
 2369         return 2;
 2370       }
 2371     } else {
 2372       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2373       if (n->in(next_op_idx)->Opcode() == next_op) {
 2374         return next_op_idx;
 2375       }
 2376     }
 2377     return -1;
 2378   }
 2379 
 2380  public:
 2381   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2382     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2383 
 2384   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2385              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2386              typename ConType::NativeType con_value) {
 2387     if (_op1_node->Opcode() != op1) {
 2388       return false;
 2389     }
 2390     if (_mop_node->outcnt() > 2) {
 2391       return false;
 2392     }
 2393     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2394     if (op1_op2_idx == -1) {
 2395       return false;
 2396     }
 2397     // Memory operation must be the other edge
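    // ((x & 1) + 1 maps edge index 1 -> 2 and 2 -> 1.)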
 2398     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2399 
 2400     // Check that the mop node is really what we want
 2401     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2402       Node* op2_node = _op1_node->in(op1_op2_idx);
 2403       if (op2_node->outcnt() > 1) {
 2404         return false;
 2405       }
 2406       assert(op2_node->Opcode() == op2, "Should be");
 2407       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2408       if (op2_con_idx == -1) {
 2409         return false;
 2410       }
 2411       // Memory operation must be the other edge
 2412       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2413       // Check that the memory operation is the same node
 2414       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2415         // Now check the constant
 2416         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2417         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2418           return true;
 2419         }
 2420       }
 2421     }
 2422     return false;
 2423   }
 2424 };
 2425 
 2426 static bool is_bmi_pattern(Node* n, Node* m) {
 2427   assert(UseBMI1Instructions, "sanity");
 2428   if (n != nullptr && m != nullptr) {
 2429     if (m->Opcode() == Op_LoadI) {
 2430       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2431       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2432              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2433              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2434     } else if (m->Opcode() == Op_LoadL) {
 2435       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2436       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2437              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2438              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2439     }
 2440   }
 2441   return false;
 2442 }
 2443 
 2444 // Should the matcher clone input 'm' of node 'n'?
 2445 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2446   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2447   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2448     mstack.push(m, Visit);
 2449     return true;
 2450   }
 2451   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2452     mstack.push(m, Visit);           // m = ShiftCntV
 2453     return true;
 2454   }
 2455   return false;
 2456 }
 2457 
// Should the Matcher clone shifts on addressing modes, expecting them
// to be subsumed into complex addressing expressions, or should it
// compute them into registers?
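// For example, an (AddP base (AddP base adr (LShiftX idx con)) off) chain with a constant
// 'off' can typically be subsumed into a single [adr + idx*scale + disp] operand instead of
// materializing the shifted index in a register first.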
 2461 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2462   Node *off = m->in(AddPNode::Offset);
 2463   if (off->is_Con()) {
 2464     address_visited.test_set(m->_idx); // Flag as address_visited
 2465     Node *adr = m->in(AddPNode::Address);
 2466 
 2467     // Intel can handle 2 adds in addressing mode
 2468     // AtomicAdd is not an addressing expression.
 2469     // Cheap to find it by looking for screwy base.
 2470     if (adr->is_AddP() &&
 2471         !adr->in(AddPNode::Base)->is_top() &&
 2472         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2473         // Are there other uses besides address expressions?
 2474         !is_visited(adr)) {
 2475       address_visited.set(adr->_idx); // Flag as address_visited
 2476       Node *shift = adr->in(AddPNode::Offset);
 2477       if (!clone_shift(shift, this, mstack, address_visited)) {
 2478         mstack.push(shift, Pre_Visit);
 2479       }
 2480       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2481       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2482     } else {
 2483       mstack.push(adr, Pre_Visit);
 2484     }
 2485 
 2486     // Clone X+offset as it also folds into most addressing expressions
 2487     mstack.push(off, Visit);
 2488     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2489     return true;
 2490   } else if (clone_shift(off, this, mstack, address_visited)) {
 2491     address_visited.test_set(m->_idx); // Flag as address_visited
 2492     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2493     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2494     return true;
 2495   }
 2496   return false;
 2497 }
 2498 
 2499 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2500   switch (bt) {
 2501     case BoolTest::eq:
 2502       return Assembler::eq;
 2503     case BoolTest::ne:
 2504       return Assembler::neq;
 2505     case BoolTest::le:
 2506     case BoolTest::ule:
 2507       return Assembler::le;
 2508     case BoolTest::ge:
 2509     case BoolTest::uge:
 2510       return Assembler::nlt;
 2511     case BoolTest::lt:
 2512     case BoolTest::ult:
 2513       return Assembler::lt;
 2514     case BoolTest::gt:
 2515     case BoolTest::ugt:
 2516       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2518   }
 2519 }
 2520 
 2521 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2522   switch (bt) {
 2523   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2524   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2525   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2526   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2527   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2528   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2529   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2530   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2531   }
 2532 }
 2533 
 2534 // Helper methods for MachSpillCopyNode::implementation().
 2535 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2536                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  assert(ireg == Op_VecS || // 32bit vector
         ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
         "no non-adjacent vector moves" );
 2541   if (cbuf) {
 2542     C2_MacroAssembler _masm(cbuf);
 2543     switch (ireg) {
 2544     case Op_VecS: // copy whole register
 2545     case Op_VecD:
 2546     case Op_VecX:
 2547 #ifndef _LP64
 2548       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2549 #else
 2550       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2551         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2552       } else {
 2553         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2554      }
 2555 #endif
 2556       break;
 2557     case Op_VecY:
 2558 #ifndef _LP64
 2559       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2560 #else
 2561       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2562         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2563       } else {
 2564         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2565      }
 2566 #endif
 2567       break;
 2568     case Op_VecZ:
 2569       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2570       break;
 2571     default:
 2572       ShouldNotReachHere();
 2573     }
 2574 #ifndef PRODUCT
 2575   } else {
 2576     switch (ireg) {
 2577     case Op_VecS:
 2578     case Op_VecD:
 2579     case Op_VecX:
 2580       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2581       break;
 2582     case Op_VecY:
 2583     case Op_VecZ:
 2584       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2585       break;
 2586     default:
 2587       ShouldNotReachHere();
 2588     }
 2589 #endif
 2590   }
 2591 }
 2592 
 2593 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2594                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2595   if (cbuf) {
 2596     C2_MacroAssembler _masm(cbuf);
 2597     if (is_load) {
 2598       switch (ireg) {
 2599       case Op_VecS:
 2600         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2601         break;
 2602       case Op_VecD:
 2603         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2604         break;
 2605       case Op_VecX:
 2606 #ifndef _LP64
 2607         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2608 #else
 2609         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2610           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2611         } else {
 2612           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2614         }
 2615 #endif
 2616         break;
 2617       case Op_VecY:
 2618 #ifndef _LP64
 2619         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2620 #else
 2621         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2622           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2623         } else {
 2624           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2626         }
 2627 #endif
 2628         break;
 2629       case Op_VecZ:
 2630         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2631         break;
 2632       default:
 2633         ShouldNotReachHere();
 2634       }
 2635     } else { // store
 2636       switch (ireg) {
 2637       case Op_VecS:
 2638         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2639         break;
 2640       case Op_VecD:
 2641         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2642         break;
 2643       case Op_VecX:
 2644 #ifndef _LP64
 2645         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2646 #else
 2647         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2648           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2649         }
 2650         else {
 2651           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2652         }
 2653 #endif
 2654         break;
 2655       case Op_VecY:
 2656 #ifndef _LP64
 2657         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2658 #else
 2659         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2660           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2661         }
 2662         else {
 2663           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2664         }
 2665 #endif
 2666         break;
 2667       case Op_VecZ:
 2668         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2669         break;
 2670       default:
 2671         ShouldNotReachHere();
 2672       }
 2673     }
 2674 #ifndef PRODUCT
 2675   } else {
 2676     if (is_load) {
 2677       switch (ireg) {
 2678       case Op_VecS:
 2679         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2680         break;
 2681       case Op_VecD:
 2682         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2683         break;
      case Op_VecX:
 2685         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2686         break;
 2687       case Op_VecY:
 2688       case Op_VecZ:
 2689         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2690         break;
 2691       default:
 2692         ShouldNotReachHere();
 2693       }
 2694     } else { // store
 2695       switch (ireg) {
 2696       case Op_VecS:
 2697         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2698         break;
 2699       case Op_VecD:
 2700         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2701         break;
      case Op_VecX:
 2703         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2704         break;
 2705       case Op_VecY:
 2706       case Op_VecZ:
 2707         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2708         break;
 2709       default:
 2710         ShouldNotReachHere();
 2711       }
 2712     }
 2713 #endif
 2714   }
 2715 }
 2716 
 2717 template <class T>
 2718 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
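  // Build an array holding 'len' copies of the immediate 'con', stored as element type 'bt'
  // (typically used when materializing a replicated vector constant).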
 2719   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2720   jvalue ele;
 2721   switch (bt) {
 2722     case T_BYTE:   ele.b = con; break;
 2723     case T_SHORT:  ele.s = con; break;
 2724     case T_INT:    ele.i = con; break;
 2725     case T_LONG:   ele.j = con; break;
 2726     case T_FLOAT:  ele.f = con; break;
 2727     case T_DOUBLE: ele.d = con; break;
 2728     default: ShouldNotReachHere();
 2729   }
 2730   for (int i = 0; i < len; i++) {
 2731     val->append(ele);
 2732   }
 2733   return val;
 2734 }
 2735 
 2736 static inline jlong high_bit_set(BasicType bt) {
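  // Return a 64-bit pattern with the sign (highest) bit of every 'bt'-sized element set.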
 2737   switch (bt) {
 2738     case T_BYTE:  return 0x8080808080808080;
 2739     case T_SHORT: return 0x8000800080008000;
 2740     case T_INT:   return 0x8000000080000000;
 2741     case T_LONG:  return 0x8000000000000000;
 2742     default:
 2743       ShouldNotReachHere();
 2744       return 0;
 2745   }
 2746 }
 2747 
 2748 #ifndef PRODUCT
 2749   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2750     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2751   }
 2752 #endif
 2753 
 2754   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2755     C2_MacroAssembler _masm(&cbuf);
 2756     __ nop(_count);
 2757   }
 2758 
 2759   uint MachNopNode::size(PhaseRegAlloc*) const {
 2760     return _count;
 2761   }
 2762 
 2763 #ifndef PRODUCT
 2764   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2765     st->print("# breakpoint");
 2766   }
 2767 #endif
 2768 
 2769   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2770     C2_MacroAssembler _masm(&cbuf);
 2771     __ int3();
 2772   }
 2773 
 2774   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2775     return MachNode::size(ra_);
 2776   }
 2777 
 2778 %}
 2779 
 2780 encode %{
 2781 
 2782   enc_class call_epilog %{
 2783     C2_MacroAssembler _masm(&cbuf);
 2784     if (VerifyStackAtCalls) {
 2785       // Check that stack depth is unchanged: find majik cookie on stack
 2786       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2787       Label L;
 2788       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2789       __ jccb(Assembler::equal, L);
 2790       // Die if stack mismatch
 2791       __ int3();
 2792       __ bind(L);
 2793     }
 2794     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2796       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
      // Search for the corresponding projection, get the register, and emit code that initializes it.
 2798       uint con = (tf()->range_cc()->cnt() - 1);
 2799       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2800         ProjNode* proj = fast_out(i)->as_Proj();
 2801         if (proj->_con == con) {
 2802           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2803           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2804           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2805           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2806           __ testq(rax, rax);
 2807           __ setb(Assembler::notZero, toReg);
 2808           __ movzbl(toReg, toReg);
 2809           if (reg->is_stack()) {
 2810             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2811             __ movq(Address(rsp, st_off), toReg);
 2812           }
 2813           break;
 2814         }
 2815       }
 2816       if (return_value_is_used()) {
 2817         // An inline type is returned as fields in multiple registers.
 2818         // Rax either contains an oop if the inline type is buffered or a pointer
 2819         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2820         // if the lowest bit is set to allow C2 to use the oop after null checking.
 2821         // rax &= (rax & 1) - 1
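        // If the lowest bit is set, (rax & 1) - 1 == 0 and the AND clears rax;
        // otherwise it is -1 (all ones) and rax is left unchanged.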
 2822         __ movptr(rscratch1, rax);
 2823         __ andptr(rscratch1, 0x1);
 2824         __ subptr(rscratch1, 0x1);
 2825         __ andptr(rax, rscratch1);
 2826       }
 2827     }
 2828   %}
 2829 
 2830 %}
 2831 
 2832 // Operands for bound floating pointer register arguments
 2833 operand rxmm0() %{
 2834   constraint(ALLOC_IN_RC(xmm0_reg));
 2835   match(VecX);
  format %{ %}
 2837   interface(REG_INTER);
 2838 %}
 2839 
 2840 //----------OPERANDS-----------------------------------------------------------
 2841 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
 2843 // instruction definitions.
 2844 
 2845 // Vectors
 2846 
 2847 // Dummy generic vector class. Should be used for all vector operands.
 2848 // Replaced with vec[SDXYZ] during post-selection pass.
 2849 operand vec() %{
 2850   constraint(ALLOC_IN_RC(dynamic));
 2851   match(VecX);
 2852   match(VecY);
 2853   match(VecZ);
 2854   match(VecS);
 2855   match(VecD);
 2856 
 2857   format %{ %}
 2858   interface(REG_INTER);
 2859 %}
 2860 
 2861 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2862 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2863 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2864 // runtime code generation via reg_class_dynamic.
 2865 operand legVec() %{
 2866   constraint(ALLOC_IN_RC(dynamic));
 2867   match(VecX);
 2868   match(VecY);
 2869   match(VecZ);
 2870   match(VecS);
 2871   match(VecD);
 2872 
 2873   format %{ %}
 2874   interface(REG_INTER);
 2875 %}
 2876 
 2877 // Replaces vec during post-selection cleanup. See above.
 2878 operand vecS() %{
 2879   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2880   match(VecS);
 2881 
 2882   format %{ %}
 2883   interface(REG_INTER);
 2884 %}
 2885 
 2886 // Replaces legVec during post-selection cleanup. See above.
 2887 operand legVecS() %{
 2888   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2889   match(VecS);
 2890 
 2891   format %{ %}
 2892   interface(REG_INTER);
 2893 %}
 2894 
 2895 // Replaces vec during post-selection cleanup. See above.
 2896 operand vecD() %{
 2897   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2898   match(VecD);
 2899 
 2900   format %{ %}
 2901   interface(REG_INTER);
 2902 %}
 2903 
 2904 // Replaces legVec during post-selection cleanup. See above.
 2905 operand legVecD() %{
 2906   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2907   match(VecD);
 2908 
 2909   format %{ %}
 2910   interface(REG_INTER);
 2911 %}
 2912 
 2913 // Replaces vec during post-selection cleanup. See above.
 2914 operand vecX() %{
 2915   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2916   match(VecX);
 2917 
 2918   format %{ %}
 2919   interface(REG_INTER);
 2920 %}
 2921 
 2922 // Replaces legVec during post-selection cleanup. See above.
 2923 operand legVecX() %{
 2924   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2925   match(VecX);
 2926 
 2927   format %{ %}
 2928   interface(REG_INTER);
 2929 %}
 2930 
 2931 // Replaces vec during post-selection cleanup. See above.
 2932 operand vecY() %{
 2933   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2934   match(VecY);
 2935 
 2936   format %{ %}
 2937   interface(REG_INTER);
 2938 %}
 2939 
 2940 // Replaces legVec during post-selection cleanup. See above.
 2941 operand legVecY() %{
 2942   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2943   match(VecY);
 2944 
 2945   format %{ %}
 2946   interface(REG_INTER);
 2947 %}
 2948 
 2949 // Replaces vec during post-selection cleanup. See above.
 2950 operand vecZ() %{
 2951   constraint(ALLOC_IN_RC(vectorz_reg));
 2952   match(VecZ);
 2953 
 2954   format %{ %}
 2955   interface(REG_INTER);
 2956 %}
 2957 
 2958 // Replaces legVec during post-selection cleanup. See above.
 2959 operand legVecZ() %{
 2960   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2961   match(VecZ);
 2962 
 2963   format %{ %}
 2964   interface(REG_INTER);
 2965 %}
 2966 
 2967 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2968 
 2969 // ============================================================================
 2970 
 2971 instruct ShouldNotReachHere() %{
 2972   match(Halt);
 2973   format %{ "stop\t# ShouldNotReachHere" %}
 2974   ins_encode %{
 2975     if (is_reachable()) {
 2976       __ stop(_halt_reason);
 2977     }
 2978   %}
 2979   ins_pipe(pipe_slow);
 2980 %}
 2981 
 2982 // ============================================================================
 2983 
 2984 instruct addF_reg(regF dst, regF src) %{
 2985   predicate((UseSSE>=1) && (UseAVX == 0));
 2986   match(Set dst (AddF dst src));
 2987 
 2988   format %{ "addss   $dst, $src" %}
 2989   ins_cost(150);
 2990   ins_encode %{
 2991     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2992   %}
 2993   ins_pipe(pipe_slow);
 2994 %}
 2995 
 2996 instruct addF_mem(regF dst, memory src) %{
 2997   predicate((UseSSE>=1) && (UseAVX == 0));
 2998   match(Set dst (AddF dst (LoadF src)));
 2999 
 3000   format %{ "addss   $dst, $src" %}
 3001   ins_cost(150);
 3002   ins_encode %{
 3003     __ addss($dst$$XMMRegister, $src$$Address);
 3004   %}
 3005   ins_pipe(pipe_slow);
 3006 %}
 3007 
 3008 instruct addF_imm(regF dst, immF con) %{
 3009   predicate((UseSSE>=1) && (UseAVX == 0));
 3010   match(Set dst (AddF dst con));
 3011   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3012   ins_cost(150);
 3013   ins_encode %{
 3014     __ addss($dst$$XMMRegister, $constantaddress($con));
 3015   %}
 3016   ins_pipe(pipe_slow);
 3017 %}
 3018 
 3019 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3020   predicate(UseAVX > 0);
 3021   match(Set dst (AddF src1 src2));
 3022 
 3023   format %{ "vaddss  $dst, $src1, $src2" %}
 3024   ins_cost(150);
 3025   ins_encode %{
 3026     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3027   %}
 3028   ins_pipe(pipe_slow);
 3029 %}
 3030 
 3031 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3032   predicate(UseAVX > 0);
 3033   match(Set dst (AddF src1 (LoadF src2)));
 3034 
 3035   format %{ "vaddss  $dst, $src1, $src2" %}
 3036   ins_cost(150);
 3037   ins_encode %{
 3038     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3039   %}
 3040   ins_pipe(pipe_slow);
 3041 %}
 3042 
 3043 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3044   predicate(UseAVX > 0);
 3045   match(Set dst (AddF src con));
 3046 
 3047   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3048   ins_cost(150);
 3049   ins_encode %{
 3050     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3051   %}
 3052   ins_pipe(pipe_slow);
 3053 %}
 3054 
 3055 instruct addD_reg(regD dst, regD src) %{
 3056   predicate((UseSSE>=2) && (UseAVX == 0));
 3057   match(Set dst (AddD dst src));
 3058 
 3059   format %{ "addsd   $dst, $src" %}
 3060   ins_cost(150);
 3061   ins_encode %{
 3062     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3063   %}
 3064   ins_pipe(pipe_slow);
 3065 %}
 3066 
 3067 instruct addD_mem(regD dst, memory src) %{
 3068   predicate((UseSSE>=2) && (UseAVX == 0));
 3069   match(Set dst (AddD dst (LoadD src)));
 3070 
 3071   format %{ "addsd   $dst, $src" %}
 3072   ins_cost(150);
 3073   ins_encode %{
 3074     __ addsd($dst$$XMMRegister, $src$$Address);
 3075   %}
 3076   ins_pipe(pipe_slow);
 3077 %}
 3078 
 3079 instruct addD_imm(regD dst, immD con) %{
 3080   predicate((UseSSE>=2) && (UseAVX == 0));
 3081   match(Set dst (AddD dst con));
 3082   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3083   ins_cost(150);
 3084   ins_encode %{
 3085     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3086   %}
 3087   ins_pipe(pipe_slow);
 3088 %}
 3089 
 3090 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3091   predicate(UseAVX > 0);
 3092   match(Set dst (AddD src1 src2));
 3093 
 3094   format %{ "vaddsd  $dst, $src1, $src2" %}
 3095   ins_cost(150);
 3096   ins_encode %{
 3097     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3098   %}
 3099   ins_pipe(pipe_slow);
 3100 %}
 3101 
 3102 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3103   predicate(UseAVX > 0);
 3104   match(Set dst (AddD src1 (LoadD src2)));
 3105 
 3106   format %{ "vaddsd  $dst, $src1, $src2" %}
 3107   ins_cost(150);
 3108   ins_encode %{
 3109     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3110   %}
 3111   ins_pipe(pipe_slow);
 3112 %}
 3113 
 3114 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3115   predicate(UseAVX > 0);
 3116   match(Set dst (AddD src con));
 3117 
 3118   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3119   ins_cost(150);
 3120   ins_encode %{
 3121     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3122   %}
 3123   ins_pipe(pipe_slow);
 3124 %}
 3125 
 3126 instruct subF_reg(regF dst, regF src) %{
 3127   predicate((UseSSE>=1) && (UseAVX == 0));
 3128   match(Set dst (SubF dst src));
 3129 
 3130   format %{ "subss   $dst, $src" %}
 3131   ins_cost(150);
 3132   ins_encode %{
 3133     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3134   %}
 3135   ins_pipe(pipe_slow);
 3136 %}
 3137 
 3138 instruct subF_mem(regF dst, memory src) %{
 3139   predicate((UseSSE>=1) && (UseAVX == 0));
 3140   match(Set dst (SubF dst (LoadF src)));
 3141 
 3142   format %{ "subss   $dst, $src" %}
 3143   ins_cost(150);
 3144   ins_encode %{
 3145     __ subss($dst$$XMMRegister, $src$$Address);
 3146   %}
 3147   ins_pipe(pipe_slow);
 3148 %}
 3149 
 3150 instruct subF_imm(regF dst, immF con) %{
 3151   predicate((UseSSE>=1) && (UseAVX == 0));
 3152   match(Set dst (SubF dst con));
 3153   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3154   ins_cost(150);
 3155   ins_encode %{
 3156     __ subss($dst$$XMMRegister, $constantaddress($con));
 3157   %}
 3158   ins_pipe(pipe_slow);
 3159 %}
 3160 
 3161 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3162   predicate(UseAVX > 0);
 3163   match(Set dst (SubF src1 src2));
 3164 
 3165   format %{ "vsubss  $dst, $src1, $src2" %}
 3166   ins_cost(150);
 3167   ins_encode %{
 3168     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3169   %}
 3170   ins_pipe(pipe_slow);
 3171 %}
 3172 
 3173 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3174   predicate(UseAVX > 0);
 3175   match(Set dst (SubF src1 (LoadF src2)));
 3176 
 3177   format %{ "vsubss  $dst, $src1, $src2" %}
 3178   ins_cost(150);
 3179   ins_encode %{
 3180     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3181   %}
 3182   ins_pipe(pipe_slow);
 3183 %}
 3184 
 3185 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3186   predicate(UseAVX > 0);
 3187   match(Set dst (SubF src con));
 3188 
 3189   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3190   ins_cost(150);
 3191   ins_encode %{
 3192     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3193   %}
 3194   ins_pipe(pipe_slow);
 3195 %}
 3196 
 3197 instruct subD_reg(regD dst, regD src) %{
 3198   predicate((UseSSE>=2) && (UseAVX == 0));
 3199   match(Set dst (SubD dst src));
 3200 
 3201   format %{ "subsd   $dst, $src" %}
 3202   ins_cost(150);
 3203   ins_encode %{
 3204     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3205   %}
 3206   ins_pipe(pipe_slow);
 3207 %}
 3208 
 3209 instruct subD_mem(regD dst, memory src) %{
 3210   predicate((UseSSE>=2) && (UseAVX == 0));
 3211   match(Set dst (SubD dst (LoadD src)));
 3212 
 3213   format %{ "subsd   $dst, $src" %}
 3214   ins_cost(150);
 3215   ins_encode %{
 3216     __ subsd($dst$$XMMRegister, $src$$Address);
 3217   %}
 3218   ins_pipe(pipe_slow);
 3219 %}
 3220 
 3221 instruct subD_imm(regD dst, immD con) %{
 3222   predicate((UseSSE>=2) && (UseAVX == 0));
 3223   match(Set dst (SubD dst con));
 3224   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3225   ins_cost(150);
 3226   ins_encode %{
 3227     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3228   %}
 3229   ins_pipe(pipe_slow);
 3230 %}
 3231 
 3232 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3233   predicate(UseAVX > 0);
 3234   match(Set dst (SubD src1 src2));
 3235 
 3236   format %{ "vsubsd  $dst, $src1, $src2" %}
 3237   ins_cost(150);
 3238   ins_encode %{
 3239     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3240   %}
 3241   ins_pipe(pipe_slow);
 3242 %}
 3243 
 3244 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3245   predicate(UseAVX > 0);
 3246   match(Set dst (SubD src1 (LoadD src2)));
 3247 
 3248   format %{ "vsubsd  $dst, $src1, $src2" %}
 3249   ins_cost(150);
 3250   ins_encode %{
 3251     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3252   %}
 3253   ins_pipe(pipe_slow);
 3254 %}
 3255 
 3256 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3257   predicate(UseAVX > 0);
 3258   match(Set dst (SubD src con));
 3259 
 3260   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3261   ins_cost(150);
 3262   ins_encode %{
 3263     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3264   %}
 3265   ins_pipe(pipe_slow);
 3266 %}
 3267 
 3268 instruct mulF_reg(regF dst, regF src) %{
 3269   predicate((UseSSE>=1) && (UseAVX == 0));
 3270   match(Set dst (MulF dst src));
 3271 
 3272   format %{ "mulss   $dst, $src" %}
 3273   ins_cost(150);
 3274   ins_encode %{
 3275     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3276   %}
 3277   ins_pipe(pipe_slow);
 3278 %}
 3279 
 3280 instruct mulF_mem(regF dst, memory src) %{
 3281   predicate((UseSSE>=1) && (UseAVX == 0));
 3282   match(Set dst (MulF dst (LoadF src)));
 3283 
 3284   format %{ "mulss   $dst, $src" %}
 3285   ins_cost(150);
 3286   ins_encode %{
 3287     __ mulss($dst$$XMMRegister, $src$$Address);
 3288   %}
 3289   ins_pipe(pipe_slow);
 3290 %}
 3291 
 3292 instruct mulF_imm(regF dst, immF con) %{
 3293   predicate((UseSSE>=1) && (UseAVX == 0));
 3294   match(Set dst (MulF dst con));
 3295   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3296   ins_cost(150);
 3297   ins_encode %{
 3298     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3299   %}
 3300   ins_pipe(pipe_slow);
 3301 %}
 3302 
 3303 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3304   predicate(UseAVX > 0);
 3305   match(Set dst (MulF src1 src2));
 3306 
 3307   format %{ "vmulss  $dst, $src1, $src2" %}
 3308   ins_cost(150);
 3309   ins_encode %{
 3310     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3311   %}
 3312   ins_pipe(pipe_slow);
 3313 %}
 3314 
 3315 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3316   predicate(UseAVX > 0);
 3317   match(Set dst (MulF src1 (LoadF src2)));
 3318 
 3319   format %{ "vmulss  $dst, $src1, $src2" %}
 3320   ins_cost(150);
 3321   ins_encode %{
 3322     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3323   %}
 3324   ins_pipe(pipe_slow);
 3325 %}
 3326 
 3327 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3328   predicate(UseAVX > 0);
 3329   match(Set dst (MulF src con));
 3330 
 3331   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3332   ins_cost(150);
 3333   ins_encode %{
 3334     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3335   %}
 3336   ins_pipe(pipe_slow);
 3337 %}
 3338 
 3339 instruct mulD_reg(regD dst, regD src) %{
 3340   predicate((UseSSE>=2) && (UseAVX == 0));
 3341   match(Set dst (MulD dst src));
 3342 
 3343   format %{ "mulsd   $dst, $src" %}
 3344   ins_cost(150);
 3345   ins_encode %{
 3346     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3347   %}
 3348   ins_pipe(pipe_slow);
 3349 %}
 3350 
 3351 instruct mulD_mem(regD dst, memory src) %{
 3352   predicate((UseSSE>=2) && (UseAVX == 0));
 3353   match(Set dst (MulD dst (LoadD src)));
 3354 
 3355   format %{ "mulsd   $dst, $src" %}
 3356   ins_cost(150);
 3357   ins_encode %{
 3358     __ mulsd($dst$$XMMRegister, $src$$Address);
 3359   %}
 3360   ins_pipe(pipe_slow);
 3361 %}
 3362 
 3363 instruct mulD_imm(regD dst, immD con) %{
 3364   predicate((UseSSE>=2) && (UseAVX == 0));
 3365   match(Set dst (MulD dst con));
 3366   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3367   ins_cost(150);
 3368   ins_encode %{
 3369     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3370   %}
 3371   ins_pipe(pipe_slow);
 3372 %}
 3373 
 3374 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3375   predicate(UseAVX > 0);
 3376   match(Set dst (MulD src1 src2));
 3377 
 3378   format %{ "vmulsd  $dst, $src1, $src2" %}
 3379   ins_cost(150);
 3380   ins_encode %{
 3381     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3382   %}
 3383   ins_pipe(pipe_slow);
 3384 %}
 3385 
 3386 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3387   predicate(UseAVX > 0);
 3388   match(Set dst (MulD src1 (LoadD src2)));
 3389 
 3390   format %{ "vmulsd  $dst, $src1, $src2" %}
 3391   ins_cost(150);
 3392   ins_encode %{
 3393     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3394   %}
 3395   ins_pipe(pipe_slow);
 3396 %}
 3397 
 3398 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3399   predicate(UseAVX > 0);
 3400   match(Set dst (MulD src con));
 3401 
 3402   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3403   ins_cost(150);
 3404   ins_encode %{
 3405     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3406   %}
 3407   ins_pipe(pipe_slow);
 3408 %}
 3409 
 3410 instruct divF_reg(regF dst, regF src) %{
 3411   predicate((UseSSE>=1) && (UseAVX == 0));
 3412   match(Set dst (DivF dst src));
 3413 
 3414   format %{ "divss   $dst, $src" %}
 3415   ins_cost(150);
 3416   ins_encode %{
 3417     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3418   %}
 3419   ins_pipe(pipe_slow);
 3420 %}
 3421 
 3422 instruct divF_mem(regF dst, memory src) %{
 3423   predicate((UseSSE>=1) && (UseAVX == 0));
 3424   match(Set dst (DivF dst (LoadF src)));
 3425 
 3426   format %{ "divss   $dst, $src" %}
 3427   ins_cost(150);
 3428   ins_encode %{
 3429     __ divss($dst$$XMMRegister, $src$$Address);
 3430   %}
 3431   ins_pipe(pipe_slow);
 3432 %}
 3433 
 3434 instruct divF_imm(regF dst, immF con) %{
 3435   predicate((UseSSE>=1) && (UseAVX == 0));
 3436   match(Set dst (DivF dst con));
 3437   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3438   ins_cost(150);
 3439   ins_encode %{
 3440     __ divss($dst$$XMMRegister, $constantaddress($con));
 3441   %}
 3442   ins_pipe(pipe_slow);
 3443 %}
 3444 
 3445 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3446   predicate(UseAVX > 0);
 3447   match(Set dst (DivF src1 src2));
 3448 
 3449   format %{ "vdivss  $dst, $src1, $src2" %}
 3450   ins_cost(150);
 3451   ins_encode %{
 3452     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3453   %}
 3454   ins_pipe(pipe_slow);
 3455 %}
 3456 
 3457 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3458   predicate(UseAVX > 0);
 3459   match(Set dst (DivF src1 (LoadF src2)));
 3460 
 3461   format %{ "vdivss  $dst, $src1, $src2" %}
 3462   ins_cost(150);
 3463   ins_encode %{
 3464     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3465   %}
 3466   ins_pipe(pipe_slow);
 3467 %}
 3468 
 3469 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3470   predicate(UseAVX > 0);
 3471   match(Set dst (DivF src con));
 3472 
 3473   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3474   ins_cost(150);
 3475   ins_encode %{
 3476     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3477   %}
 3478   ins_pipe(pipe_slow);
 3479 %}
 3480 
 3481 instruct divD_reg(regD dst, regD src) %{
 3482   predicate((UseSSE>=2) && (UseAVX == 0));
 3483   match(Set dst (DivD dst src));
 3484 
 3485   format %{ "divsd   $dst, $src" %}
 3486   ins_cost(150);
 3487   ins_encode %{
 3488     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3489   %}
 3490   ins_pipe(pipe_slow);
 3491 %}
 3492 
 3493 instruct divD_mem(regD dst, memory src) %{
 3494   predicate((UseSSE>=2) && (UseAVX == 0));
 3495   match(Set dst (DivD dst (LoadD src)));
 3496 
 3497   format %{ "divsd   $dst, $src" %}
 3498   ins_cost(150);
 3499   ins_encode %{
 3500     __ divsd($dst$$XMMRegister, $src$$Address);
 3501   %}
 3502   ins_pipe(pipe_slow);
 3503 %}
 3504 
 3505 instruct divD_imm(regD dst, immD con) %{
 3506   predicate((UseSSE>=2) && (UseAVX == 0));
 3507   match(Set dst (DivD dst con));
 3508   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3509   ins_cost(150);
 3510   ins_encode %{
 3511     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3512   %}
 3513   ins_pipe(pipe_slow);
 3514 %}
 3515 
 3516 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3517   predicate(UseAVX > 0);
 3518   match(Set dst (DivD src1 src2));
 3519 
 3520   format %{ "vdivsd  $dst, $src1, $src2" %}
 3521   ins_cost(150);
 3522   ins_encode %{
 3523     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3524   %}
 3525   ins_pipe(pipe_slow);
 3526 %}
 3527 
 3528 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3529   predicate(UseAVX > 0);
 3530   match(Set dst (DivD src1 (LoadD src2)));
 3531 
 3532   format %{ "vdivsd  $dst, $src1, $src2" %}
 3533   ins_cost(150);
 3534   ins_encode %{
 3535     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3536   %}
 3537   ins_pipe(pipe_slow);
 3538 %}
 3539 
 3540 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3541   predicate(UseAVX > 0);
 3542   match(Set dst (DivD src con));
 3543 
 3544   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3545   ins_cost(150);
 3546   ins_encode %{
 3547     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3548   %}
 3549   ins_pipe(pipe_slow);
 3550 %}
 3551 
 3552 instruct absF_reg(regF dst) %{
 3553   predicate((UseSSE>=1) && (UseAVX == 0));
 3554   match(Set dst (AbsF dst));
 3555   ins_cost(150);
 3556   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3557   ins_encode %{
 3558     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3559   %}
 3560   ins_pipe(pipe_slow);
 3561 %}
 3562 
 3563 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3564   predicate(UseAVX > 0);
 3565   match(Set dst (AbsF src));
 3566   ins_cost(150);
 3567   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3568   ins_encode %{
 3569     int vlen_enc = Assembler::AVX_128bit;
 3570     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3571               ExternalAddress(float_signmask()), vlen_enc);
 3572   %}
 3573   ins_pipe(pipe_slow);
 3574 %}
 3575 
 3576 instruct absD_reg(regD dst) %{
 3577   predicate((UseSSE>=2) && (UseAVX == 0));
 3578   match(Set dst (AbsD dst));
 3579   ins_cost(150);
 3580   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3581             "# abs double by sign masking" %}
 3582   ins_encode %{
 3583     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3584   %}
 3585   ins_pipe(pipe_slow);
 3586 %}
 3587 
 3588 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3589   predicate(UseAVX > 0);
 3590   match(Set dst (AbsD src));
 3591   ins_cost(150);
 3592   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3593             "# abs double by sign masking" %}
 3594   ins_encode %{
 3595     int vlen_enc = Assembler::AVX_128bit;
 3596     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3597               ExternalAddress(double_signmask()), vlen_enc);
 3598   %}
 3599   ins_pipe(pipe_slow);
 3600 %}
 3601 
 3602 instruct negF_reg(regF dst) %{
 3603   predicate((UseSSE>=1) && (UseAVX == 0));
 3604   match(Set dst (NegF dst));
 3605   ins_cost(150);
 3606   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3607   ins_encode %{
 3608     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3609   %}
 3610   ins_pipe(pipe_slow);
 3611 %}
 3612 
 3613 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3614   predicate(UseAVX > 0);
 3615   match(Set dst (NegF src));
 3616   ins_cost(150);
 3617   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3618   ins_encode %{
 3619     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3620                  ExternalAddress(float_signflip()));
 3621   %}
 3622   ins_pipe(pipe_slow);
 3623 %}
 3624 
 3625 instruct negD_reg(regD dst) %{
 3626   predicate((UseSSE>=2) && (UseAVX == 0));
 3627   match(Set dst (NegD dst));
 3628   ins_cost(150);
 3629   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3630             "# neg double by sign flipping" %}
 3631   ins_encode %{
 3632     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3633   %}
 3634   ins_pipe(pipe_slow);
 3635 %}
 3636 
 3637 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3638   predicate(UseAVX > 0);
 3639   match(Set dst (NegD src));
 3640   ins_cost(150);
 3641   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3642             "# neg double by sign flipping" %}
 3643   ins_encode %{
 3644     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3645                  ExternalAddress(double_signflip()));
 3646   %}
 3647   ins_pipe(pipe_slow);
 3648 %}
 3649 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3652 instruct sqrtF_reg(regF dst) %{
 3653   predicate(UseSSE>=1);
 3654   match(Set dst (SqrtF dst));
 3655   format %{ "sqrtss  $dst, $dst" %}
 3656   ins_encode %{
 3657     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3658   %}
 3659   ins_pipe(pipe_slow);
 3660 %}
 3661 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3664 instruct sqrtD_reg(regD dst) %{
 3665   predicate(UseSSE>=2);
 3666   match(Set dst (SqrtD dst));
 3667   format %{ "sqrtsd  $dst, $dst" %}
 3668   ins_encode %{
 3669     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3670   %}
 3671   ins_pipe(pipe_slow);
 3672 %}
 3673 
 3674 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3675   effect(TEMP tmp);
 3676   match(Set dst (ConvF2HF src));
 3677   ins_cost(125);
 3678   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3679   ins_encode %{
 3680     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3681   %}
 3682   ins_pipe( pipe_slow );
 3683 %}
 3684 
 3685 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3686   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3687   effect(TEMP ktmp, TEMP rtmp);
 3688   match(Set mem (StoreC mem (ConvF2HF src)));
 3689   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3690   ins_encode %{
 3691     __ movl($rtmp$$Register, 0x1);
 3692     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3693     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3694   %}
 3695   ins_pipe( pipe_slow );
 3696 %}
 3697 
 3698 instruct vconvF2HF(vec dst, vec src) %{
 3699   match(Set dst (VectorCastF2HF src));
 3700   format %{ "vector_conv_F2HF $dst $src" %}
 3701   ins_encode %{
 3702     int vlen_enc = vector_length_encoding(this, $src);
 3703     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3704   %}
 3705   ins_pipe( pipe_slow );
 3706 %}
 3707 
 3708 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3709   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3710   format %{ "vcvtps2ph $mem,$src" %}
 3711   ins_encode %{
 3712     int vlen_enc = vector_length_encoding(this, $src);
 3713     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3714   %}
 3715   ins_pipe( pipe_slow );
 3716 %}
 3717 
 3718 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3719   match(Set dst (ConvHF2F src));
 3720   format %{ "vcvtph2ps $dst,$src" %}
 3721   ins_encode %{
 3722     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3723   %}
 3724   ins_pipe( pipe_slow );
 3725 %}
 3726 
 3727 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3728   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3729   format %{ "vcvtph2ps $dst,$mem" %}
 3730   ins_encode %{
 3731     int vlen_enc = vector_length_encoding(this);
 3732     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3733   %}
 3734   ins_pipe( pipe_slow );
 3735 %}
 3736 
 3737 instruct vconvHF2F(vec dst, vec src) %{
 3738   match(Set dst (VectorCastHF2F src));
 3739   ins_cost(125);
 3740   format %{ "vector_conv_HF2F $dst,$src" %}
 3741   ins_encode %{
 3742     int vlen_enc = vector_length_encoding(this);
 3743     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3744   %}
 3745   ins_pipe( pipe_slow );
 3746 %}
 3747 
 3748 // ---------------------------------------- VectorReinterpret ------------------------------------
 3749 instruct reinterpret_mask(kReg dst) %{
 3750   predicate(n->bottom_type()->isa_vectmask() &&
 3751             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3752   match(Set dst (VectorReinterpret dst));
 3753   ins_cost(125);
 3754   format %{ "vector_reinterpret $dst\t!" %}
 3755   ins_encode %{
 3756     // empty
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3762   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3763             n->bottom_type()->isa_vectmask() &&
 3764             n->in(1)->bottom_type()->isa_vectmask() &&
 3765             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3766             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3767   match(Set dst (VectorReinterpret src));
 3768   effect(TEMP xtmp);
 3769   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3770   ins_encode %{
 3771      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3772      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3773      assert(src_sz == dst_sz , "src and dst size mismatch");
 3774      int vlen_enc = vector_length_encoding(src_sz);
 3775      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3776      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3777   %}
 3778   ins_pipe( pipe_slow );
 3779 %}
 3780 
 3781 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3782   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3783             n->bottom_type()->isa_vectmask() &&
 3784             n->in(1)->bottom_type()->isa_vectmask() &&
 3785             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3786              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3787             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3788   match(Set dst (VectorReinterpret src));
 3789   effect(TEMP xtmp);
 3790   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3791   ins_encode %{
 3792      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3793      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3794      assert(src_sz == dst_sz , "src and dst size mismatch");
 3795      int vlen_enc = vector_length_encoding(src_sz);
 3796      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3797      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3798   %}
 3799   ins_pipe( pipe_slow );
 3800 %}
 3801 
 3802 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3803   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3804             n->bottom_type()->isa_vectmask() &&
 3805             n->in(1)->bottom_type()->isa_vectmask() &&
 3806             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3807              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3808             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3809   match(Set dst (VectorReinterpret src));
 3810   effect(TEMP xtmp);
 3811   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3812   ins_encode %{
 3813     int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3814     int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3815     assert(src_sz == dst_sz, "src and dst size mismatch");
 3816     int vlen_enc = vector_length_encoding(src_sz);
 3817     __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3818     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3819   %}
 3820   ins_pipe( pipe_slow );
 3821 %}
 3822 
 3823 instruct reinterpret(vec dst) %{
 3824   predicate(!n->bottom_type()->isa_vectmask() &&
 3825             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3826   match(Set dst (VectorReinterpret dst));
 3827   ins_cost(125);
 3828   format %{ "vector_reinterpret $dst\t!" %}
 3829   ins_encode %{
 3830     // empty
 3831   %}
 3832   ins_pipe( pipe_slow );
 3833 %}
 3834 
 3835 instruct reinterpret_expand(vec dst, vec src) %{
 3836   predicate(UseAVX == 0 &&
 3837             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3838   match(Set dst (VectorReinterpret src));
 3839   ins_cost(125);
 3840   effect(TEMP dst);
 3841   format %{ "vector_reinterpret_expand $dst,$src" %}
 3842   ins_encode %{
 3843     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3844     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3845 
 3846     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3847     if (src_vlen_in_bytes == 4) {
 3848       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3849     } else {
 3850       assert(src_vlen_in_bytes == 8, "");
 3851       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3852     }
 3853     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3854   %}
 3855   ins_pipe( pipe_slow );
 3856 %}
 3857 
 3858 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3859   predicate(UseAVX > 0 &&
 3860             !n->bottom_type()->isa_vectmask() &&
 3861             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3862             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3863   match(Set dst (VectorReinterpret src));
 3864   ins_cost(125);
 3865   format %{ "vector_reinterpret_expand $dst,$src" %}
 3866   ins_encode %{
 3867     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3868   %}
 3869   ins_pipe( pipe_slow );
 3870 %}
 3871 
 3872 
 3873 instruct vreinterpret_expand(legVec dst, vec src) %{
 3874   predicate(UseAVX > 0 &&
 3875             !n->bottom_type()->isa_vectmask() &&
 3876             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3877             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3878   match(Set dst (VectorReinterpret src));
 3879   ins_cost(125);
 3880   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3881   ins_encode %{
 3882     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3883       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3884       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3885       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3886       default: ShouldNotReachHere();
 3887     }
 3888   %}
 3889   ins_pipe( pipe_slow );
 3890 %}
 3891 
 3892 instruct reinterpret_shrink(vec dst, legVec src) %{
 3893   predicate(!n->bottom_type()->isa_vectmask() &&
 3894             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3895   match(Set dst (VectorReinterpret src));
 3896   ins_cost(125);
 3897   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3898   ins_encode %{
 3899     switch (Matcher::vector_length_in_bytes(this)) {
 3900       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3901       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3902       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3903       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3904       default: ShouldNotReachHere();
 3905     }
 3906   %}
 3907   ins_pipe( pipe_slow );
 3908 %}
 3909 
 3910 // ----------------------------------------------------------------------------------------------------
 3911 
 3912 #ifdef _LP64
 3913 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3914   match(Set dst (RoundDoubleMode src rmode));
 3915   format %{ "roundsd $dst,$src" %}
 3916   ins_cost(150);
 3917   ins_encode %{
 3918     assert(UseSSE >= 4, "required");
 3919     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3920   %}
 3921   ins_pipe(pipe_slow);
 3922 %}
 3923 
 3924 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3925   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3926   format %{ "roundsd $dst,$src" %}
 3927   ins_cost(150);
 3928   ins_encode %{
 3929     assert(UseSSE >= 4, "required");
 3930     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3931   %}
 3932   ins_pipe(pipe_slow);
 3933 %}
 3934 
 3935 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3936   match(Set dst (RoundDoubleMode con rmode));
 3937   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3938   ins_cost(150);
 3939   ins_encode %{
 3940     assert(UseSSE >= 4, "required");
 3941     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3942   %}
 3943   ins_pipe(pipe_slow);
 3944 %}
 3945 
 3946 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3947   predicate(Matcher::vector_length(n) < 8);
 3948   match(Set dst (RoundDoubleModeV src rmode));
 3949   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3950   ins_encode %{
 3951     assert(UseAVX > 0, "required");
 3952     int vlen_enc = vector_length_encoding(this);
 3953     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3954   %}
 3955   ins_pipe( pipe_slow );
 3956 %}
 3957 
 3958 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3959   predicate(Matcher::vector_length(n) == 8);
 3960   match(Set dst (RoundDoubleModeV src rmode));
 3961   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3962   ins_encode %{
 3963     assert(UseAVX > 2, "required");
 3964     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3965   %}
 3966   ins_pipe( pipe_slow );
 3967 %}
 3968 
 3969 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3970   predicate(Matcher::vector_length(n) < 8);
 3971   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3972   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3973   ins_encode %{
 3974     assert(UseAVX > 0, "required");
 3975     int vlen_enc = vector_length_encoding(this);
 3976     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3977   %}
 3978   ins_pipe( pipe_slow );
 3979 %}
 3980 
 3981 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3982   predicate(Matcher::vector_length(n) == 8);
 3983   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3984   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3985   ins_encode %{
 3986     assert(UseAVX > 2, "required");
 3987     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3988   %}
 3989   ins_pipe( pipe_slow );
 3990 %}
 3991 #endif // _LP64
 3992 
 3993 instruct onspinwait() %{
 3994   match(OnSpinWait);
 3995   ins_cost(200);
 3996 
 3997   format %{
 3998     $$template
 3999     $$emit$$"pause\t! membar_onspinwait"
 4000   %}
 4001   ins_encode %{
 4002     __ pause();
 4003   %}
 4004   ins_pipe(pipe_slow);
 4005 %}
 4006 
 4007 // a * b + c
 4008 instruct fmaD_reg(regD a, regD b, regD c) %{
 4009   match(Set c (FmaD  c (Binary a b)));
 4010   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4011   ins_cost(150);
 4012   ins_encode %{
 4013     assert(UseFMA, "Needs FMA instruction support.");
 4014     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4015   %}
 4016   ins_pipe( pipe_slow );
 4017 %}
 4018 
 4019 // a * b + c
 4020 instruct fmaF_reg(regF a, regF b, regF c) %{
 4021   match(Set c (FmaF  c (Binary a b)));
 4022   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4023   ins_cost(150);
 4024   ins_encode %{
 4025     assert(UseFMA, "Needs FMA instruction support.");
 4026     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4027   %}
 4028   ins_pipe( pipe_slow );
 4029 %}
 4030 
 4031 // ====================VECTOR INSTRUCTIONS=====================================
 4032 
 4033 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4034 instruct MoveVec2Leg(legVec dst, vec src) %{
 4035   match(Set dst src);
 4036   format %{ "" %}
 4037   ins_encode %{
 4038     ShouldNotReachHere();
 4039   %}
 4040   ins_pipe( fpu_reg_reg );
 4041 %}
 4042 
 4043 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4044   match(Set dst src);
 4045   format %{ "" %}
 4046   ins_encode %{
 4047     ShouldNotReachHere();
 4048   %}
 4049   ins_pipe( fpu_reg_reg );
 4050 %}
 4051 
 4052 // ============================================================================
 4053 
 4054 // Load vectors generic operand pattern
 4055 instruct loadV(vec dst, memory mem) %{
 4056   match(Set dst (LoadVector mem));
 4057   ins_cost(125);
 4058   format %{ "load_vector $dst,$mem" %}
 4059   ins_encode %{
 4060     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4061   %}
 4062   ins_pipe( pipe_slow );
 4063 %}
 4064 
 4065 // Store vectors generic operand pattern.
 4066 instruct storeV(memory mem, vec src) %{
 4067   match(Set mem (StoreVector mem src));
 4068   ins_cost(145);
 4069   format %{ "store_vector $mem,$src" %}
 4070   ins_encode %{
 4071     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4072       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4073       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4074       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4075       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4076       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4077       default: ShouldNotReachHere();
 4078     }
 4079   %}
 4080   ins_pipe( pipe_slow );
 4081 %}
 4082 
 4083 // ---------------------------------------- Gather ------------------------------------
 4084 
 4085 // Gather INT, LONG, FLOAT, DOUBLE
 4086 
 4087 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4088   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4089   match(Set dst (LoadVectorGather mem idx));
 4090   effect(TEMP dst, TEMP tmp, TEMP mask);
 4091   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4092   ins_encode %{
 4093     assert(UseAVX >= 2, "sanity");
 4094 
 4095     int vlen_enc = vector_length_encoding(this);
 4096     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4097 
 4098     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4099     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4100 
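          // AVX2 vpgather* uses a vector mask whose per-element sign bit enables each
          // load and which the instruction clears as it completes, so materialize a
          // fresh all-ones mask before each gather.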
 4101     if (vlen_enc == Assembler::AVX_128bit) {
 4102       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4103     } else {
 4104       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4105     }
 4106     __ lea($tmp$$Register, $mem$$Address);
 4107     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4108   %}
 4109   ins_pipe( pipe_slow );
 4110 %}
 4111 
 4112 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4113   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4114   match(Set dst (LoadVectorGather mem idx));
 4115   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4116   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4117   ins_encode %{
 4118     assert(UseAVX > 2, "sanity");
 4119 
 4120     int vlen_enc = vector_length_encoding(this);
 4121     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4122 
 4123     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4124 
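          // Unmasked gather: load an all-true opmask so every element is fetched.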
 4125     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4126     __ lea($tmp$$Register, $mem$$Address);
 4127     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4128   %}
 4129   ins_pipe( pipe_slow );
 4130 %}
 4131 
 4132 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4133   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4134   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4135   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4136   ins_encode %{
 4137     assert(UseAVX > 2, "sanity");
 4138     int vlen_enc = vector_length_encoding(this);
 4139     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4140     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4141     // Note: The gather instruction partially updates the opmask register used for
 4142     // predication, hence the mask operand is copied to a temporary register first.
 4143     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4144     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4145     __ lea($tmp$$Register, $mem$$Address);
 4146     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4147   %}
 4148   ins_pipe( pipe_slow );
 4149 %}
 4150 // ====================Scatter=======================================
 4151 
 4152 // Scatter INT, LONG, FLOAT, DOUBLE
 4153 
 4154 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4155   predicate(UseAVX > 2);
 4156   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4157   effect(TEMP tmp, TEMP ktmp);
 4158   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4159   ins_encode %{
 4160     int vlen_enc = vector_length_encoding(this, $src);
 4161     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4162 
 4163     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4164     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4165 
 4166     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4167     __ lea($tmp$$Register, $mem$$Address);
 4168     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4169   %}
 4170   ins_pipe( pipe_slow );
 4171 %}
 4172 
 4173 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4174   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4175   effect(TEMP tmp, TEMP ktmp);
 4176   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4177   ins_encode %{
 4178     int vlen_enc = vector_length_encoding(this, $src);
 4179     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4180     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4181     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4182     // Note: The scatter instruction partially updates the opmask register used for
 4183     // predication, hence the mask operand is copied to a temporary register first.
 4184     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4185     __ lea($tmp$$Register, $mem$$Address);
 4186     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4187   %}
 4188   ins_pipe( pipe_slow );
 4189 %}
 4190 
 4191 // ====================REPLICATE=======================================
 4192 
 4193 // Replicate byte scalar to be vector
 4194 instruct vReplB_reg(vec dst, rRegI src) %{
 4195   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4196   match(Set dst (Replicate src));
 4197   format %{ "replicateB $dst,$src" %}
 4198   ins_encode %{
 4199     uint vlen = Matcher::vector_length(this);
 4200     if (UseAVX >= 2) {
 4201       int vlen_enc = vector_length_encoding(this);
 4202       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4203         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4204         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4205       } else {
 4206         __ movdl($dst$$XMMRegister, $src$$Register);
 4207         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4208       }
 4209     } else {
 4210       assert(UseAVX < 2, "");
 4211       __ movdl($dst$$XMMRegister, $src$$Register);
 4212       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4213       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4214       if (vlen >= 16) {
 4215         assert(vlen == 16, "");
 4216         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4217       }
 4218     }
 4219   %}
 4220   ins_pipe( pipe_slow );
 4221 %}
 4222 
 4223 instruct ReplB_mem(vec dst, memory mem) %{
 4224   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4225   match(Set dst (Replicate (LoadB mem)));
 4226   format %{ "replicateB $dst,$mem" %}
 4227   ins_encode %{
 4228     int vlen_enc = vector_length_encoding(this);
 4229     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4230   %}
 4231   ins_pipe( pipe_slow );
 4232 %}
 4233 
 4234 // ====================ReplicateS=======================================
 4235 
 4236 instruct vReplS_reg(vec dst, rRegI src) %{
 4237   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4238   match(Set dst (Replicate src));
 4239   format %{ "replicateS $dst,$src" %}
 4240   ins_encode %{
 4241     uint vlen = Matcher::vector_length(this);
 4242     int vlen_enc = vector_length_encoding(this);
 4243     if (UseAVX >= 2) {
 4244       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4245         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4246         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4247       } else {
 4248         __ movdl($dst$$XMMRegister, $src$$Register);
 4249         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4250       }
 4251     } else {
 4252       assert(UseAVX < 2, "");
 4253       __ movdl($dst$$XMMRegister, $src$$Register);
 4254       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4255       if (vlen >= 8) {
 4256         assert(vlen == 8, "");
 4257         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4258       }
 4259     }
 4260   %}
 4261   ins_pipe( pipe_slow );
 4262 %}
 4263 
 4264 instruct ReplS_mem(vec dst, memory mem) %{
 4265   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4266   match(Set dst (Replicate (LoadS mem)));
 4267   format %{ "replicateS $dst,$mem" %}
 4268   ins_encode %{
 4269     int vlen_enc = vector_length_encoding(this);
 4270     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4271   %}
 4272   ins_pipe( pipe_slow );
 4273 %}
 4274 
 4275 // ====================ReplicateI=======================================
 4276 
 4277 instruct ReplI_reg(vec dst, rRegI src) %{
 4278   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4279   match(Set dst (Replicate src));
 4280   format %{ "replicateI $dst,$src" %}
 4281   ins_encode %{
 4282     uint vlen = Matcher::vector_length(this);
 4283     int vlen_enc = vector_length_encoding(this);
 4284     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4285       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4286     } else if (VM_Version::supports_avx2()) {
 4287       __ movdl($dst$$XMMRegister, $src$$Register);
 4288       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4289     } else {
 4290       __ movdl($dst$$XMMRegister, $src$$Register);
 4291       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4292     }
 4293   %}
 4294   ins_pipe( pipe_slow );
 4295 %}
 4296 
 4297 instruct ReplI_mem(vec dst, memory mem) %{
 4298   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4299   match(Set dst (Replicate (LoadI mem)));
 4300   format %{ "replicateI $dst,$mem" %}
 4301   ins_encode %{
 4302     int vlen_enc = vector_length_encoding(this);
 4303     if (VM_Version::supports_avx2()) {
 4304       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4305     } else if (VM_Version::supports_avx()) {
 4306       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4307     } else {
 4308       __ movdl($dst$$XMMRegister, $mem$$Address);
 4309       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4310     }
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct ReplI_imm(vec dst, immI con) %{
 4316   predicate(Matcher::is_non_long_integral_vector(n));
 4317   match(Set dst (Replicate con));
 4318   format %{ "replicateI $dst,$con" %}
 4319   ins_encode %{
 4320     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4321         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4322             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4323                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4324     BasicType bt = Matcher::vector_element_basic_type(this);
 4325     int vlen = Matcher::vector_length_in_bytes(this);
 4326     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4327   %}
 4328   ins_pipe( pipe_slow );
 4329 %}
 4330 
 4331 // Replicate scalar zero to be vector
 4332 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4333   predicate(Matcher::is_non_long_integral_vector(n));
 4334   match(Set dst (Replicate zero));
 4335   format %{ "replicateI $dst,$zero" %}
 4336   ins_encode %{
 4337     int vlen_enc = vector_length_encoding(this);
 4338     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4339       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4340     } else {
 4341       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4342     }
 4343   %}
 4344   ins_pipe( fpu_reg_reg );
 4345 %}
 4346 
 4347 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4348   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4349   match(Set dst (Replicate con));
 4350   format %{ "vallones $dst" %}
 4351   ins_encode %{
 4352     int vector_len = vector_length_encoding(this);
 4353     __ vallones($dst$$XMMRegister, vector_len);
 4354   %}
 4355   ins_pipe( pipe_slow );
 4356 %}
 4357 
 4358 // ====================ReplicateL=======================================
 4359 
 4360 #ifdef _LP64
 4361 // Replicate long (8 byte) scalar to be vector
 4362 instruct ReplL_reg(vec dst, rRegL src) %{
 4363   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4364   match(Set dst (Replicate src));
 4365   format %{ "replicateL $dst,$src" %}
 4366   ins_encode %{
 4367     int vlen = Matcher::vector_length(this);
 4368     int vlen_enc = vector_length_encoding(this);
 4369     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4370       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4371     } else if (VM_Version::supports_avx2()) {
 4372       __ movdq($dst$$XMMRegister, $src$$Register);
 4373       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4374     } else {
 4375       __ movdq($dst$$XMMRegister, $src$$Register);
 4376       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4377     }
 4378   %}
 4379   ins_pipe( pipe_slow );
 4380 %}
 4381 #else // _LP64
 4382 // Replicate long (8 byte) scalar to be vector
 4383 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4384   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4385   match(Set dst (Replicate src));
 4386   effect(TEMP dst, USE src, TEMP tmp);
 4387   format %{ "replicateL $dst,$src" %}
 4388   ins_encode %{
 4389     uint vlen = Matcher::vector_length(this);
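          // On 32-bit, the long value lives in a register pair: move the low and high
          // halves into XMM registers and interleave them (punpckldq) to reassemble the
          // 64-bit value before broadcasting it.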
 4390     if (vlen == 2) {
 4391       __ movdl($dst$$XMMRegister, $src$$Register);
 4392       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4393       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4394       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4395     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4396       int vlen_enc = Assembler::AVX_256bit;
 4397       __ movdl($dst$$XMMRegister, $src$$Register);
 4398       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4399       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4400       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4401     } else {
 4402       __ movdl($dst$$XMMRegister, $src$$Register);
 4403       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4404       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4405       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4406       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4407     }
 4408   %}
 4409   ins_pipe( pipe_slow );
 4410 %}
 4411 
 4412 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4413   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4414   match(Set dst (Replicate src));
 4415   effect(TEMP dst, USE src, TEMP tmp);
 4416   format %{ "replicateL $dst,$src" %}
 4417   ins_encode %{
 4418     if (VM_Version::supports_avx512vl()) {
 4419       __ movdl($dst$$XMMRegister, $src$$Register);
 4420       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4421       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4422       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4423       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4424       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4425     } else {
 4426       int vlen_enc = Assembler::AVX_512bit;
 4427       __ movdl($dst$$XMMRegister, $src$$Register);
 4428       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4429       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4430       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4431     }
 4432   %}
 4433   ins_pipe( pipe_slow );
 4434 %}
 4435 #endif // _LP64
 4436 
 4437 instruct ReplL_mem(vec dst, memory mem) %{
 4438   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4439   match(Set dst (Replicate (LoadL mem)));
 4440   format %{ "replicateL $dst,$mem" %}
 4441   ins_encode %{
 4442     int vlen_enc = vector_length_encoding(this);
 4443     if (VM_Version::supports_avx2()) {
 4444       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4445     } else if (VM_Version::supports_sse3()) {
 4446       __ movddup($dst$$XMMRegister, $mem$$Address);
 4447     } else {
 4448       __ movq($dst$$XMMRegister, $mem$$Address);
 4449       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4450     }
 4451   %}
 4452   ins_pipe( pipe_slow );
 4453 %}
 4454 
 4455 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4456 instruct ReplL_imm(vec dst, immL con) %{
 4457   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4458   match(Set dst (Replicate con));
 4459   format %{ "replicateL $dst,$con" %}
 4460   ins_encode %{
 4461     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4462     int vlen = Matcher::vector_length_in_bytes(this);
 4463     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4464   %}
 4465   ins_pipe( pipe_slow );
 4466 %}
 4467 
 4468 instruct ReplL_zero(vec dst, immL0 zero) %{
 4469   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4470   match(Set dst (Replicate zero));
 4471   format %{ "replicateL $dst,$zero" %}
 4472   ins_encode %{
 4473     int vlen_enc = vector_length_encoding(this);
 4474     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4475       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4476     } else {
 4477       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4478     }
 4479   %}
 4480   ins_pipe( fpu_reg_reg );
 4481 %}
 4482 
 4483 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4484   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4485   match(Set dst (Replicate con));
 4486   format %{ "vallones $dst" %}
 4487   ins_encode %{
 4488     int vector_len = vector_length_encoding(this);
 4489     __ vallones($dst$$XMMRegister, vector_len);
 4490   %}
 4491   ins_pipe( pipe_slow );
 4492 %}
 4493 
 4494 // ====================ReplicateF=======================================
 4495 
 4496 instruct vReplF_reg(vec dst, vlRegF src) %{
 4497   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4498   match(Set dst (Replicate src));
 4499   format %{ "replicateF $dst,$src" %}
 4500   ins_encode %{
 4501     uint vlen = Matcher::vector_length(this);
 4502     int vlen_enc = vector_length_encoding(this);
 4503     if (vlen <= 4) {
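            // vpermilps with imm8 0x00 replicates the low float across the 128-bit register.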
 4504       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4505     } else if (VM_Version::supports_avx2()) {
 4506       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4507     } else {
 4508       assert(vlen == 8, "sanity");
 4509       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4510       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4511     }
 4512   %}
 4513   ins_pipe( pipe_slow );
 4514 %}
 4515 
 4516 instruct ReplF_reg(vec dst, vlRegF src) %{
 4517   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4518   match(Set dst (Replicate src));
 4519   format %{ "replicateF $dst,$src" %}
 4520   ins_encode %{
 4521     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4522   %}
 4523   ins_pipe( pipe_slow );
 4524 %}
 4525 
 4526 instruct ReplF_mem(vec dst, memory mem) %{
 4527   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4528   match(Set dst (Replicate (LoadF mem)));
 4529   format %{ "replicateF $dst,$mem" %}
 4530   ins_encode %{
 4531     int vlen_enc = vector_length_encoding(this);
 4532     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4533   %}
 4534   ins_pipe( pipe_slow );
 4535 %}
 4536 
 4537 // Replicate float scalar immediate to be vector by loading from const table.
 4538 instruct ReplF_imm(vec dst, immF con) %{
 4539   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4540   match(Set dst (Replicate con));
 4541   format %{ "replicateF $dst,$con" %}
 4542   ins_encode %{
 4543     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4544         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4545     int vlen = Matcher::vector_length_in_bytes(this);
 4546     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4547   %}
 4548   ins_pipe( pipe_slow );
 4549 %}
 4550 
 4551 instruct ReplF_zero(vec dst, immF0 zero) %{
 4552   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4553   match(Set dst (Replicate zero));
 4554   format %{ "replicateF $dst,$zero" %}
 4555   ins_encode %{
 4556     int vlen_enc = vector_length_encoding(this);
 4557     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4558       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4559     } else {
 4560       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4561     }
 4562   %}
 4563   ins_pipe( fpu_reg_reg );
 4564 %}
 4565 
 4566 // ====================ReplicateD=======================================
 4567 
 4568 // Replicate double (8 bytes) scalar to be vector
 4569 instruct vReplD_reg(vec dst, vlRegD src) %{
 4570   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4571   match(Set dst (Replicate src));
 4572   format %{ "replicateD $dst,$src" %}
 4573   ins_encode %{
 4574     uint vlen = Matcher::vector_length(this);
 4575     int vlen_enc = vector_length_encoding(this);
 4576     if (vlen <= 2) {
 4577       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4578     } else if (VM_Version::supports_avx2()) {
 4579       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4580     } else {
 4581       assert(vlen == 4, "sanity");
 4582       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4583       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4584     }
 4585   %}
 4586   ins_pipe( pipe_slow );
 4587 %}
 4588 
 4589 instruct ReplD_reg(vec dst, vlRegD src) %{
 4590   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4591   match(Set dst (Replicate src));
 4592   format %{ "replicateD $dst,$src" %}
 4593   ins_encode %{
 4594     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4595   %}
 4596   ins_pipe( pipe_slow );
 4597 %}
 4598 
 4599 instruct ReplD_mem(vec dst, memory mem) %{
 4600   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4601   match(Set dst (Replicate (LoadD mem)));
 4602   format %{ "replicateD $dst,$mem" %}
 4603   ins_encode %{
 4604     if (Matcher::vector_length(this) >= 4) {
 4605       int vlen_enc = vector_length_encoding(this);
 4606       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4607     } else {
 4608       __ movddup($dst$$XMMRegister, $mem$$Address);
 4609     }
 4610   %}
 4611   ins_pipe( pipe_slow );
 4612 %}
 4613 
 4614 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4615 instruct ReplD_imm(vec dst, immD con) %{
 4616   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4617   match(Set dst (Replicate con));
 4618   format %{ "replicateD $dst,$con" %}
 4619   ins_encode %{
 4620     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4621     int vlen = Matcher::vector_length_in_bytes(this);
 4622     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4623   %}
 4624   ins_pipe( pipe_slow );
 4625 %}
 4626 
 4627 instruct ReplD_zero(vec dst, immD0 zero) %{
 4628   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4629   match(Set dst (Replicate zero));
 4630   format %{ "replicateD $dst,$zero" %}
 4631   ins_encode %{
 4632     int vlen_enc = vector_length_encoding(this);
 4633     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4634       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4635     } else {
 4636       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4637     }
 4638   %}
 4639   ins_pipe( fpu_reg_reg );
 4640 %}
 4641 
 4642 // ====================VECTOR INSERT=======================================
 4643 
 4644 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4645   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4646   match(Set dst (VectorInsert (Binary dst val) idx));
 4647   format %{ "vector_insert $dst,$val,$idx" %}
 4648   ins_encode %{
 4649     assert(UseSSE >= 4, "required");
 4650     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4651 
 4652     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4653 
 4654     assert(is_integral_type(elem_bt), "");
 4655     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4656 
 4657     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4658   %}
 4659   ins_pipe( pipe_slow );
 4660 %}
 4661 
 4662 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4663   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4664   match(Set dst (VectorInsert (Binary src val) idx));
 4665   effect(TEMP vtmp);
 4666   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4667   ins_encode %{
 4668     int vlen_enc = Assembler::AVX_256bit;
 4669     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4670     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4671     int log2epr = log2(elem_per_lane);
 4672 
 4673     assert(is_integral_type(elem_bt), "sanity");
 4674     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4675 
 4676     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4677     uint y_idx = ($idx$$constant >> log2epr) & 1;
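          // Extract the 128-bit lane holding the target element, insert the scalar into
          // that lane, then write the updated lane back into the destination vector.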
 4678     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4679     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4680     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4681   %}
 4682   ins_pipe( pipe_slow );
 4683 %}
 4684 
 4685 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4686   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4687   match(Set dst (VectorInsert (Binary src val) idx));
 4688   effect(TEMP vtmp);
 4689   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4690   ins_encode %{
 4691     assert(UseAVX > 2, "sanity");
 4692 
 4693     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4694     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4695     int log2epr = log2(elem_per_lane);
 4696 
 4697     assert(is_integral_type(elem_bt), "");
 4698     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4699 
 4700     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4701     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4702     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4703     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4704     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4705   %}
 4706   ins_pipe( pipe_slow );
 4707 %}
 4708 
 4709 #ifdef _LP64
 4710 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4711   predicate(Matcher::vector_length(n) == 2);
 4712   match(Set dst (VectorInsert (Binary dst val) idx));
 4713   format %{ "vector_insert $dst,$val,$idx" %}
 4714   ins_encode %{
 4715     assert(UseSSE >= 4, "required");
 4716     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4717     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4718 
 4719     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4720   %}
 4721   ins_pipe( pipe_slow );
 4722 %}
 4723 
 4724 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4725   predicate(Matcher::vector_length(n) == 4);
 4726   match(Set dst (VectorInsert (Binary src val) idx));
 4727   effect(TEMP vtmp);
 4728   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4729   ins_encode %{
 4730     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4731     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4732 
 4733     uint x_idx = $idx$$constant & right_n_bits(1);
 4734     uint y_idx = ($idx$$constant >> 1) & 1;
 4735     int vlen_enc = Assembler::AVX_256bit;
 4736     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4737     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4738     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4739   %}
 4740   ins_pipe( pipe_slow );
 4741 %}
 4742 
 4743 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4744   predicate(Matcher::vector_length(n) == 8);
 4745   match(Set dst (VectorInsert (Binary src val) idx));
 4746   effect(TEMP vtmp);
 4747   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4748   ins_encode %{
 4749     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4750     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4751 
 4752     uint x_idx = $idx$$constant & right_n_bits(1);
 4753     uint y_idx = ($idx$$constant >> 1) & 3;
 4754     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4755     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4756     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4757   %}
 4758   ins_pipe( pipe_slow );
 4759 %}
 4760 #endif
 4761 
 4762 instruct insertF(vec dst, regF val, immU8 idx) %{
 4763   predicate(Matcher::vector_length(n) < 8);
 4764   match(Set dst (VectorInsert (Binary dst val) idx));
 4765   format %{ "vector_insert $dst,$val,$idx" %}
 4766   ins_encode %{
 4767     assert(UseSSE >= 4, "sanity");
 4768 
 4769     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4770     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4771 
 4772     uint x_idx = $idx$$constant & right_n_bits(2);
 4773     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4774   %}
 4775   ins_pipe( pipe_slow );
 4776 %}
 4777 
 4778 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4779   predicate(Matcher::vector_length(n) >= 8);
 4780   match(Set dst (VectorInsert (Binary src val) idx));
 4781   effect(TEMP vtmp);
 4782   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4783   ins_encode %{
 4784     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4785     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4786 
 4787     int vlen = Matcher::vector_length(this);
 4788     uint x_idx = $idx$$constant & right_n_bits(2);
 4789     if (vlen == 8) {
 4790       uint y_idx = ($idx$$constant >> 2) & 1;
 4791       int vlen_enc = Assembler::AVX_256bit;
 4792       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4793       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4794       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4795     } else {
 4796       assert(vlen == 16, "sanity");
 4797       uint y_idx = ($idx$$constant >> 2) & 3;
 4798       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4799       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4800       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4801     }
 4802   %}
 4803   ins_pipe( pipe_slow );
 4804 %}
 4805 
 4806 #ifdef _LP64
 4807 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4808   predicate(Matcher::vector_length(n) == 2);
 4809   match(Set dst (VectorInsert (Binary dst val) idx));
 4810   effect(TEMP tmp);
 4811   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4812   ins_encode %{
 4813     assert(UseSSE >= 4, "sanity");
 4814     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4815     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4816 
 4817     __ movq($tmp$$Register, $val$$XMMRegister);
 4818     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4819   %}
 4820   ins_pipe( pipe_slow );
 4821 %}
 4822 
 4823 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4824   predicate(Matcher::vector_length(n) == 4);
 4825   match(Set dst (VectorInsert (Binary src val) idx));
 4826   effect(TEMP vtmp, TEMP tmp);
 4827   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4828   ins_encode %{
 4829     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4830     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4831 
 4832     uint x_idx = $idx$$constant & right_n_bits(1);
 4833     uint y_idx = ($idx$$constant >> 1) & 1;
 4834     int vlen_enc = Assembler::AVX_256bit;
 4835     __ movq($tmp$$Register, $val$$XMMRegister);
 4836     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4837     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4838     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4839   %}
 4840   ins_pipe( pipe_slow );
 4841 %}
 4842 
 4843 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4844   predicate(Matcher::vector_length(n) == 8);
 4845   match(Set dst (VectorInsert (Binary src val) idx));
 4846   effect(TEMP tmp, TEMP vtmp);
 4847   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4848   ins_encode %{
 4849     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4850     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4851 
 4852     uint x_idx = $idx$$constant & right_n_bits(1);
 4853     uint y_idx = ($idx$$constant >> 1) & 3;
 4854     __ movq($tmp$$Register, $val$$XMMRegister);
 4855     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4856     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4857     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4858   %}
 4859   ins_pipe( pipe_slow );
 4860 %}
 4861 #endif
 4862 
 4863 // ====================REDUCTION ARITHMETIC=======================================
 4864 
 4865 // =======================Int Reduction==========================================
 4866 
 4867 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4868   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4869   match(Set dst (AddReductionVI src1 src2));
 4870   match(Set dst (MulReductionVI src1 src2));
 4871   match(Set dst (AndReductionV  src1 src2));
 4872   match(Set dst ( OrReductionV  src1 src2));
 4873   match(Set dst (XorReductionV  src1 src2));
 4874   match(Set dst (MinReductionV  src1 src2));
 4875   match(Set dst (MaxReductionV  src1 src2));
 4876   effect(TEMP vtmp1, TEMP vtmp2);
 4877   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4878   ins_encode %{
 4879     int opcode = this->ideal_Opcode();
 4880     int vlen = Matcher::vector_length(this, $src2);
 4881     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4882   %}
 4883   ins_pipe( pipe_slow );
 4884 %}
 4885 
 4886 // =======================Long Reduction==========================================
 4887 
 4888 #ifdef _LP64
 4889 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4890   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4891   match(Set dst (AddReductionVL src1 src2));
 4892   match(Set dst (MulReductionVL src1 src2));
 4893   match(Set dst (AndReductionV  src1 src2));
 4894   match(Set dst ( OrReductionV  src1 src2));
 4895   match(Set dst (XorReductionV  src1 src2));
 4896   match(Set dst (MinReductionV  src1 src2));
 4897   match(Set dst (MaxReductionV  src1 src2));
 4898   effect(TEMP vtmp1, TEMP vtmp2);
 4899   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4900   ins_encode %{
 4901     int opcode = this->ideal_Opcode();
 4902     int vlen = Matcher::vector_length(this, $src2);
 4903     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4904   %}
 4905   ins_pipe( pipe_slow );
 4906 %}
 4907 
 4908 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4909   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4910   match(Set dst (AddReductionVL src1 src2));
 4911   match(Set dst (MulReductionVL src1 src2));
 4912   match(Set dst (AndReductionV  src1 src2));
 4913   match(Set dst ( OrReductionV  src1 src2));
 4914   match(Set dst (XorReductionV  src1 src2));
 4915   match(Set dst (MinReductionV  src1 src2));
 4916   match(Set dst (MaxReductionV  src1 src2));
 4917   effect(TEMP vtmp1, TEMP vtmp2);
 4918   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4919   ins_encode %{
 4920     int opcode = this->ideal_Opcode();
 4921     int vlen = Matcher::vector_length(this, $src2);
 4922     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4923   %}
 4924   ins_pipe( pipe_slow );
 4925 %}
 4926 #endif // _LP64
 4927 
 4928 // =======================Float Reduction==========================================
 4929 
 4930 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4931   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4932   match(Set dst (AddReductionVF dst src));
 4933   match(Set dst (MulReductionVF dst src));
 4934   effect(TEMP dst, TEMP vtmp);
 4935   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4936   ins_encode %{
 4937     int opcode = this->ideal_Opcode();
 4938     int vlen = Matcher::vector_length(this, $src);
 4939     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4940   %}
 4941   ins_pipe( pipe_slow );
 4942 %}
 4943 
 4944 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4945   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4946   match(Set dst (AddReductionVF dst src));
 4947   match(Set dst (MulReductionVF dst src));
 4948   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4949   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4950   ins_encode %{
 4951     int opcode = this->ideal_Opcode();
 4952     int vlen = Matcher::vector_length(this, $src);
 4953     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4954   %}
 4955   ins_pipe( pipe_slow );
 4956 %}
 4957 
 4958 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4959   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4960   match(Set dst (AddReductionVF dst src));
 4961   match(Set dst (MulReductionVF dst src));
 4962   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4963   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4964   ins_encode %{
 4965     int opcode = this->ideal_Opcode();
 4966     int vlen = Matcher::vector_length(this, $src);
 4967     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4968   %}
 4969   ins_pipe( pipe_slow );
 4970 %}
 4971 
 4972 // =======================Double Reduction==========================================
 4973 
 4974 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4975   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4976   match(Set dst (AddReductionVD dst src));
 4977   match(Set dst (MulReductionVD dst src));
 4978   effect(TEMP dst, TEMP vtmp);
 4979   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4980   ins_encode %{
 4981     int opcode = this->ideal_Opcode();
 4982     int vlen = Matcher::vector_length(this, $src);
 4983     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4984   %}
 4985   ins_pipe( pipe_slow );
 4986 %}
 4987 
 4988 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4989   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4990   match(Set dst (AddReductionVD dst src));
 4991   match(Set dst (MulReductionVD dst src));
 4992   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4993   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4994   ins_encode %{
 4995     int opcode = this->ideal_Opcode();
 4996     int vlen = Matcher::vector_length(this, $src);
 4997     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4998   %}
 4999   ins_pipe( pipe_slow );
 5000 %}
 5001 
 5002 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5003   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 5004   match(Set dst (AddReductionVD dst src));
 5005   match(Set dst (MulReductionVD dst src));
 5006   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5007   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5008   ins_encode %{
 5009     int opcode = this->ideal_Opcode();
 5010     int vlen = Matcher::vector_length(this, $src);
 5011     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5012   %}
 5013   ins_pipe( pipe_slow );
 5014 %}
 5015 
 5016 // =======================Byte Reduction==========================================
 5017 
 5018 #ifdef _LP64
 5019 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5020   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5021   match(Set dst (AddReductionVI src1 src2));
 5022   match(Set dst (AndReductionV  src1 src2));
 5023   match(Set dst ( OrReductionV  src1 src2));
 5024   match(Set dst (XorReductionV  src1 src2));
 5025   match(Set dst (MinReductionV  src1 src2));
 5026   match(Set dst (MaxReductionV  src1 src2));
 5027   effect(TEMP vtmp1, TEMP vtmp2);
 5028   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5029   ins_encode %{
 5030     int opcode = this->ideal_Opcode();
 5031     int vlen = Matcher::vector_length(this, $src2);
 5032     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5033   %}
 5034   ins_pipe( pipe_slow );
 5035 %}
 5036 
 5037 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5038   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5039   match(Set dst (AddReductionVI src1 src2));
 5040   match(Set dst (AndReductionV  src1 src2));
 5041   match(Set dst ( OrReductionV  src1 src2));
 5042   match(Set dst (XorReductionV  src1 src2));
 5043   match(Set dst (MinReductionV  src1 src2));
 5044   match(Set dst (MaxReductionV  src1 src2));
 5045   effect(TEMP vtmp1, TEMP vtmp2);
 5046   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5047   ins_encode %{
 5048     int opcode = this->ideal_Opcode();
 5049     int vlen = Matcher::vector_length(this, $src2);
 5050     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5051   %}
 5052   ins_pipe( pipe_slow );
 5053 %}
 5054 #endif
 5055 
 5056 // =======================Short Reduction==========================================
 5057 
 5058 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5059   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5060   match(Set dst (AddReductionVI src1 src2));
 5061   match(Set dst (MulReductionVI src1 src2));
 5062   match(Set dst (AndReductionV  src1 src2));
 5063   match(Set dst ( OrReductionV  src1 src2));
 5064   match(Set dst (XorReductionV  src1 src2));
 5065   match(Set dst (MinReductionV  src1 src2));
 5066   match(Set dst (MaxReductionV  src1 src2));
 5067   effect(TEMP vtmp1, TEMP vtmp2);
 5068   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5069   ins_encode %{
 5070     int opcode = this->ideal_Opcode();
 5071     int vlen = Matcher::vector_length(this, $src2);
 5072     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5073   %}
 5074   ins_pipe( pipe_slow );
 5075 %}
 5076 
 5077 // =======================Mul Reduction==========================================
 5078 
 5079 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5080   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5081             Matcher::vector_length(n->in(2)) <= 32); // src2
 5082   match(Set dst (MulReductionVI src1 src2));
 5083   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5084   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5085   ins_encode %{
 5086     int opcode = this->ideal_Opcode();
 5087     int vlen = Matcher::vector_length(this, $src2);
 5088     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5089   %}
 5090   ins_pipe( pipe_slow );
 5091 %}
 5092 
 5093 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5095             Matcher::vector_length(n->in(2)) == 64); // src2
 5096   match(Set dst (MulReductionVI src1 src2));
 5097   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5098   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5099   ins_encode %{
 5100     int opcode = this->ideal_Opcode();
 5101     int vlen = Matcher::vector_length(this, $src2);
 5102     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5103   %}
 5104   ins_pipe( pipe_slow );
 5105 %}
 5106 
 5107 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
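// The immF src1 variants are used when the scalar input is the reduction
// identity (+Inf for min, -Inf for max) and therefore does not need to be
// folded into the result; the *_av variants accumulate into dst instead.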
 5109 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5110                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5111   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5112             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5113              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5114             Matcher::vector_length(n->in(2)) == 2);
 5115   match(Set dst (MinReductionV src1 src2));
 5116   match(Set dst (MaxReductionV src1 src2));
 5117   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5118   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5119   ins_encode %{
 5120     assert(UseAVX > 0, "sanity");
 5121 
 5122     int opcode = this->ideal_Opcode();
 5123     int vlen = Matcher::vector_length(this, $src2);
 5124     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5125                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5126   %}
 5127   ins_pipe( pipe_slow );
 5128 %}
 5129 
 5130 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5131                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5132   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5133             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5134              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5135             Matcher::vector_length(n->in(2)) >= 4);
 5136   match(Set dst (MinReductionV src1 src2));
 5137   match(Set dst (MaxReductionV src1 src2));
 5138   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5139   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5140   ins_encode %{
 5141     assert(UseAVX > 0, "sanity");
 5142 
 5143     int opcode = this->ideal_Opcode();
 5144     int vlen = Matcher::vector_length(this, $src2);
 5145     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5146                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5147   %}
 5148   ins_pipe( pipe_slow );
 5149 %}
 5150 
 5151 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5152                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5153   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5154             Matcher::vector_length(n->in(2)) == 2);
 5155   match(Set dst (MinReductionV dst src));
 5156   match(Set dst (MaxReductionV dst src));
 5157   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5158   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5159   ins_encode %{
 5160     assert(UseAVX > 0, "sanity");
 5161 
 5162     int opcode = this->ideal_Opcode();
 5163     int vlen = Matcher::vector_length(this, $src);
 5164     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5165                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5166   %}
 5167   ins_pipe( pipe_slow );
 5168 %}
 5169 
 5170 
 5171 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5172                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5173   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5174             Matcher::vector_length(n->in(2)) >= 4);
 5175   match(Set dst (MinReductionV dst src));
 5176   match(Set dst (MaxReductionV dst src));
 5177   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5178   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5179   ins_encode %{
 5180     assert(UseAVX > 0, "sanity");
 5181 
 5182     int opcode = this->ideal_Opcode();
 5183     int vlen = Matcher::vector_length(this, $src);
 5184     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5185                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5186   %}
 5187   ins_pipe( pipe_slow );
 5188 %}
 5189 
 5190 
//--------------------Min/Max Double Reduction --------------------
 5192 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5193                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5194                             rFlagsReg cr) %{
 5195   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5196             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5197              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5198             Matcher::vector_length(n->in(2)) == 2);
 5199   match(Set dst (MinReductionV src1 src2));
 5200   match(Set dst (MaxReductionV src1 src2));
 5201   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5202   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5203   ins_encode %{
 5204     assert(UseAVX > 0, "sanity");
 5205 
 5206     int opcode = this->ideal_Opcode();
 5207     int vlen = Matcher::vector_length(this, $src2);
 5208     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5209                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5210   %}
 5211   ins_pipe( pipe_slow );
 5212 %}
 5213 
 5214 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5215                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5216                            rFlagsReg cr) %{
 5217   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5218             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5219              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5220             Matcher::vector_length(n->in(2)) >= 4);
 5221   match(Set dst (MinReductionV src1 src2));
 5222   match(Set dst (MaxReductionV src1 src2));
 5223   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5224   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5225   ins_encode %{
 5226     assert(UseAVX > 0, "sanity");
 5227 
 5228     int opcode = this->ideal_Opcode();
 5229     int vlen = Matcher::vector_length(this, $src2);
 5230     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5231                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 
 5237 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5238                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5239                                rFlagsReg cr) %{
 5240   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5241             Matcher::vector_length(n->in(2)) == 2);
 5242   match(Set dst (MinReductionV dst src));
 5243   match(Set dst (MaxReductionV dst src));
 5244   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5245   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5246   ins_encode %{
 5247     assert(UseAVX > 0, "sanity");
 5248 
 5249     int opcode = this->ideal_Opcode();
 5250     int vlen = Matcher::vector_length(this, $src);
 5251     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5252                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5253   %}
 5254   ins_pipe( pipe_slow );
 5255 %}
 5256 
 5257 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5258                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5259                               rFlagsReg cr) %{
 5260   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5261             Matcher::vector_length(n->in(2)) >= 4);
 5262   match(Set dst (MinReductionV dst src));
 5263   match(Set dst (MaxReductionV dst src));
 5264   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5265   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5266   ins_encode %{
 5267     assert(UseAVX > 0, "sanity");
 5268 
 5269     int opcode = this->ideal_Opcode();
 5270     int vlen = Matcher::vector_length(this, $src);
 5271     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5272                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5273   %}
 5274   ins_pipe( pipe_slow );
 5275 %}
 5276 
 5277 // ====================VECTOR ARITHMETIC=======================================
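// Most element types below follow the same three-rule pattern: a two-operand
// SSE form that accumulates into dst, a three-operand AVX register form, and
// an AVX form with a memory operand that is only used for vectors wider than
// 8 bytes.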
 5278 
 5279 // --------------------------------- ADD --------------------------------------
 5280 
 5281 // Bytes vector add
 5282 instruct vaddB(vec dst, vec src) %{
 5283   predicate(UseAVX == 0);
 5284   match(Set dst (AddVB dst src));
 5285   format %{ "paddb   $dst,$src\t! add packedB" %}
 5286   ins_encode %{
 5287     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5288   %}
 5289   ins_pipe( pipe_slow );
 5290 %}
 5291 
 5292 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5293   predicate(UseAVX > 0);
 5294   match(Set dst (AddVB src1 src2));
 5295   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5296   ins_encode %{
 5297     int vlen_enc = vector_length_encoding(this);
 5298     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5299   %}
 5300   ins_pipe( pipe_slow );
 5301 %}
 5302 
 5303 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5304   predicate((UseAVX > 0) &&
 5305             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5306   match(Set dst (AddVB src (LoadVector mem)));
 5307   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5308   ins_encode %{
 5309     int vlen_enc = vector_length_encoding(this);
 5310     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5311   %}
 5312   ins_pipe( pipe_slow );
 5313 %}
 5314 
 5315 // Shorts/Chars vector add
 5316 instruct vaddS(vec dst, vec src) %{
 5317   predicate(UseAVX == 0);
 5318   match(Set dst (AddVS dst src));
 5319   format %{ "paddw   $dst,$src\t! add packedS" %}
 5320   ins_encode %{
 5321     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5322   %}
 5323   ins_pipe( pipe_slow );
 5324 %}
 5325 
 5326 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5327   predicate(UseAVX > 0);
 5328   match(Set dst (AddVS src1 src2));
 5329   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5330   ins_encode %{
 5331     int vlen_enc = vector_length_encoding(this);
 5332     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5333   %}
 5334   ins_pipe( pipe_slow );
 5335 %}
 5336 
 5337 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5338   predicate((UseAVX > 0) &&
 5339             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5340   match(Set dst (AddVS src (LoadVector mem)));
 5341   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5342   ins_encode %{
 5343     int vlen_enc = vector_length_encoding(this);
 5344     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5345   %}
 5346   ins_pipe( pipe_slow );
 5347 %}
 5348 
 5349 // Integers vector add
 5350 instruct vaddI(vec dst, vec src) %{
 5351   predicate(UseAVX == 0);
 5352   match(Set dst (AddVI dst src));
 5353   format %{ "paddd   $dst,$src\t! add packedI" %}
 5354   ins_encode %{
 5355     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5356   %}
 5357   ins_pipe( pipe_slow );
 5358 %}
 5359 
 5360 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5361   predicate(UseAVX > 0);
 5362   match(Set dst (AddVI src1 src2));
 5363   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5364   ins_encode %{
 5365     int vlen_enc = vector_length_encoding(this);
 5366     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5367   %}
 5368   ins_pipe( pipe_slow );
 5369 %}
 5370 
 5371 
 5372 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5373   predicate((UseAVX > 0) &&
 5374             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5375   match(Set dst (AddVI src (LoadVector mem)));
 5376   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5377   ins_encode %{
 5378     int vlen_enc = vector_length_encoding(this);
 5379     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5380   %}
 5381   ins_pipe( pipe_slow );
 5382 %}
 5383 
 5384 // Longs vector add
 5385 instruct vaddL(vec dst, vec src) %{
 5386   predicate(UseAVX == 0);
 5387   match(Set dst (AddVL dst src));
 5388   format %{ "paddq   $dst,$src\t! add packedL" %}
 5389   ins_encode %{
 5390     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5391   %}
 5392   ins_pipe( pipe_slow );
 5393 %}
 5394 
 5395 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5396   predicate(UseAVX > 0);
 5397   match(Set dst (AddVL src1 src2));
 5398   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5399   ins_encode %{
 5400     int vlen_enc = vector_length_encoding(this);
 5401     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5402   %}
 5403   ins_pipe( pipe_slow );
 5404 %}
 5405 
 5406 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5407   predicate((UseAVX > 0) &&
 5408             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5409   match(Set dst (AddVL src (LoadVector mem)));
 5410   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5411   ins_encode %{
 5412     int vlen_enc = vector_length_encoding(this);
 5413     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5414   %}
 5415   ins_pipe( pipe_slow );
 5416 %}
 5417 
 5418 // Floats vector add
 5419 instruct vaddF(vec dst, vec src) %{
 5420   predicate(UseAVX == 0);
 5421   match(Set dst (AddVF dst src));
 5422   format %{ "addps   $dst,$src\t! add packedF" %}
 5423   ins_encode %{
 5424     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5425   %}
 5426   ins_pipe( pipe_slow );
 5427 %}
 5428 
 5429 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5430   predicate(UseAVX > 0);
 5431   match(Set dst (AddVF src1 src2));
 5432   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5433   ins_encode %{
 5434     int vlen_enc = vector_length_encoding(this);
 5435     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5436   %}
 5437   ins_pipe( pipe_slow );
 5438 %}
 5439 
 5440 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5441   predicate((UseAVX > 0) &&
 5442             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5443   match(Set dst (AddVF src (LoadVector mem)));
 5444   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5445   ins_encode %{
 5446     int vlen_enc = vector_length_encoding(this);
 5447     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5448   %}
 5449   ins_pipe( pipe_slow );
 5450 %}
 5451 
 5452 // Doubles vector add
 5453 instruct vaddD(vec dst, vec src) %{
 5454   predicate(UseAVX == 0);
 5455   match(Set dst (AddVD dst src));
 5456   format %{ "addpd   $dst,$src\t! add packedD" %}
 5457   ins_encode %{
 5458     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5459   %}
 5460   ins_pipe( pipe_slow );
 5461 %}
 5462 
 5463 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5464   predicate(UseAVX > 0);
 5465   match(Set dst (AddVD src1 src2));
 5466   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5467   ins_encode %{
 5468     int vlen_enc = vector_length_encoding(this);
 5469     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5470   %}
 5471   ins_pipe( pipe_slow );
 5472 %}
 5473 
 5474 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5475   predicate((UseAVX > 0) &&
 5476             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5477   match(Set dst (AddVD src (LoadVector mem)));
 5478   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5479   ins_encode %{
 5480     int vlen_enc = vector_length_encoding(this);
 5481     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5482   %}
 5483   ins_pipe( pipe_slow );
 5484 %}
 5485 
 5486 // --------------------------------- SUB --------------------------------------
 5487 
 5488 // Bytes vector sub
 5489 instruct vsubB(vec dst, vec src) %{
 5490   predicate(UseAVX == 0);
 5491   match(Set dst (SubVB dst src));
 5492   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5493   ins_encode %{
 5494     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5495   %}
 5496   ins_pipe( pipe_slow );
 5497 %}
 5498 
 5499 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5500   predicate(UseAVX > 0);
 5501   match(Set dst (SubVB src1 src2));
 5502   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5503   ins_encode %{
 5504     int vlen_enc = vector_length_encoding(this);
 5505     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5506   %}
 5507   ins_pipe( pipe_slow );
 5508 %}
 5509 
 5510 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5511   predicate((UseAVX > 0) &&
 5512             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5513   match(Set dst (SubVB src (LoadVector mem)));
 5514   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5515   ins_encode %{
 5516     int vlen_enc = vector_length_encoding(this);
 5517     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5518   %}
 5519   ins_pipe( pipe_slow );
 5520 %}
 5521 
 5522 // Shorts/Chars vector sub
 5523 instruct vsubS(vec dst, vec src) %{
 5524   predicate(UseAVX == 0);
 5525   match(Set dst (SubVS dst src));
 5526   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5527   ins_encode %{
 5528     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5529   %}
 5530   ins_pipe( pipe_slow );
 5531 %}
 5532 
 5533 
 5534 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5535   predicate(UseAVX > 0);
 5536   match(Set dst (SubVS src1 src2));
 5537   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5538   ins_encode %{
 5539     int vlen_enc = vector_length_encoding(this);
 5540     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5541   %}
 5542   ins_pipe( pipe_slow );
 5543 %}
 5544 
 5545 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5546   predicate((UseAVX > 0) &&
 5547             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5548   match(Set dst (SubVS src (LoadVector mem)));
 5549   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5550   ins_encode %{
 5551     int vlen_enc = vector_length_encoding(this);
 5552     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5553   %}
 5554   ins_pipe( pipe_slow );
 5555 %}
 5556 
 5557 // Integers vector sub
 5558 instruct vsubI(vec dst, vec src) %{
 5559   predicate(UseAVX == 0);
 5560   match(Set dst (SubVI dst src));
 5561   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5562   ins_encode %{
 5563     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5564   %}
 5565   ins_pipe( pipe_slow );
 5566 %}
 5567 
 5568 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5569   predicate(UseAVX > 0);
 5570   match(Set dst (SubVI src1 src2));
 5571   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5572   ins_encode %{
 5573     int vlen_enc = vector_length_encoding(this);
 5574     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5575   %}
 5576   ins_pipe( pipe_slow );
 5577 %}
 5578 
 5579 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5580   predicate((UseAVX > 0) &&
 5581             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5582   match(Set dst (SubVI src (LoadVector mem)));
 5583   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5584   ins_encode %{
 5585     int vlen_enc = vector_length_encoding(this);
 5586     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5587   %}
 5588   ins_pipe( pipe_slow );
 5589 %}
 5590 
 5591 // Longs vector sub
 5592 instruct vsubL(vec dst, vec src) %{
 5593   predicate(UseAVX == 0);
 5594   match(Set dst (SubVL dst src));
 5595   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5596   ins_encode %{
 5597     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5598   %}
 5599   ins_pipe( pipe_slow );
 5600 %}
 5601 
 5602 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5603   predicate(UseAVX > 0);
 5604   match(Set dst (SubVL src1 src2));
 5605   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5606   ins_encode %{
 5607     int vlen_enc = vector_length_encoding(this);
 5608     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5609   %}
 5610   ins_pipe( pipe_slow );
 5611 %}
 5612 
 5613 
 5614 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5615   predicate((UseAVX > 0) &&
 5616             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5617   match(Set dst (SubVL src (LoadVector mem)));
 5618   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5619   ins_encode %{
 5620     int vlen_enc = vector_length_encoding(this);
 5621     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5622   %}
 5623   ins_pipe( pipe_slow );
 5624 %}
 5625 
 5626 // Floats vector sub
 5627 instruct vsubF(vec dst, vec src) %{
 5628   predicate(UseAVX == 0);
 5629   match(Set dst (SubVF dst src));
 5630   format %{ "subps   $dst,$src\t! sub packedF" %}
 5631   ins_encode %{
 5632     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5633   %}
 5634   ins_pipe( pipe_slow );
 5635 %}
 5636 
 5637 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5638   predicate(UseAVX > 0);
 5639   match(Set dst (SubVF src1 src2));
 5640   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5641   ins_encode %{
 5642     int vlen_enc = vector_length_encoding(this);
 5643     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5644   %}
 5645   ins_pipe( pipe_slow );
 5646 %}
 5647 
 5648 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5649   predicate((UseAVX > 0) &&
 5650             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5651   match(Set dst (SubVF src (LoadVector mem)));
 5652   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5653   ins_encode %{
 5654     int vlen_enc = vector_length_encoding(this);
 5655     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5656   %}
 5657   ins_pipe( pipe_slow );
 5658 %}
 5659 
 5660 // Doubles vector sub
 5661 instruct vsubD(vec dst, vec src) %{
 5662   predicate(UseAVX == 0);
 5663   match(Set dst (SubVD dst src));
 5664   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5665   ins_encode %{
 5666     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5667   %}
 5668   ins_pipe( pipe_slow );
 5669 %}
 5670 
 5671 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5672   predicate(UseAVX > 0);
 5673   match(Set dst (SubVD src1 src2));
 5674   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5675   ins_encode %{
 5676     int vlen_enc = vector_length_encoding(this);
 5677     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5678   %}
 5679   ins_pipe( pipe_slow );
 5680 %}
 5681 
 5682 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5683   predicate((UseAVX > 0) &&
 5684             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5685   match(Set dst (SubVD src (LoadVector mem)));
 5686   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5687   ins_encode %{
 5688     int vlen_enc = vector_length_encoding(this);
 5689     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5690   %}
 5691   ins_pipe( pipe_slow );
 5692 %}
 5693 
 5694 // --------------------------------- MUL --------------------------------------
 5695 
 5696 // Byte vector mul
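// x86 has no byte-wise multiply, so byte vectors are multiplied as 16-bit
// words. For <= 8 bytes (vmul8B) the inputs are sign-extended to words,
// multiplied with pmullw and packed back down. For wider vectors the
// odd-indexed bytes are isolated with a right shift by 8, multiplied as words
// and shifted back into the high byte, while the even-indexed bytes are
// multiplied in place and masked to the low byte (the low 8 bits of a 16-bit
// product depend only on the low 8 bits of the inputs); the halves are OR-ed.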
 5697 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5698   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5699   match(Set dst (MulVB src1 src2));
 5700   effect(TEMP dst, TEMP xtmp);
 5701   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5702   ins_encode %{
 5703     assert(UseSSE > 3, "required");
 5704     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5705     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5706     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5707     __ psllw($dst$$XMMRegister, 8);
 5708     __ psrlw($dst$$XMMRegister, 8);
 5709     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5710   %}
 5711   ins_pipe( pipe_slow );
 5712 %}
 5713 
 5714 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5715   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5716   match(Set dst (MulVB src1 src2));
 5717   effect(TEMP dst, TEMP xtmp);
 5718   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5719   ins_encode %{
 5720     assert(UseSSE > 3, "required");
 5721     // Odd-index elements
 5722     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5723     __ psrlw($dst$$XMMRegister, 8);
 5724     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5725     __ psrlw($xtmp$$XMMRegister, 8);
 5726     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5727     __ psllw($dst$$XMMRegister, 8);
 5728     // Even-index elements
 5729     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5730     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5731     __ psllw($xtmp$$XMMRegister, 8);
 5732     __ psrlw($xtmp$$XMMRegister, 8);
 5733     // Combine
 5734     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5735   %}
 5736   ins_pipe( pipe_slow );
 5737 %}
 5738 
 5739 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5740   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5741   match(Set dst (MulVB src1 src2));
 5742   effect(TEMP xtmp1, TEMP xtmp2);
 5743   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5744   ins_encode %{
 5745     int vlen_enc = vector_length_encoding(this);
 5746     // Odd-index elements
 5747     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5748     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5749     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5750     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5751     // Even-index elements
 5752     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5753     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5754     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5755     // Combine
 5756     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5757   %}
 5758   ins_pipe( pipe_slow );
 5759 %}
 5760 
 5761 // Shorts/Chars vector mul
 5762 instruct vmulS(vec dst, vec src) %{
 5763   predicate(UseAVX == 0);
 5764   match(Set dst (MulVS dst src));
 5765   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5766   ins_encode %{
 5767     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5768   %}
 5769   ins_pipe( pipe_slow );
 5770 %}
 5771 
 5772 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5773   predicate(UseAVX > 0);
 5774   match(Set dst (MulVS src1 src2));
 5775   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5776   ins_encode %{
 5777     int vlen_enc = vector_length_encoding(this);
 5778     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5779   %}
 5780   ins_pipe( pipe_slow );
 5781 %}
 5782 
 5783 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5784   predicate((UseAVX > 0) &&
 5785             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5786   match(Set dst (MulVS src (LoadVector mem)));
 5787   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5788   ins_encode %{
 5789     int vlen_enc = vector_length_encoding(this);
 5790     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5791   %}
 5792   ins_pipe( pipe_slow );
 5793 %}
 5794 
 5795 // Integers vector mul
 5796 instruct vmulI(vec dst, vec src) %{
 5797   predicate(UseAVX == 0);
 5798   match(Set dst (MulVI dst src));
 5799   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5800   ins_encode %{
 5801     assert(UseSSE > 3, "required");
 5802     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5803   %}
 5804   ins_pipe( pipe_slow );
 5805 %}
 5806 
 5807 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5808   predicate(UseAVX > 0);
 5809   match(Set dst (MulVI src1 src2));
 5810   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5811   ins_encode %{
 5812     int vlen_enc = vector_length_encoding(this);
 5813     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5814   %}
 5815   ins_pipe( pipe_slow );
 5816 %}
 5817 
 5818 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5819   predicate((UseAVX > 0) &&
 5820             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5821   match(Set dst (MulVI src (LoadVector mem)));
 5822   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5823   ins_encode %{
 5824     int vlen_enc = vector_length_encoding(this);
 5825     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5826   %}
 5827   ins_pipe( pipe_slow );
 5828 %}
 5829 
 5830 // Longs vector mul
 5831 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5832   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5833              VM_Version::supports_avx512dq()) ||
 5834             VM_Version::supports_avx512vldq());
 5835   match(Set dst (MulVL src1 src2));
 5836   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5837   ins_encode %{
 5838     assert(UseAVX > 2, "required");
 5839     int vlen_enc = vector_length_encoding(this);
 5840     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5841   %}
 5842   ins_pipe( pipe_slow );
 5843 %}
 5844 
 5845 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5846   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5847              VM_Version::supports_avx512dq()) ||
 5848             (Matcher::vector_length_in_bytes(n) > 8 &&
 5849              VM_Version::supports_avx512vldq()));
 5850   match(Set dst (MulVL src (LoadVector mem)));
 5851   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5852   ins_encode %{
 5853     assert(UseAVX > 2, "required");
 5854     int vlen_enc = vector_length_encoding(this);
 5855     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5856   %}
 5857   ins_pipe( pipe_slow );
 5858 %}
 5859 
 5860 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5861   predicate(UseAVX == 0);
 5862   match(Set dst (MulVL src1 src2));
 5863   effect(TEMP dst, TEMP xtmp);
 5864   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5865   ins_encode %{
 5866     assert(VM_Version::supports_sse4_1(), "required");
    // Get the lo-hi products; only the lower 32 bits are of concern
 5868     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5869     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5870     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5871     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5872     __ psllq($dst$$XMMRegister, 32);
 5873     // Get the lo-lo products
 5874     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5875     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5876     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5877   %}
 5878   ins_pipe( pipe_slow );
 5879 %}
 5880 
 5881 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5882   predicate(UseAVX > 0 &&
 5883             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5884               !VM_Version::supports_avx512dq()) ||
 5885              (Matcher::vector_length_in_bytes(n) < 64 &&
 5886               !VM_Version::supports_avx512vldq())));
 5887   match(Set dst (MulVL src1 src2));
 5888   effect(TEMP xtmp1, TEMP xtmp2);
 5889   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5890   ins_encode %{
 5891     int vlen_enc = vector_length_encoding(this);
    // Get the lo-hi products; only the lower 32 bits are of concern
 5893     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5894     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5895     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5896     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5897     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5898     // Get the lo-lo products
 5899     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5900     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5901   %}
 5902   ins_pipe( pipe_slow );
 5903 %}
 5904 
 5905 // Floats vector mul
 5906 instruct vmulF(vec dst, vec src) %{
 5907   predicate(UseAVX == 0);
 5908   match(Set dst (MulVF dst src));
 5909   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5910   ins_encode %{
 5911     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5912   %}
 5913   ins_pipe( pipe_slow );
 5914 %}
 5915 
 5916 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5917   predicate(UseAVX > 0);
 5918   match(Set dst (MulVF src1 src2));
 5919   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5920   ins_encode %{
 5921     int vlen_enc = vector_length_encoding(this);
 5922     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5923   %}
 5924   ins_pipe( pipe_slow );
 5925 %}
 5926 
 5927 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5928   predicate((UseAVX > 0) &&
 5929             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5930   match(Set dst (MulVF src (LoadVector mem)));
 5931   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5932   ins_encode %{
 5933     int vlen_enc = vector_length_encoding(this);
 5934     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5935   %}
 5936   ins_pipe( pipe_slow );
 5937 %}
 5938 
 5939 // Doubles vector mul
 5940 instruct vmulD(vec dst, vec src) %{
 5941   predicate(UseAVX == 0);
 5942   match(Set dst (MulVD dst src));
 5943   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5944   ins_encode %{
 5945     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5946   %}
 5947   ins_pipe( pipe_slow );
 5948 %}
 5949 
 5950 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5951   predicate(UseAVX > 0);
 5952   match(Set dst (MulVD src1 src2));
 5953   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5954   ins_encode %{
 5955     int vlen_enc = vector_length_encoding(this);
 5956     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5957   %}
 5958   ins_pipe( pipe_slow );
 5959 %}
 5960 
 5961 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5962   predicate((UseAVX > 0) &&
 5963             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5964   match(Set dst (MulVD src (LoadVector mem)));
 5965   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5966   ins_encode %{
 5967     int vlen_enc = vector_length_encoding(this);
 5968     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5969   %}
 5970   ins_pipe( pipe_slow );
 5971 %}
 5972 
 5973 // --------------------------------- DIV --------------------------------------
 5974 
 5975 // Floats vector div
 5976 instruct vdivF(vec dst, vec src) %{
 5977   predicate(UseAVX == 0);
 5978   match(Set dst (DivVF dst src));
 5979   format %{ "divps   $dst,$src\t! div packedF" %}
 5980   ins_encode %{
 5981     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5982   %}
 5983   ins_pipe( pipe_slow );
 5984 %}
 5985 
 5986 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5987   predicate(UseAVX > 0);
 5988   match(Set dst (DivVF src1 src2));
 5989   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5990   ins_encode %{
 5991     int vlen_enc = vector_length_encoding(this);
 5992     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5993   %}
 5994   ins_pipe( pipe_slow );
 5995 %}
 5996 
 5997 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 5998   predicate((UseAVX > 0) &&
 5999             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6000   match(Set dst (DivVF src (LoadVector mem)));
 6001   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6002   ins_encode %{
 6003     int vlen_enc = vector_length_encoding(this);
 6004     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6005   %}
 6006   ins_pipe( pipe_slow );
 6007 %}
 6008 
 6009 // Doubles vector div
 6010 instruct vdivD(vec dst, vec src) %{
 6011   predicate(UseAVX == 0);
 6012   match(Set dst (DivVD dst src));
 6013   format %{ "divpd   $dst,$src\t! div packedD" %}
 6014   ins_encode %{
 6015     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6016   %}
 6017   ins_pipe( pipe_slow );
 6018 %}
 6019 
 6020 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6021   predicate(UseAVX > 0);
 6022   match(Set dst (DivVD src1 src2));
 6023   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6024   ins_encode %{
 6025     int vlen_enc = vector_length_encoding(this);
 6026     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6027   %}
 6028   ins_pipe( pipe_slow );
 6029 %}
 6030 
 6031 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6032   predicate((UseAVX > 0) &&
 6033             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6034   match(Set dst (DivVD src (LoadVector mem)));
 6035   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6036   ins_encode %{
 6037     int vlen_enc = vector_length_encoding(this);
 6038     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6039   %}
 6040   ins_pipe( pipe_slow );
 6041 %}
 6042 
 6043 // ------------------------------ MinMax ---------------------------------------
 6044 
 6045 // Byte, Short, Int vector Min/Max
 6046 instruct minmax_reg_sse(vec dst, vec src) %{
 6047   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6048             UseAVX == 0);
 6049   match(Set dst (MinV dst src));
 6050   match(Set dst (MaxV dst src));
 6051   format %{ "vector_minmax  $dst,$src\t!  " %}
 6052   ins_encode %{
 6053     assert(UseSSE >= 4, "required");
 6054 
 6055     int opcode = this->ideal_Opcode();
 6056     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6057     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6058   %}
 6059   ins_pipe( pipe_slow );
 6060 %}
 6061 
 6062 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6063   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6064             UseAVX > 0);
 6065   match(Set dst (MinV src1 src2));
 6066   match(Set dst (MaxV src1 src2));
 6067   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6068   ins_encode %{
 6069     int opcode = this->ideal_Opcode();
 6070     int vlen_enc = vector_length_encoding(this);
 6071     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6072 
 6073     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6074   %}
 6075   ins_pipe( pipe_slow );
 6076 %}
 6077 
 6078 // Long vector Min/Max
 6079 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6080   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6081             UseAVX == 0);
 6082   match(Set dst (MinV dst src));
 6083   match(Set dst (MaxV src dst));
 6084   effect(TEMP dst, TEMP tmp);
 6085   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6086   ins_encode %{
 6087     assert(UseSSE >= 4, "required");
 6088 
 6089     int opcode = this->ideal_Opcode();
 6090     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6091     assert(elem_bt == T_LONG, "sanity");
 6092 
 6093     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6094   %}
 6095   ins_pipe( pipe_slow );
 6096 %}
 6097 
 6098 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6099   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6100             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6101   match(Set dst (MinV src1 src2));
 6102   match(Set dst (MaxV src1 src2));
 6103   effect(TEMP dst);
 6104   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6105   ins_encode %{
 6106     int vlen_enc = vector_length_encoding(this);
 6107     int opcode = this->ideal_Opcode();
 6108     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6109     assert(elem_bt == T_LONG, "sanity");
 6110 
 6111     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6112   %}
 6113   ins_pipe( pipe_slow );
 6114 %}
 6115 
 6116 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6117   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6118             Matcher::vector_element_basic_type(n) == T_LONG);
 6119   match(Set dst (MinV src1 src2));
 6120   match(Set dst (MaxV src1 src2));
 6121   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6122   ins_encode %{
 6123     assert(UseAVX > 2, "required");
 6124 
 6125     int vlen_enc = vector_length_encoding(this);
 6126     int opcode = this->ideal_Opcode();
 6127     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6128     assert(elem_bt == T_LONG, "sanity");
 6129 
 6130     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6131   %}
 6132   ins_pipe( pipe_slow );
 6133 %}
 6134 
 6135 // Float/Double vector Min/Max
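// Java Math.min/max corner cases differ from x86 vminps/vmaxps, which simply
// return the second source operand when either input is NaN or when both
// inputs are zero; the helpers below therefore add compares and blends so
// that NaN is propagated and -0.0 compares smaller than +0.0.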
 6136 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6137   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6138             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6139             UseAVX > 0);
 6140   match(Set dst (MinV a b));
 6141   match(Set dst (MaxV a b));
 6142   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6143   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6144   ins_encode %{
 6145     assert(UseAVX > 0, "required");
 6146 
 6147     int opcode = this->ideal_Opcode();
 6148     int vlen_enc = vector_length_encoding(this);
 6149     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6150 
 6151     __ vminmax_fp(opcode, elem_bt,
 6152                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6153                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6154   %}
 6155   ins_pipe( pipe_slow );
 6156 %}
 6157 
 6158 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6159   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6160             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6161   match(Set dst (MinV a b));
 6162   match(Set dst (MaxV a b));
 6163   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6164   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6165   ins_encode %{
 6166     assert(UseAVX > 2, "required");
 6167 
 6168     int opcode = this->ideal_Opcode();
 6169     int vlen_enc = vector_length_encoding(this);
 6170     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6171 
 6172     __ evminmax_fp(opcode, elem_bt,
 6173                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6174                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6175   %}
 6176   ins_pipe( pipe_slow );
 6177 %}
 6178 
 6179 // --------------------------------- Signum/CopySign ---------------------------
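// Math.signum(x) is 1.0 for x > 0, -1.0 for x < 0, and x itself for +/-0.0
// and NaN; the zero and one operands supply the 0.0 and 1.0 constants used
// by the computation.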
 6180 
 6181 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6182   match(Set dst (SignumF dst (Binary zero one)));
 6183   effect(KILL cr);
 6184   format %{ "signumF $dst, $dst" %}
 6185   ins_encode %{
 6186     int opcode = this->ideal_Opcode();
 6187     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6188   %}
 6189   ins_pipe( pipe_slow );
 6190 %}
 6191 
 6192 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6193   match(Set dst (SignumD dst (Binary zero one)));
 6194   effect(KILL cr);
 6195   format %{ "signumD $dst, $dst" %}
 6196   ins_encode %{
 6197     int opcode = this->ideal_Opcode();
 6198     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6199   %}
 6200   ins_pipe( pipe_slow );
 6201 %}
 6202 
 6203 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6204   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6205   match(Set dst (SignumVF src (Binary zero one)));
 6206   match(Set dst (SignumVD src (Binary zero one)));
 6207   effect(TEMP dst, TEMP xtmp1);
 6208   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6209   ins_encode %{
 6210     int opcode = this->ideal_Opcode();
 6211     int vec_enc = vector_length_encoding(this);
 6212     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6213                          $xtmp1$$XMMRegister, vec_enc);
 6214   %}
 6215   ins_pipe( pipe_slow );
 6216 %}
 6217 
 6218 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6219   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6220   match(Set dst (SignumVF src (Binary zero one)));
 6221   match(Set dst (SignumVD src (Binary zero one)));
 6222   effect(TEMP dst, TEMP ktmp1);
 6223   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6224   ins_encode %{
 6225     int opcode = this->ideal_Opcode();
 6226     int vec_enc = vector_length_encoding(this);
 6227     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6228                           $ktmp1$$KRegister, vec_enc);
 6229   %}
 6230   ins_pipe( pipe_slow );
 6231 %}
 6232 
 6233 // ---------------------------------------
// For copySign use 0xE4 as the truth-table immediate for vpternlog
 6235 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
// C (xmm2) is set to 0x7FFFFFFF (all bits except the sign bit)
// Wherever xmm2 is 0 (the sign bit), we want to pick from B (the sign operand)
// Wherever xmm2 is 1 (magnitude bits), we want to pick from A (the magnitude operand)
 6239 //
 6240 // A B C Result
 6241 // 0 0 0 0
 6242 // 0 0 1 0
 6243 // 0 1 0 1
 6244 // 0 1 1 0
 6245 // 1 0 0 0
 6246 // 1 0 1 1
 6247 // 1 1 0 1
 6248 // 1 1 1 1
 6249 //
// Result going from high bit to low bit is 0b11100100 = 0xE4
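//
// In scalar terms 0xE4 computes "C ? A : B" bit-by-bit; for CopySignF below
// that is dst = (dst & 0x7FFFFFFF) | (src & 0x80000000), i.e. magnitude bits
// from dst and the sign bit from src.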
 6251 // ---------------------------------------
 6252 
 6253 #ifdef _LP64
 6254 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6255   match(Set dst (CopySignF dst src));
 6256   effect(TEMP tmp1, TEMP tmp2);
 6257   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6258   ins_encode %{
 6259     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6260     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6261     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6262   %}
 6263   ins_pipe( pipe_slow );
 6264 %}
 6265 
 6266 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6267   match(Set dst (CopySignD dst (Binary src zero)));
 6268   ins_cost(100);
 6269   effect(TEMP tmp1, TEMP tmp2);
 6270   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6271   ins_encode %{
 6272     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6273     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6274     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6275   %}
 6276   ins_pipe( pipe_slow );
 6277 %}
 6278 
 6279 #endif // _LP64
 6280 
 6281 //----------------------------- CompressBits/ExpandBits ------------------------
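//
// CompressBits maps to BMI2 pext (gather the src bits selected by mask into
// the low-order bits of dst) and ExpandBits maps to pdep (scatter the
// low-order src bits to the positions selected by mask). As an 8-bit
// illustration: pext(src=0b10110010, mask=0b11001100) = 0b1000 and
// pdep(src=0b1011, mask=0b11001100) = 0b10001100.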
 6282 
 6283 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6284   predicate(n->bottom_type()->isa_int());
 6285   match(Set dst (CompressBits src mask));
 6286   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6287   ins_encode %{
 6288     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6289   %}
 6290   ins_pipe( pipe_slow );
 6291 %}
 6292 
 6293 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6294   predicate(n->bottom_type()->isa_int());
 6295   match(Set dst (ExpandBits src mask));
 6296   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6297   ins_encode %{
 6298     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6299   %}
 6300   ins_pipe( pipe_slow );
 6301 %}
 6302 
 6303 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6304   predicate(n->bottom_type()->isa_int());
 6305   match(Set dst (CompressBits src (LoadI mask)));
 6306   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6307   ins_encode %{
 6308     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6309   %}
 6310   ins_pipe( pipe_slow );
 6311 %}
 6312 
 6313 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6314   predicate(n->bottom_type()->isa_int());
 6315   match(Set dst (ExpandBits src (LoadI mask)));
 6316   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6317   ins_encode %{
 6318     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6319   %}
 6320   ins_pipe( pipe_slow );
 6321 %}
 6322 
 6323 // --------------------------------- Sqrt --------------------------------------
 6324 
 6325 instruct vsqrtF_reg(vec dst, vec src) %{
 6326   match(Set dst (SqrtVF src));
 6327   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6328   ins_encode %{
 6329     assert(UseAVX > 0, "required");
 6330     int vlen_enc = vector_length_encoding(this);
 6331     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6332   %}
 6333   ins_pipe( pipe_slow );
 6334 %}
 6335 
 6336 instruct vsqrtF_mem(vec dst, memory mem) %{
 6337   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6338   match(Set dst (SqrtVF (LoadVector mem)));
 6339   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6340   ins_encode %{
 6341     assert(UseAVX > 0, "required");
 6342     int vlen_enc = vector_length_encoding(this);
 6343     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6344   %}
 6345   ins_pipe( pipe_slow );
 6346 %}
 6347 
// Doubles vector sqrt
 6349 instruct vsqrtD_reg(vec dst, vec src) %{
 6350   match(Set dst (SqrtVD src));
 6351   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6352   ins_encode %{
 6353     assert(UseAVX > 0, "required");
 6354     int vlen_enc = vector_length_encoding(this);
 6355     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6356   %}
 6357   ins_pipe( pipe_slow );
 6358 %}
 6359 
 6360 instruct vsqrtD_mem(vec dst, memory mem) %{
 6361   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6362   match(Set dst (SqrtVD (LoadVector mem)));
 6363   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6364   ins_encode %{
 6365     assert(UseAVX > 0, "required");
 6366     int vlen_enc = vector_length_encoding(this);
 6367     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6368   %}
 6369   ins_pipe( pipe_slow );
 6370 %}
 6371 
 6372 // ------------------------------ Shift ---------------------------------------
 6373 
 6374 // Left and right shift count vectors are the same on x86
 6375 // (only lowest bits of xmm reg are used for count).
 6376 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6377   match(Set dst (LShiftCntV cnt));
 6378   match(Set dst (RShiftCntV cnt));
 6379   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6380   ins_encode %{
 6381     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6382   %}
 6383   ins_pipe( pipe_slow );
 6384 %}
 6385 
 6386 // Byte vector shift
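// There is no byte-granularity shift on x86, so bytes are sign/zero extended
// to 16-bit words (vextendbw), shifted as words, masked back to the low byte
// of each word, and re-packed with packuswb.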
 6387 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6388   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6389   match(Set dst ( LShiftVB src shift));
 6390   match(Set dst ( RShiftVB src shift));
 6391   match(Set dst (URShiftVB src shift));
 6392   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6393   format %{"vector_byte_shift $dst,$src,$shift" %}
 6394   ins_encode %{
 6395     assert(UseSSE > 3, "required");
 6396     int opcode = this->ideal_Opcode();
 6397     bool sign = (opcode != Op_URShiftVB);
 6398     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6399     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6400     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6401     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6402     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6403   %}
 6404   ins_pipe( pipe_slow );
 6405 %}
 6406 
 6407 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6408   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6409             UseAVX <= 1);
 6410   match(Set dst ( LShiftVB src shift));
 6411   match(Set dst ( RShiftVB src shift));
 6412   match(Set dst (URShiftVB src shift));
 6413   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6414   format %{"vector_byte_shift $dst,$src,$shift" %}
 6415   ins_encode %{
 6416     assert(UseSSE > 3, "required");
 6417     int opcode = this->ideal_Opcode();
 6418     bool sign = (opcode != Op_URShiftVB);
 6419     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6420     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6421     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6422     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6423     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6424     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6425     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6426     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6427     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6428   %}
 6429   ins_pipe( pipe_slow );
 6430 %}
 6431 
 6432 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6433   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6434             UseAVX > 1);
 6435   match(Set dst ( LShiftVB src shift));
 6436   match(Set dst ( RShiftVB src shift));
 6437   match(Set dst (URShiftVB src shift));
 6438   effect(TEMP dst, TEMP tmp);
  6439   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6440   ins_encode %{
 6441     int opcode = this->ideal_Opcode();
 6442     bool sign = (opcode != Op_URShiftVB);
 6443     int vlen_enc = Assembler::AVX_256bit;
 6444     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6445     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6446     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6447     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6448     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6449   %}
 6450   ins_pipe( pipe_slow );
 6451 %}
 6452 
 6453 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6454   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6455   match(Set dst ( LShiftVB src shift));
 6456   match(Set dst ( RShiftVB src shift));
 6457   match(Set dst (URShiftVB src shift));
 6458   effect(TEMP dst, TEMP tmp);
  6459   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6460   ins_encode %{
 6461     assert(UseAVX > 1, "required");
 6462     int opcode = this->ideal_Opcode();
 6463     bool sign = (opcode != Op_URShiftVB);
 6464     int vlen_enc = Assembler::AVX_256bit;
 6465     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6466     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6467     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6468     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6469     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6470     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6471     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6472     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6473     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6474   %}
 6475   ins_pipe( pipe_slow );
 6476 %}
 6477 
 6478 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6479   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6480   match(Set dst ( LShiftVB src shift));
 6481   match(Set dst  (RShiftVB src shift));
 6482   match(Set dst (URShiftVB src shift));
 6483   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
  6484   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6485   ins_encode %{
 6486     assert(UseAVX > 2, "required");
 6487     int opcode = this->ideal_Opcode();
 6488     bool sign = (opcode != Op_URShiftVB);
 6489     int vlen_enc = Assembler::AVX_512bit;
 6490     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6491     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6492     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6493     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6494     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6495     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6496     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6497     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6498     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6499     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6500     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6501     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6502   %}
 6503   ins_pipe( pipe_slow );
 6504 %}
 6505 
  6506 // Shorts vector logical right shift produces an incorrect Java result
  6507 // for negative data because Java code converts a short value into an int with
  6508 // sign extension before the shift. But char vectors are fine since chars are
  6509 // unsigned values.
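       // For example, with s == (short)0xFFFE (-2) and a shift count of 1:
       //   Java:                 (short)(((int)s) >>> 1) == (short)0x7FFFFFFF == -1
       //   packed 16-bit shift:  0xFFFE >>> 1 == 0x7FFF == 32767
       // so a packed 16-bit logical right shift does not reproduce the Java short
       // result, while for (zero-extended) chars both computations agree.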
  6510 // Shorts/Chars vector shift
 6511 instruct vshiftS(vec dst, vec src, vec shift) %{
 6512   predicate(!n->as_ShiftV()->is_var_shift());
 6513   match(Set dst ( LShiftVS src shift));
 6514   match(Set dst ( RShiftVS src shift));
 6515   match(Set dst (URShiftVS src shift));
 6516   effect(TEMP dst, USE src, USE shift);
 6517   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6518   ins_encode %{
 6519     int opcode = this->ideal_Opcode();
 6520     if (UseAVX > 0) {
 6521       int vlen_enc = vector_length_encoding(this);
 6522       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6523     } else {
 6524       int vlen = Matcher::vector_length(this);
 6525       if (vlen == 2) {
 6526         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6527         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6528       } else if (vlen == 4) {
 6529         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6530         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6531       } else {
 6532         assert (vlen == 8, "sanity");
 6533         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6534         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6535       }
 6536     }
 6537   %}
 6538   ins_pipe( pipe_slow );
 6539 %}
 6540 
  6541 // Integers vector shift
 6542 instruct vshiftI(vec dst, vec src, vec shift) %{
 6543   predicate(!n->as_ShiftV()->is_var_shift());
 6544   match(Set dst ( LShiftVI src shift));
 6545   match(Set dst ( RShiftVI src shift));
 6546   match(Set dst (URShiftVI src shift));
 6547   effect(TEMP dst, USE src, USE shift);
 6548   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6549   ins_encode %{
 6550     int opcode = this->ideal_Opcode();
 6551     if (UseAVX > 0) {
 6552       int vlen_enc = vector_length_encoding(this);
 6553       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6554     } else {
 6555       int vlen = Matcher::vector_length(this);
 6556       if (vlen == 2) {
 6557         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6558         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6559       } else {
 6560         assert(vlen == 4, "sanity");
 6561         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6562         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6563       }
 6564     }
 6565   %}
 6566   ins_pipe( pipe_slow );
 6567 %}
 6568 
  6569 // Integers vector constant shift
 6570 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6571   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6572   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6573   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6574   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6575   ins_encode %{
 6576     int opcode = this->ideal_Opcode();
 6577     if (UseAVX > 0) {
 6578       int vector_len = vector_length_encoding(this);
 6579       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6580     } else {
 6581       int vlen = Matcher::vector_length(this);
 6582       if (vlen == 2) {
 6583         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6584         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6585       } else {
 6586         assert(vlen == 4, "sanity");
 6587         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6588         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6589       }
 6590     }
 6591   %}
 6592   ins_pipe( pipe_slow );
 6593 %}
 6594 
 6595 // Longs vector shift
 6596 instruct vshiftL(vec dst, vec src, vec shift) %{
 6597   predicate(!n->as_ShiftV()->is_var_shift());
 6598   match(Set dst ( LShiftVL src shift));
 6599   match(Set dst (URShiftVL src shift));
 6600   effect(TEMP dst, USE src, USE shift);
 6601   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6602   ins_encode %{
 6603     int opcode = this->ideal_Opcode();
 6604     if (UseAVX > 0) {
 6605       int vlen_enc = vector_length_encoding(this);
 6606       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6607     } else {
 6608       assert(Matcher::vector_length(this) == 2, "");
 6609       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6610       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6611     }
 6612   %}
 6613   ins_pipe( pipe_slow );
 6614 %}
 6615 
 6616 // Longs vector constant shift
 6617 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6618   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6619   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6620   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6621   ins_encode %{
 6622     int opcode = this->ideal_Opcode();
 6623     if (UseAVX > 0) {
 6624       int vector_len = vector_length_encoding(this);
 6625       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6626     } else {
 6627       assert(Matcher::vector_length(this) == 2, "");
 6628       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6629       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6630     }
 6631   %}
 6632   ins_pipe( pipe_slow );
 6633 %}
 6634 
  6635 // ------------------- ArithmeticRightShift -----------------------------------
 6636 // Long vector arithmetic right shift
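       // A packed 64-bit arithmetic right shift only exists on AVX-512 (evpsraq, see the
       // rule below). Here it is emulated from the logical shift using the identity
       //   sra(x, n) == ((x >>> n) ^ m) - m,  where m == (0x8000000000000000 >>> n):
       // shift logically, XOR in the shifted sign-bit mask and subtract it again to
       // propagate the sign into the vacated bits.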
 6637 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6638   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6639   match(Set dst (RShiftVL src shift));
 6640   effect(TEMP dst, TEMP tmp);
 6641   format %{ "vshiftq $dst,$src,$shift" %}
 6642   ins_encode %{
 6643     uint vlen = Matcher::vector_length(this);
 6644     if (vlen == 2) {
 6645       assert(UseSSE >= 2, "required");
 6646       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6647       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6648       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6649       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6650       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6651       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6652     } else {
 6653       assert(vlen == 4, "sanity");
 6654       assert(UseAVX > 1, "required");
 6655       int vlen_enc = Assembler::AVX_256bit;
 6656       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6657       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6658       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6659       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6660       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6661     }
 6662   %}
 6663   ins_pipe( pipe_slow );
 6664 %}
 6665 
 6666 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6667   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6668   match(Set dst (RShiftVL src shift));
 6669   format %{ "vshiftq $dst,$src,$shift" %}
 6670   ins_encode %{
 6671     int vlen_enc = vector_length_encoding(this);
 6672     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6673   %}
 6674   ins_pipe( pipe_slow );
 6675 %}
 6676 
 6677 // ------------------- Variable Shift -----------------------------
 6678 // Byte variable shift
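       // There are no variable (per-element count) byte shift instructions, so the
       // bytes are widened, shifted with per-element word/dword shifts (via
       // varshiftbw()), and then narrowed back to bytes with pack instructions.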
 6679 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6680   predicate(Matcher::vector_length(n) <= 8 &&
 6681             n->as_ShiftV()->is_var_shift() &&
 6682             !VM_Version::supports_avx512bw());
 6683   match(Set dst ( LShiftVB src shift));
 6684   match(Set dst ( RShiftVB src shift));
 6685   match(Set dst (URShiftVB src shift));
 6686   effect(TEMP dst, TEMP vtmp);
 6687   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6688   ins_encode %{
 6689     assert(UseAVX >= 2, "required");
 6690 
 6691     int opcode = this->ideal_Opcode();
 6692     int vlen_enc = Assembler::AVX_128bit;
 6693     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6694     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6695   %}
 6696   ins_pipe( pipe_slow );
 6697 %}
 6698 
 6699 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6700   predicate(Matcher::vector_length(n) == 16 &&
 6701             n->as_ShiftV()->is_var_shift() &&
 6702             !VM_Version::supports_avx512bw());
 6703   match(Set dst ( LShiftVB src shift));
 6704   match(Set dst ( RShiftVB src shift));
 6705   match(Set dst (URShiftVB src shift));
 6706   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6707   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6708   ins_encode %{
 6709     assert(UseAVX >= 2, "required");
 6710 
 6711     int opcode = this->ideal_Opcode();
 6712     int vlen_enc = Assembler::AVX_128bit;
 6713     // Shift lower half and get word result in dst
 6714     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6715 
 6716     // Shift upper half and get word result in vtmp1
 6717     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6718     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6719     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6720 
 6721     // Merge and down convert the two word results to byte in dst
 6722     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6723   %}
 6724   ins_pipe( pipe_slow );
 6725 %}
 6726 
 6727 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6728   predicate(Matcher::vector_length(n) == 32 &&
 6729             n->as_ShiftV()->is_var_shift() &&
 6730             !VM_Version::supports_avx512bw());
 6731   match(Set dst ( LShiftVB src shift));
 6732   match(Set dst ( RShiftVB src shift));
 6733   match(Set dst (URShiftVB src shift));
 6734   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  6735   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6736   ins_encode %{
 6737     assert(UseAVX >= 2, "required");
 6738 
 6739     int opcode = this->ideal_Opcode();
 6740     int vlen_enc = Assembler::AVX_128bit;
 6741     // Process lower 128 bits and get result in dst
 6742     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6743     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6744     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6745     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6746     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6747 
 6748     // Process higher 128 bits and get result in vtmp3
 6749     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6750     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6751     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6752     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6753     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6754     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6755     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6756 
 6757     // Merge the two results in dst
 6758     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6759   %}
 6760   ins_pipe( pipe_slow );
 6761 %}
 6762 
 6763 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6764   predicate(Matcher::vector_length(n) <= 32 &&
 6765             n->as_ShiftV()->is_var_shift() &&
 6766             VM_Version::supports_avx512bw());
 6767   match(Set dst ( LShiftVB src shift));
 6768   match(Set dst ( RShiftVB src shift));
 6769   match(Set dst (URShiftVB src shift));
 6770   effect(TEMP dst, TEMP vtmp);
 6771   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6772   ins_encode %{
 6773     assert(UseAVX > 2, "required");
 6774 
 6775     int opcode = this->ideal_Opcode();
 6776     int vlen_enc = vector_length_encoding(this);
 6777     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6778   %}
 6779   ins_pipe( pipe_slow );
 6780 %}
 6781 
 6782 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6783   predicate(Matcher::vector_length(n) == 64 &&
 6784             n->as_ShiftV()->is_var_shift() &&
 6785             VM_Version::supports_avx512bw());
 6786   match(Set dst ( LShiftVB src shift));
 6787   match(Set dst ( RShiftVB src shift));
 6788   match(Set dst (URShiftVB src shift));
 6789   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6790   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6791   ins_encode %{
 6792     assert(UseAVX > 2, "required");
 6793 
 6794     int opcode = this->ideal_Opcode();
 6795     int vlen_enc = Assembler::AVX_256bit;
 6796     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6797     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6798     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6799     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6800     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6801   %}
 6802   ins_pipe( pipe_slow );
 6803 %}
 6804 
 6805 // Short variable shift
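       // Variable (per-element count) 16-bit shifts (vpsllvw/vpsravw/vpsrlvw) require
       // AVX-512BW. Without it the shorts are widened to ints, shifted with the
       // per-element dword shifts (vpsllvd/vpsravd/vpsrlvd via varshiftd()), masked
       // back to 16 bits and narrowed with vpackusdw.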
 6806 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6807   predicate(Matcher::vector_length(n) <= 8 &&
 6808             n->as_ShiftV()->is_var_shift() &&
 6809             !VM_Version::supports_avx512bw());
 6810   match(Set dst ( LShiftVS src shift));
 6811   match(Set dst ( RShiftVS src shift));
 6812   match(Set dst (URShiftVS src shift));
 6813   effect(TEMP dst, TEMP vtmp);
  6814   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6815   ins_encode %{
 6816     assert(UseAVX >= 2, "required");
 6817 
 6818     int opcode = this->ideal_Opcode();
 6819     bool sign = (opcode != Op_URShiftVS);
 6820     int vlen_enc = Assembler::AVX_256bit;
  6821     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  6822     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6823     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6824     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6825     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6826     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6827   %}
 6828   ins_pipe( pipe_slow );
 6829 %}
 6830 
 6831 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6832   predicate(Matcher::vector_length(n) == 16 &&
 6833             n->as_ShiftV()->is_var_shift() &&
 6834             !VM_Version::supports_avx512bw());
 6835   match(Set dst ( LShiftVS src shift));
 6836   match(Set dst ( RShiftVS src shift));
 6837   match(Set dst (URShiftVS src shift));
 6838   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  6839   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6840   ins_encode %{
 6841     assert(UseAVX >= 2, "required");
 6842 
 6843     int opcode = this->ideal_Opcode();
 6844     bool sign = (opcode != Op_URShiftVS);
 6845     int vlen_enc = Assembler::AVX_256bit;
 6846     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6847     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6848     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6849     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6850     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6851 
 6852     // Shift upper half, with result in dst using vtmp1 as TEMP
 6853     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6854     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6855     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6856     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6857     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6858     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6859 
 6860     // Merge lower and upper half result into dst
 6861     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6862     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6863   %}
 6864   ins_pipe( pipe_slow );
 6865 %}
 6866 
 6867 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6868   predicate(n->as_ShiftV()->is_var_shift() &&
 6869             VM_Version::supports_avx512bw());
 6870   match(Set dst ( LShiftVS src shift));
 6871   match(Set dst ( RShiftVS src shift));
 6872   match(Set dst (URShiftVS src shift));
 6873   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6874   ins_encode %{
 6875     assert(UseAVX > 2, "required");
 6876 
 6877     int opcode = this->ideal_Opcode();
 6878     int vlen_enc = vector_length_encoding(this);
 6879     if (!VM_Version::supports_avx512vl()) {
 6880       vlen_enc = Assembler::AVX_512bit;
 6881     }
 6882     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6883   %}
 6884   ins_pipe( pipe_slow );
 6885 %}
 6886 
  6887 // Integer variable shift
 6888 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6889   predicate(n->as_ShiftV()->is_var_shift());
 6890   match(Set dst ( LShiftVI src shift));
 6891   match(Set dst ( RShiftVI src shift));
 6892   match(Set dst (URShiftVI src shift));
 6893   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6894   ins_encode %{
 6895     assert(UseAVX >= 2, "required");
 6896 
 6897     int opcode = this->ideal_Opcode();
 6898     int vlen_enc = vector_length_encoding(this);
 6899     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6900   %}
 6901   ins_pipe( pipe_slow );
 6902 %}
 6903 
  6904 // Long variable shift
 6905 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6906   predicate(n->as_ShiftV()->is_var_shift());
 6907   match(Set dst ( LShiftVL src shift));
 6908   match(Set dst (URShiftVL src shift));
 6909   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6910   ins_encode %{
 6911     assert(UseAVX >= 2, "required");
 6912 
 6913     int opcode = this->ideal_Opcode();
 6914     int vlen_enc = vector_length_encoding(this);
 6915     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6916   %}
 6917   ins_pipe( pipe_slow );
 6918 %}
 6919 
  6920 // Long variable arithmetic right shift
 6921 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6922   predicate(Matcher::vector_length(n) <= 4 &&
 6923             n->as_ShiftV()->is_var_shift() &&
 6924             UseAVX == 2);
 6925   match(Set dst (RShiftVL src shift));
 6926   effect(TEMP dst, TEMP vtmp);
 6927   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6928   ins_encode %{
 6929     int opcode = this->ideal_Opcode();
 6930     int vlen_enc = vector_length_encoding(this);
 6931     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6932                  $vtmp$$XMMRegister);
 6933   %}
 6934   ins_pipe( pipe_slow );
 6935 %}
 6936 
 6937 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6938   predicate(n->as_ShiftV()->is_var_shift() &&
 6939             UseAVX > 2);
 6940   match(Set dst (RShiftVL src shift));
  6941   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6942   ins_encode %{
 6943     int opcode = this->ideal_Opcode();
 6944     int vlen_enc = vector_length_encoding(this);
 6945     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6946   %}
 6947   ins_pipe( pipe_slow );
 6948 %}
 6949 
 6950 // --------------------------------- AND --------------------------------------
 6951 
 6952 instruct vand(vec dst, vec src) %{
 6953   predicate(UseAVX == 0);
 6954   match(Set dst (AndV dst src));
 6955   format %{ "pand    $dst,$src\t! and vectors" %}
 6956   ins_encode %{
 6957     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6958   %}
 6959   ins_pipe( pipe_slow );
 6960 %}
 6961 
 6962 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6963   predicate(UseAVX > 0);
 6964   match(Set dst (AndV src1 src2));
 6965   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6966   ins_encode %{
 6967     int vlen_enc = vector_length_encoding(this);
 6968     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6969   %}
 6970   ins_pipe( pipe_slow );
 6971 %}
 6972 
 6973 instruct vand_mem(vec dst, vec src, memory mem) %{
 6974   predicate((UseAVX > 0) &&
 6975             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6976   match(Set dst (AndV src (LoadVector mem)));
 6977   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6978   ins_encode %{
 6979     int vlen_enc = vector_length_encoding(this);
 6980     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6981   %}
 6982   ins_pipe( pipe_slow );
 6983 %}
 6984 
 6985 // --------------------------------- OR ---------------------------------------
 6986 
 6987 instruct vor(vec dst, vec src) %{
 6988   predicate(UseAVX == 0);
 6989   match(Set dst (OrV dst src));
 6990   format %{ "por     $dst,$src\t! or vectors" %}
 6991   ins_encode %{
 6992     __ por($dst$$XMMRegister, $src$$XMMRegister);
 6993   %}
 6994   ins_pipe( pipe_slow );
 6995 %}
 6996 
 6997 instruct vor_reg(vec dst, vec src1, vec src2) %{
 6998   predicate(UseAVX > 0);
 6999   match(Set dst (OrV src1 src2));
 7000   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7001   ins_encode %{
 7002     int vlen_enc = vector_length_encoding(this);
 7003     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7004   %}
 7005   ins_pipe( pipe_slow );
 7006 %}
 7007 
 7008 instruct vor_mem(vec dst, vec src, memory mem) %{
 7009   predicate((UseAVX > 0) &&
 7010             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7011   match(Set dst (OrV src (LoadVector mem)));
 7012   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7013   ins_encode %{
 7014     int vlen_enc = vector_length_encoding(this);
 7015     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7016   %}
 7017   ins_pipe( pipe_slow );
 7018 %}
 7019 
 7020 // --------------------------------- XOR --------------------------------------
 7021 
 7022 instruct vxor(vec dst, vec src) %{
 7023   predicate(UseAVX == 0);
 7024   match(Set dst (XorV dst src));
 7025   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7026   ins_encode %{
 7027     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7028   %}
 7029   ins_pipe( pipe_slow );
 7030 %}
 7031 
 7032 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7033   predicate(UseAVX > 0);
 7034   match(Set dst (XorV src1 src2));
 7035   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7036   ins_encode %{
 7037     int vlen_enc = vector_length_encoding(this);
 7038     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7039   %}
 7040   ins_pipe( pipe_slow );
 7041 %}
 7042 
 7043 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7044   predicate((UseAVX > 0) &&
 7045             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7046   match(Set dst (XorV src (LoadVector mem)));
 7047   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7048   ins_encode %{
 7049     int vlen_enc = vector_length_encoding(this);
 7050     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7051   %}
 7052   ins_pipe( pipe_slow );
 7053 %}
 7054 
 7055 // --------------------------------- VectorCast --------------------------------------
 7056 
 7057 instruct vcastBtoX(vec dst, vec src) %{
 7058   match(Set dst (VectorCastB2X src));
 7059   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7060   ins_encode %{
 7061     assert(UseAVX > 0, "required");
 7062 
 7063     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7064     int vlen_enc = vector_length_encoding(this);
 7065     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7066   %}
 7067   ins_pipe( pipe_slow );
 7068 %}
 7069 
 7070 instruct castStoX(vec dst, vec src) %{
 7071   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7072             Matcher::vector_length(n->in(1)) <= 8 && // src
 7073             Matcher::vector_element_basic_type(n) == T_BYTE);
 7074   match(Set dst (VectorCastS2X src));
 7075   format %{ "vector_cast_s2x $dst,$src" %}
 7076   ins_encode %{
 7077     assert(UseAVX > 0, "required");
 7078 
 7079     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7080     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7081   %}
 7082   ins_pipe( pipe_slow );
 7083 %}
 7084 
 7085 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7086   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7087             Matcher::vector_length(n->in(1)) == 16 && // src
 7088             Matcher::vector_element_basic_type(n) == T_BYTE);
 7089   effect(TEMP dst, TEMP vtmp);
 7090   match(Set dst (VectorCastS2X src));
 7091   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7092   ins_encode %{
 7093     assert(UseAVX > 0, "required");
 7094 
 7095     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7096     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7097     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7098     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7099   %}
 7100   ins_pipe( pipe_slow );
 7101 %}
 7102 
 7103 instruct vcastStoX_evex(vec dst, vec src) %{
 7104   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7105             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7106   match(Set dst (VectorCastS2X src));
 7107   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7108   ins_encode %{
 7109     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7110     int src_vlen_enc = vector_length_encoding(this, $src);
 7111     int vlen_enc = vector_length_encoding(this);
 7112     switch (to_elem_bt) {
 7113       case T_BYTE:
 7114         if (!VM_Version::supports_avx512vl()) {
 7115           vlen_enc = Assembler::AVX_512bit;
 7116         }
 7117         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7118         break;
 7119       case T_INT:
 7120         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7121         break;
 7122       case T_FLOAT:
 7123         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7124         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7125         break;
 7126       case T_LONG:
 7127         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7128         break;
 7129       case T_DOUBLE: {
 7130         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7131         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7132         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7133         break;
 7134       }
 7135       default:
 7136         ShouldNotReachHere();
 7137     }
 7138   %}
 7139   ins_pipe( pipe_slow );
 7140 %}
 7141 
 7142 instruct castItoX(vec dst, vec src) %{
 7143   predicate(UseAVX <= 2 &&
 7144             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7145             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7146   match(Set dst (VectorCastI2X src));
 7147   format %{ "vector_cast_i2x $dst,$src" %}
 7148   ins_encode %{
 7149     assert(UseAVX > 0, "required");
 7150 
 7151     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7152     int vlen_enc = vector_length_encoding(this, $src);
 7153 
 7154     if (to_elem_bt == T_BYTE) {
 7155       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7156       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7157       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7158     } else {
 7159       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7160       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7161       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7162     }
 7163   %}
 7164   ins_pipe( pipe_slow );
 7165 %}
 7166 
 7167 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7168   predicate(UseAVX <= 2 &&
 7169             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7170             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7171   match(Set dst (VectorCastI2X src));
 7172   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7173   effect(TEMP dst, TEMP vtmp);
 7174   ins_encode %{
 7175     assert(UseAVX > 0, "required");
 7176 
 7177     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7178     int vlen_enc = vector_length_encoding(this, $src);
 7179 
 7180     if (to_elem_bt == T_BYTE) {
 7181       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7182       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7183       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7184       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7185     } else {
 7186       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7187       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7188       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7189       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7190     }
 7191   %}
 7192   ins_pipe( pipe_slow );
 7193 %}
 7194 
 7195 instruct vcastItoX_evex(vec dst, vec src) %{
 7196   predicate(UseAVX > 2 ||
 7197             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7198   match(Set dst (VectorCastI2X src));
 7199   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7200   ins_encode %{
 7201     assert(UseAVX > 0, "required");
 7202 
 7203     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7204     int src_vlen_enc = vector_length_encoding(this, $src);
 7205     int dst_vlen_enc = vector_length_encoding(this);
 7206     switch (dst_elem_bt) {
 7207       case T_BYTE:
 7208         if (!VM_Version::supports_avx512vl()) {
 7209           src_vlen_enc = Assembler::AVX_512bit;
 7210         }
 7211         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7212         break;
 7213       case T_SHORT:
 7214         if (!VM_Version::supports_avx512vl()) {
 7215           src_vlen_enc = Assembler::AVX_512bit;
 7216         }
 7217         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7218         break;
 7219       case T_FLOAT:
 7220         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7221         break;
 7222       case T_LONG:
 7223         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7224         break;
 7225       case T_DOUBLE:
 7226         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7227         break;
 7228       default:
 7229         ShouldNotReachHere();
 7230     }
 7231   %}
 7232   ins_pipe( pipe_slow );
 7233 %}
 7234 
 7235 instruct vcastLtoBS(vec dst, vec src) %{
 7236   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7237             UseAVX <= 2);
 7238   match(Set dst (VectorCastL2X src));
 7239   format %{ "vector_cast_l2x  $dst,$src" %}
 7240   ins_encode %{
 7241     assert(UseAVX > 0, "required");
 7242 
 7243     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7244     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7245     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7246                                                       : ExternalAddress(vector_int_to_short_mask());
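           // Gather the low 32 bits of every long element into the low dwords of the
           // register (shuffling within/across lanes as needed), mask to the destination
           // element width, and narrow with the pack instructions.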
 7247     if (vlen <= 16) {
 7248       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7249       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7250       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7251     } else {
 7252       assert(vlen <= 32, "required");
 7253       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7254       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7255       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7256       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7257     }
 7258     if (to_elem_bt == T_BYTE) {
 7259       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7260     }
 7261   %}
 7262   ins_pipe( pipe_slow );
 7263 %}
 7264 
 7265 instruct vcastLtoX_evex(vec dst, vec src) %{
 7266   predicate(UseAVX > 2 ||
 7267             (Matcher::vector_element_basic_type(n) == T_INT ||
 7268              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7269              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7270   match(Set dst (VectorCastL2X src));
 7271   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7272   ins_encode %{
 7273     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7274     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7275     int vlen_enc = vector_length_encoding(this, $src);
 7276     switch (to_elem_bt) {
 7277       case T_BYTE:
 7278         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7279           vlen_enc = Assembler::AVX_512bit;
 7280         }
 7281         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7282         break;
 7283       case T_SHORT:
 7284         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7285           vlen_enc = Assembler::AVX_512bit;
 7286         }
 7287         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7288         break;
 7289       case T_INT:
 7290         if (vlen == 8) {
 7291           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7292             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7293           }
 7294         } else if (vlen == 16) {
 7295           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7296         } else if (vlen == 32) {
 7297           if (UseAVX > 2) {
 7298             if (!VM_Version::supports_avx512vl()) {
 7299               vlen_enc = Assembler::AVX_512bit;
 7300             }
 7301             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7302           } else {
 7303             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7304             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7305           }
 7306         } else { // vlen == 64
 7307           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7308         }
 7309         break;
 7310       case T_FLOAT:
 7311         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7312         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7313         break;
 7314       case T_DOUBLE:
 7315         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7316         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7317         break;
 7318 
 7319       default: assert(false, "%s", type2name(to_elem_bt));
 7320     }
 7321   %}
 7322   ins_pipe( pipe_slow );
 7323 %}
 7324 
 7325 instruct vcastFtoD_reg(vec dst, vec src) %{
 7326   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7327   match(Set dst (VectorCastF2X src));
 7328   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7329   ins_encode %{
 7330     int vlen_enc = vector_length_encoding(this);
 7331     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7332   %}
 7333   ins_pipe( pipe_slow );
 7334 %}
 7335 
 7337 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7338   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7339             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7340   match(Set dst (VectorCastF2X src));
 7341   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7342   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7343   ins_encode %{
 7344     int vlen_enc = vector_length_encoding(this, $src);
 7345     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
  7346     // JDK-8292878 removed the need for an explicit scratch register to load addresses wider than
  7347     // 32 bits in register-indirect addressing mode, since stub constants are part of the code
  7348     // cache and ReservedCodeCacheSize is currently capped at 2G.
  7349     // Targets are free to increase this limit, but a code cache larger than 2G looks
  7350     // unreasonable in practical scenarios. On the other hand, with the given cap we save a
  7351     // temporary register allocation, which in the limiting case can prevent spilling in
  7352     // blocks with high register pressure.
 7353     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7354                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7355                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7356   %}
 7357   ins_pipe( pipe_slow );
 7358 %}
 7359 
 7360 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7361   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7362             is_integral_type(Matcher::vector_element_basic_type(n)));
 7363   match(Set dst (VectorCastF2X src));
 7364   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7365   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7366   ins_encode %{
 7367     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7368     if (to_elem_bt == T_LONG) {
 7369       int vlen_enc = vector_length_encoding(this);
 7370       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7371                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7372                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7373     } else {
 7374       int vlen_enc = vector_length_encoding(this, $src);
 7375       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7376                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7377                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7378     }
 7379   %}
 7380   ins_pipe( pipe_slow );
 7381 %}
 7382 
 7383 instruct vcastDtoF_reg(vec dst, vec src) %{
 7384   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7385   match(Set dst (VectorCastD2X src));
 7386   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7387   ins_encode %{
 7388     int vlen_enc = vector_length_encoding(this, $src);
 7389     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7390   %}
 7391   ins_pipe( pipe_slow );
 7392 %}
 7393 
 7394 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7395   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7396             is_integral_type(Matcher::vector_element_basic_type(n)));
 7397   match(Set dst (VectorCastD2X src));
 7398   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7399   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7400   ins_encode %{
 7401     int vlen_enc = vector_length_encoding(this, $src);
 7402     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7403     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7404                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7405                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7406   %}
 7407   ins_pipe( pipe_slow );
 7408 %}
 7409 
 7410 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7411   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7412             is_integral_type(Matcher::vector_element_basic_type(n)));
 7413   match(Set dst (VectorCastD2X src));
 7414   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7415   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7416   ins_encode %{
 7417     int vlen_enc = vector_length_encoding(this, $src);
 7418     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7419     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7420                               ExternalAddress(vector_float_signflip());
 7421     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7422                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7423   %}
 7424   ins_pipe( pipe_slow );
 7425 %}
 7426 
 7427 instruct vucast(vec dst, vec src) %{
 7428   match(Set dst (VectorUCastB2X src));
 7429   match(Set dst (VectorUCastS2X src));
 7430   match(Set dst (VectorUCastI2X src));
 7431   format %{ "vector_ucast $dst,$src\t!" %}
 7432   ins_encode %{
 7433     assert(UseAVX > 0, "required");
 7434 
 7435     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7436     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7437     int vlen_enc = vector_length_encoding(this);
 7438     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7439   %}
 7440   ins_pipe( pipe_slow );
 7441 %}
 7442 
 7443 #ifdef _LP64
 7444 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7445   predicate(!VM_Version::supports_avx512vl() &&
 7446             Matcher::vector_length_in_bytes(n) < 64 &&
 7447             Matcher::vector_element_basic_type(n) == T_INT);
 7448   match(Set dst (RoundVF src));
 7449   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7450   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7451   ins_encode %{
 7452     int vlen_enc = vector_length_encoding(this);
 7453     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7454     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7455                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7456                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7457   %}
 7458   ins_pipe( pipe_slow );
 7459 %}
 7460 
 7461 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7462   predicate((VM_Version::supports_avx512vl() ||
 7463              Matcher::vector_length_in_bytes(n) == 64) &&
 7464              Matcher::vector_element_basic_type(n) == T_INT);
 7465   match(Set dst (RoundVF src));
 7466   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7467   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7468   ins_encode %{
 7469     int vlen_enc = vector_length_encoding(this);
 7470     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7471     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7472                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7473                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7474   %}
 7475   ins_pipe( pipe_slow );
 7476 %}
 7477 
 7478 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7479   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7480   match(Set dst (RoundVD src));
 7481   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7482   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7483   ins_encode %{
 7484     int vlen_enc = vector_length_encoding(this);
 7485     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7486     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7487                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7488                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7489   %}
 7490   ins_pipe( pipe_slow );
 7491 %}
 7492 
 7493 #endif // _LP64
 7494 
 7495 // --------------------------------- VectorMaskCmp --------------------------------------
 7496 
 7497 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7498   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7499             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7500             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7501             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7502   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7503   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7504   ins_encode %{
 7505     int vlen_enc = vector_length_encoding(this, $src1);
 7506     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7507     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7508       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7509     } else {
 7510       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7511     }
 7512   %}
 7513   ins_pipe( pipe_slow );
 7514 %}
 7515 
 7516 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7517   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7518             n->bottom_type()->isa_vectmask() == nullptr &&
 7519             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7520   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7521   effect(TEMP ktmp);
 7522   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7523   ins_encode %{
 7524     int vlen_enc = Assembler::AVX_512bit;
 7525     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7526     KRegister mask = k0; // The comparison itself is not being masked.
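           // The compare writes a mask register; materialize it as a vector by a masked
           // load of all-ones with zeroing-masking (merge == false): lanes whose mask bit
           // is set become all-ones, all other lanes become zero.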
 7527     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7528       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7529       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7530     } else {
 7531       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7532       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7533     }
 7534   %}
 7535   ins_pipe( pipe_slow );
 7536 %}
 7537 
 7538 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7539   predicate(n->bottom_type()->isa_vectmask() &&
 7540             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7541   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7542   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7543   ins_encode %{
 7544     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7545     int vlen_enc = vector_length_encoding(this, $src1);
 7546     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7547     KRegister mask = k0; // The comparison itself is not being masked.
 7548     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7549       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7550     } else {
 7551       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7552     }
 7553   %}
 7554   ins_pipe( pipe_slow );
 7555 %}
 7556 
 7557 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7558   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7559             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7560             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7561             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7562             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7563             (n->in(2)->get_int() == BoolTest::eq ||
 7564              n->in(2)->get_int() == BoolTest::lt ||
 7565              n->in(2)->get_int() == BoolTest::gt)); // cond
 7566   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7567   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7568   ins_encode %{
 7569     int vlen_enc = vector_length_encoding(this, $src1);
 7570     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7571     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7572     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7573   %}
 7574   ins_pipe( pipe_slow );
 7575 %}
 7576 
 7577 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7578   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7579             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7580             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7581             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7582             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7583             (n->in(2)->get_int() == BoolTest::ne ||
 7584              n->in(2)->get_int() == BoolTest::le ||
 7585              n->in(2)->get_int() == BoolTest::ge)); // cond
 7586   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7587   effect(TEMP dst, TEMP xtmp);
 7588   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7589   ins_encode %{
 7590     int vlen_enc = vector_length_encoding(this, $src1);
 7591     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7592     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7593     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7594   %}
 7595   ins_pipe( pipe_slow );
 7596 %}
 7597 
 7598 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7599   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7600             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7601             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7602             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7603             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7604   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7605   effect(TEMP dst, TEMP xtmp);
 7606   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7607   ins_encode %{
 7608     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7609     int vlen_enc = vector_length_encoding(this, $src1);
 7610     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7611     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7612 
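          // Unsigned compare trick: XOR both operands with the per-element sign bit
          // so that the signed compare below produces the unsigned result.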
 7613     if (vlen_enc == Assembler::AVX_128bit) {
 7614       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7615     } else {
 7616       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7617     }
 7618     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7619     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7620     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7621   %}
 7622   ins_pipe( pipe_slow );
 7623 %}
 7624 
 7625 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7626   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7627              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7628              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7629   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7630   effect(TEMP ktmp);
 7631   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7632   ins_encode %{
 7633     assert(UseAVX > 2, "required");
 7634 
 7635     int vlen_enc = vector_length_encoding(this, $src1);
 7636     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7637     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7638     KRegister mask = k0; // The comparison itself is not being masked.
 7639     bool merge = false;
 7640     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7641 
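          // Compare into $ktmp, then expand the k-register into a vector of
          // -1 / 0 lanes with a zero-masking load of the all-bits-set constant.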
 7642     switch (src1_elem_bt) {
 7643       case T_INT: {
 7644         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7645         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7646         break;
 7647       }
 7648       case T_LONG: {
 7649         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7650         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7651         break;
 7652       }
 7653       default: assert(false, "%s", type2name(src1_elem_bt));
 7654     }
 7655   %}
 7656   ins_pipe( pipe_slow );
 7657 %}
 7658 
 7659 
 7660 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7661   predicate(n->bottom_type()->isa_vectmask() &&
 7662             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7663   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7664   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7665   ins_encode %{
 7666     assert(UseAVX > 2, "required");
 7667     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7668 
 7669     int vlen_enc = vector_length_encoding(this, $src1);
 7670     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7671     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7672     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7673 
 7674     // The comparison itself is not being masked.
 7675     switch (src1_elem_bt) {
 7676       case T_BYTE: {
 7677         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7678         break;
 7679       }
 7680       case T_SHORT: {
 7681         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7682         break;
 7683       }
 7684       case T_INT: {
 7685         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7686         break;
 7687       }
 7688       case T_LONG: {
 7689         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7690         break;
 7691       }
 7692       default: assert(false, "%s", type2name(src1_elem_bt));
 7693     }
 7694   %}
 7695   ins_pipe( pipe_slow );
 7696 %}
 7697 
 7698 // Extract
 7699 
 7700 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7701   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7702   match(Set dst (ExtractI src idx));
 7703   match(Set dst (ExtractS src idx));
 7704 #ifdef _LP64
 7705   match(Set dst (ExtractB src idx));
 7706 #endif
 7707   format %{ "extractI $dst,$src,$idx\t!" %}
 7708   ins_encode %{
 7709     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7710 
 7711     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7712     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7713   %}
 7714   ins_pipe( pipe_slow );
 7715 %}
 7716 
 7717 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7718   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7719             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7720   match(Set dst (ExtractI src idx));
 7721   match(Set dst (ExtractS src idx));
 7722 #ifdef _LP64
 7723   match(Set dst (ExtractB src idx));
 7724 #endif
 7725   effect(TEMP vtmp);
 7726   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7727   ins_encode %{
 7728     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7729 
 7730     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
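          // Isolate the 128-bit lane that holds the element, then extract it from that lane.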
 7731     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7732     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7733   %}
 7734   ins_pipe( pipe_slow );
 7735 %}
 7736 
 7737 #ifdef _LP64
 7738 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7739   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7740   match(Set dst (ExtractL src idx));
 7741   format %{ "extractL $dst,$src,$idx\t!" %}
 7742   ins_encode %{
 7743     assert(UseSSE >= 4, "required");
 7744     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7745 
 7746     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7747   %}
 7748   ins_pipe( pipe_slow );
 7749 %}
 7750 
 7751 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7752   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7753             Matcher::vector_length(n->in(1)) == 8);  // src
 7754   match(Set dst (ExtractL src idx));
 7755   effect(TEMP vtmp);
 7756   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7757   ins_encode %{
 7758     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7759 
 7760     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7761     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7762   %}
 7763   ins_pipe( pipe_slow );
 7764 %}
 7765 #endif
 7766 
 7767 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7768   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7769   match(Set dst (ExtractF src idx));
 7770   effect(TEMP dst, TEMP vtmp);
 7771   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7772   ins_encode %{
 7773     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7774 
 7775     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7776   %}
 7777   ins_pipe( pipe_slow );
 7778 %}
 7779 
 7780 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7781   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7782             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7783   match(Set dst (ExtractF src idx));
 7784   effect(TEMP vtmp);
 7785   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7786   ins_encode %{
 7787     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7788 
 7789     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7790     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7791   %}
 7792   ins_pipe( pipe_slow );
 7793 %}
 7794 
 7795 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7796   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7797   match(Set dst (ExtractD src idx));
 7798   format %{ "extractD $dst,$src,$idx\t!" %}
 7799   ins_encode %{
 7800     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7801 
 7802     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7803   %}
 7804   ins_pipe( pipe_slow );
 7805 %}
 7806 
 7807 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7808   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7809             Matcher::vector_length(n->in(1)) == 8);  // src
 7810   match(Set dst (ExtractD src idx));
 7811   effect(TEMP vtmp);
 7812   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7813   ins_encode %{
 7814     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7815 
 7816     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7817     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7818   %}
 7819   ins_pipe( pipe_slow );
 7820 %}
 7821 
 7822 // --------------------------------- Vector Blend --------------------------------------
 7823 
 7824 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7825   predicate(UseAVX == 0);
 7826   match(Set dst (VectorBlend (Binary dst src) mask));
 7827   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7828   effect(TEMP tmp);
 7829   ins_encode %{
 7830     assert(UseSSE >= 4, "required");
 7831 
 7832     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7833       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7834     }
 7835     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7836   %}
 7837   ins_pipe( pipe_slow );
 7838 %}
 7839 
 7840 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7841   predicate(UseAVX > 0 &&
 7842             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7843             Matcher::vector_length_in_bytes(n) <= 32 &&
 7844             is_integral_type(Matcher::vector_element_basic_type(n)));
 7845   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7846   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7847   ins_encode %{
 7848     int vlen_enc = vector_length_encoding(this);
 7849     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7850   %}
 7851   ins_pipe( pipe_slow );
 7852 %}
 7853 
 7854 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7855   predicate(UseAVX > 0 &&
 7856             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7857             Matcher::vector_length_in_bytes(n) <= 32 &&
 7858             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7859   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7860   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7861   ins_encode %{
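          // Blend masks have every bit of a lane set or clear, so the per-dword
          // vblendvps selection is also correct for double lanes.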
 7862     int vlen_enc = vector_length_encoding(this);
 7863     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7864   %}
 7865   ins_pipe( pipe_slow );
 7866 %}
 7867 
 7868 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7869   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7870             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 7871   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7872   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 7873   effect(TEMP ktmp);
 7874   ins_encode %{
 7875     int vlen_enc = Assembler::AVX_512bit;
 7876     BasicType elem_bt = Matcher::vector_element_basic_type(this);
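          // Turn the vector mask into a k-register by comparing it against the
          // all-ones pattern, then perform a k-masked blend of $src1 and $src2.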
 7877     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7878     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7879   %}
 7880   ins_pipe( pipe_slow );
 7881 %}
 7882 
 7883 
 7884 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7885   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7886             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7887              VM_Version::supports_avx512bw()));
 7888   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7889   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7890   ins_encode %{
 7891     int vlen_enc = vector_length_encoding(this);
 7892     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7893     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7894   %}
 7895   ins_pipe( pipe_slow );
 7896 %}
 7897 
 7898 // --------------------------------- ABS --------------------------------------
 7899 // a = |a|
 7900 instruct vabsB_reg(vec dst, vec src) %{
 7901   match(Set dst (AbsVB  src));
 7902   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7903   ins_encode %{
 7904     uint vlen = Matcher::vector_length(this);
 7905     if (vlen <= 16) {
 7906       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7907     } else {
 7908       int vlen_enc = vector_length_encoding(this);
 7909       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7910     }
 7911   %}
 7912   ins_pipe( pipe_slow );
 7913 %}
 7914 
 7915 instruct vabsS_reg(vec dst, vec src) %{
 7916   match(Set dst (AbsVS  src));
 7917   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7918   ins_encode %{
 7919     uint vlen = Matcher::vector_length(this);
 7920     if (vlen <= 8) {
 7921       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7922     } else {
 7923       int vlen_enc = vector_length_encoding(this);
 7924       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7925     }
 7926   %}
 7927   ins_pipe( pipe_slow );
 7928 %}
 7929 
 7930 instruct vabsI_reg(vec dst, vec src) %{
 7931   match(Set dst (AbsVI  src));
 7932   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7933   ins_encode %{
 7934     uint vlen = Matcher::vector_length(this);
 7935     if (vlen <= 4) {
 7936       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7937     } else {
 7938       int vlen_enc = vector_length_encoding(this);
 7939       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7940     }
 7941   %}
 7942   ins_pipe( pipe_slow );
 7943 %}
 7944 
 7945 instruct vabsL_reg(vec dst, vec src) %{
 7946   match(Set dst (AbsVL  src));
 7947   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7948   ins_encode %{
 7949     assert(UseAVX > 2, "required");
 7950     int vlen_enc = vector_length_encoding(this);
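          // Without AVX512VL the EVEX abs instruction only operates on 512-bit (zmm) registers.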
 7951     if (!VM_Version::supports_avx512vl()) {
 7952       vlen_enc = Assembler::AVX_512bit;
 7953     }
 7954     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7955   %}
 7956   ins_pipe( pipe_slow );
 7957 %}
 7958 
 7959 // --------------------------------- ABSNEG --------------------------------------
 7960 
 7961 instruct vabsnegF(vec dst, vec src) %{
 7962   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7963   match(Set dst (AbsVF src));
 7964   match(Set dst (NegVF src));
 7965   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7966   ins_cost(150);
 7967   ins_encode %{
 7968     int opcode = this->ideal_Opcode();
 7969     int vlen = Matcher::vector_length(this);
 7970     if (vlen == 2) {
 7971       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7972     } else {
 7973       assert(vlen == 8 || vlen == 16, "required");
 7974       int vlen_enc = vector_length_encoding(this);
 7975       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7976     }
 7977   %}
 7978   ins_pipe( pipe_slow );
 7979 %}
 7980 
 7981 instruct vabsneg4F(vec dst) %{
 7982   predicate(Matcher::vector_length(n) == 4);
 7983   match(Set dst (AbsVF dst));
 7984   match(Set dst (NegVF dst));
 7985   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7986   ins_cost(150);
 7987   ins_encode %{
 7988     int opcode = this->ideal_Opcode();
 7989     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 7990   %}
 7991   ins_pipe( pipe_slow );
 7992 %}
 7993 
 7994 instruct vabsnegD(vec dst, vec src) %{
 7995   match(Set dst (AbsVD  src));
 7996   match(Set dst (NegVD  src));
 7997   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 7998   ins_encode %{
 7999     int opcode = this->ideal_Opcode();
 8000     uint vlen = Matcher::vector_length(this);
 8001     if (vlen == 2) {
 8002       assert(UseSSE >= 2, "required");
 8003       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8004     } else {
 8005       int vlen_enc = vector_length_encoding(this);
 8006       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8007     }
 8008   %}
 8009   ins_pipe( pipe_slow );
 8010 %}
 8011 
 8012 //------------------------------------- VectorTest --------------------------------------------
 8013 
 8014 #ifdef _LP64
 8015 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8016   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8017   match(Set cr (VectorTest src1 src2));
 8018   effect(TEMP vtmp);
 8019   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8020   ins_encode %{
 8021     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8022     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8023     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8024   %}
 8025   ins_pipe( pipe_slow );
 8026 %}
 8027 
 8028 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8029   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8030   match(Set cr (VectorTest src1 src2));
 8031   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8032   ins_encode %{
 8033     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8034     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8035     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8036   %}
 8037   ins_pipe( pipe_slow );
 8038 %}
 8039 
 8040 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8041   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8042              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8043             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8044   match(Set cr (VectorTest src1 src2));
 8045   effect(TEMP tmp);
 8046   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8047   ins_encode %{
 8048     uint masklen = Matcher::vector_length(this, $src1);
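          // Materialize the mask in a GPR, keep only the live lanes, and compare
          // against the all-ones pattern: equality (ZF set) means every lane is true.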
 8049     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8050     __ andl($tmp$$Register, (1 << masklen) - 1);
 8051     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8052   %}
 8053   ins_pipe( pipe_slow );
 8054 %}
 8055 
 8056 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8057   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8058              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8059             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8060   match(Set cr (VectorTest src1 src2));
 8061   effect(TEMP tmp);
 8062   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8063   ins_encode %{
 8064     uint masklen = Matcher::vector_length(this, $src1);
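          // Keep only the live lanes; a non-zero result (ZF clear) means at least
          // one lane is true.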
 8065     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8066     __ andl($tmp$$Register, (1 << masklen) - 1);
 8067   %}
 8068   ins_pipe( pipe_slow );
 8069 %}
 8070 
 8071 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8072   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8073             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8074   match(Set cr (VectorTest src1 src2));
 8075   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8076   ins_encode %{
 8077     uint masklen = Matcher::vector_length(this, $src1);
 8078     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8079   %}
 8080   ins_pipe( pipe_slow );
 8081 %}
 8082 #endif
 8083 
 8084 //------------------------------------- LoadMask --------------------------------------------
 8085 
 8086 instruct loadMask(legVec dst, legVec src) %{
 8087   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8088   match(Set dst (VectorLoadMask src));
 8089   effect(TEMP dst);
 8090   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8091   ins_encode %{
 8092     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8093     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8094     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8095   %}
 8096   ins_pipe( pipe_slow );
 8097 %}
 8098 
 8099 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8100   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8101   match(Set dst (VectorLoadMask src));
 8102   effect(TEMP xtmp);
 8103   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8104   ins_encode %{
 8105     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8106                         true, Assembler::AVX_512bit);
 8107   %}
 8108   ins_pipe( pipe_slow );
 8109 %}
 8110 
 8111 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8112   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8113   match(Set dst (VectorLoadMask src));
 8114   effect(TEMP xtmp);
 8115   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8116   ins_encode %{
 8117     int vlen_enc = vector_length_encoding(in(1));
 8118     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8119                         false, vlen_enc);
 8120   %}
 8121   ins_pipe( pipe_slow );
 8122 %}
 8123 
 8124 //------------------------------------- StoreMask --------------------------------------------
 8125 
 8126 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8127   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8128   match(Set dst (VectorStoreMask src size));
 8129   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8130   ins_encode %{
 8131     int vlen = Matcher::vector_length(this);
 8132     if (vlen <= 16 && UseAVX <= 2) {
 8133       assert(UseSSE >= 3, "required");
 8134       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8135     } else {
 8136       assert(UseAVX > 0, "required");
 8137       int src_vlen_enc = vector_length_encoding(this, $src);
 8138       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8139     }
 8140   %}
 8141   ins_pipe( pipe_slow );
 8142 %}
 8143 
 8144 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8145   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8146   match(Set dst (VectorStoreMask src size));
 8147   effect(TEMP_DEF dst, TEMP xtmp);
 8148   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8149   ins_encode %{
 8150     int vlen_enc = Assembler::AVX_128bit;
 8151     int vlen = Matcher::vector_length(this);
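          // Narrow each 16-bit mask lane to a byte and normalize -1 lanes to 1
          // with the byte-abs instruction.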
 8152     if (vlen <= 8) {
 8153       assert(UseSSE >= 3, "required");
 8154       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8155       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8156       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8157     } else {
 8158       assert(UseAVX > 0, "required");
 8159       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8160       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8161       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8162     }
 8163   %}
 8164   ins_pipe( pipe_slow );
 8165 %}
 8166 
 8167 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8168   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8169   match(Set dst (VectorStoreMask src size));
 8170   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8171   effect(TEMP_DEF dst, TEMP xtmp);
 8172   ins_encode %{
 8173     int vlen_enc = Assembler::AVX_128bit;
 8174     int vlen = Matcher::vector_length(this);
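          // Narrow each 32-bit mask lane to a byte (two packing steps) and
          // normalize -1 lanes to 1 with the byte-abs instruction.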
 8175     if (vlen <= 4) {
 8176       assert(UseSSE >= 3, "required");
 8177       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8178       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8179       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8180       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8181     } else {
 8182       assert(UseAVX > 0, "required");
 8183       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8184       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8185       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8186       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8187       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8188     }
 8189   %}
 8190   ins_pipe( pipe_slow );
 8191 %}
 8192 
 8193 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8194   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8195   match(Set dst (VectorStoreMask src size));
 8196   effect(TEMP_DEF dst, TEMP xtmp);
 8197   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8198   ins_encode %{
 8199     assert(UseSSE >= 3, "required");
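          // Gather the low dword of each long lane, then narrow to bytes and
          // normalize -1 lanes to 1.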
 8200     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8201     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8202     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8203     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8204     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8205   %}
 8206   ins_pipe( pipe_slow );
 8207 %}
 8208 
 8209 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8210   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8211   match(Set dst (VectorStoreMask src size));
 8212   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8213   effect(TEMP_DEF dst, TEMP vtmp);
 8214   ins_encode %{
 8215     int vlen_enc = Assembler::AVX_128bit;
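          // Collect the low dword of each of the four long lanes into the low
          // 128 bits, then narrow to bytes and normalize -1 lanes to 1.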
 8216     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8217     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8218     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8219     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8220     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8221     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8222     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8223   %}
 8224   ins_pipe( pipe_slow );
 8225 %}
 8226 
 8227 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8228   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8229   match(Set dst (VectorStoreMask src size));
 8230   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8231   ins_encode %{
 8232     int src_vlen_enc = vector_length_encoding(this, $src);
 8233     int dst_vlen_enc = vector_length_encoding(this);
 8234     if (!VM_Version::supports_avx512vl()) {
 8235       src_vlen_enc = Assembler::AVX_512bit;
 8236     }
 8237     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8238     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8239   %}
 8240   ins_pipe( pipe_slow );
 8241 %}
 8242 
 8243 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8244   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8245   match(Set dst (VectorStoreMask src size));
 8246   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8247   ins_encode %{
 8248     int src_vlen_enc = vector_length_encoding(this, $src);
 8249     int dst_vlen_enc = vector_length_encoding(this);
 8250     if (!VM_Version::supports_avx512vl()) {
 8251       src_vlen_enc = Assembler::AVX_512bit;
 8252     }
 8253     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8254     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8255   %}
 8256   ins_pipe( pipe_slow );
 8257 %}
 8258 
 8259 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8260   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8261   match(Set dst (VectorStoreMask mask size));
 8262   effect(TEMP_DEF dst);
 8263   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8264   ins_encode %{
 8265     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8266     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8267                  false, Assembler::AVX_512bit, noreg);
 8268     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8269   %}
 8270   ins_pipe( pipe_slow );
 8271 %}
 8272 
 8273 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8274   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8275   match(Set dst (VectorStoreMask mask size));
 8276   effect(TEMP_DEF dst);
 8277   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8278   ins_encode %{
 8279     int dst_vlen_enc = vector_length_encoding(this);
 8280     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8281     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8282   %}
 8283   ins_pipe( pipe_slow );
 8284 %}
 8285 
 8286 instruct vmaskcast_evex(kReg dst) %{
 8287   match(Set dst (VectorMaskCast dst));
 8288   ins_cost(0);
 8289   format %{ "vector_mask_cast $dst" %}
 8290   ins_encode %{
 8291     // empty
 8292   %}
 8293   ins_pipe(empty);
 8294 %}
 8295 
 8296 instruct vmaskcast(vec dst) %{
 8297   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8298   match(Set dst (VectorMaskCast dst));
 8299   ins_cost(0);
 8300   format %{ "vector_mask_cast $dst" %}
 8301   ins_encode %{
 8302     // empty
 8303   %}
 8304   ins_pipe(empty);
 8305 %}
 8306 
 8307 instruct vmaskcast_avx(vec dst, vec src) %{
 8308   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8309   match(Set dst (VectorMaskCast src));
 8310   format %{ "vector_mask_cast $dst, $src" %}
 8311   ins_encode %{
 8312     int vlen = Matcher::vector_length(this);
 8313     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8314     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8315     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8316   %}
 8317   ins_pipe(pipe_slow);
 8318 %}
 8319 
 8320 //-------------------------------- Load Iota Indices ----------------------------------
 8321 
 8322 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8323   match(Set dst (VectorLoadConst src));
 8324   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8325   ins_encode %{
 8326      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8327      BasicType bt = Matcher::vector_element_basic_type(this);
 8328      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8329   %}
 8330   ins_pipe( pipe_slow );
 8331 %}
 8332 
 8333 #ifdef _LP64
 8334 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8335   match(Set dst (PopulateIndex src1 src2));
 8336   effect(TEMP dst, TEMP vtmp);
 8337   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8338   ins_encode %{
 8339      assert($src2$$constant == 1, "required");
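           // Broadcast the start value into $vtmp, load the iota constant
           // [0, 1, 2, ...] into $dst, and add the two.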
 8340      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8341      int vlen_enc = vector_length_encoding(this);
 8342      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8343      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8344      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8345      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8346   %}
 8347   ins_pipe( pipe_slow );
 8348 %}
 8349 
 8350 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8351   match(Set dst (PopulateIndex src1 src2));
 8352   effect(TEMP dst, TEMP vtmp);
 8353   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8354   ins_encode %{
 8355      assert($src2$$constant == 1, "required");
 8356      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8357      int vlen_enc = vector_length_encoding(this);
 8358      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8359      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8360      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8361      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8362   %}
 8363   ins_pipe( pipe_slow );
 8364 %}
 8365 #endif
 8366 //-------------------------------- Rearrange ----------------------------------
 8367 
 8368 // LoadShuffle/Rearrange for Byte
 8369 
 8370 instruct loadShuffleB(vec dst) %{
 8371   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8372   match(Set dst (VectorLoadShuffle dst));
 8373   format %{ "vector_load_shuffle $dst, $dst" %}
 8374   ins_encode %{
 8375     // empty
 8376   %}
 8377   ins_pipe( pipe_slow );
 8378 %}
 8379 
 8380 instruct rearrangeB(vec dst, vec shuffle) %{
 8381   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8382             Matcher::vector_length(n) < 32);
 8383   match(Set dst (VectorRearrange dst shuffle));
 8384   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8385   ins_encode %{
 8386     assert(UseSSE >= 4, "required");
 8387     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8388   %}
 8389   ins_pipe( pipe_slow );
 8390 %}
 8391 
 8392 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8393   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8394             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8395   match(Set dst (VectorRearrange src shuffle));
 8396   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8397   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8398   ins_encode %{
 8399     assert(UseAVX >= 2, "required");
 8400     // Swap src into vtmp1
 8401     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8402     // Shuffle swapped src to get entries from other 128 bit lane
 8403     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8404     // Shuffle original src to get entries from self 128 bit lane
 8405     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8406     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8407     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8408     // Perform the blend
 8409     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8410   %}
 8411   ins_pipe( pipe_slow );
 8412 %}
 8413 
 8414 
 8415 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8416   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8417             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8418   match(Set dst (VectorRearrange src shuffle));
 8419   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8420   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8421   ins_encode %{
 8422     int vlen_enc = vector_length_encoding(this);
 8423     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8424                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8425                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8426   %}
 8427   ins_pipe( pipe_slow );
 8428 %}
 8429 
 8430 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8431   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8432             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8433   match(Set dst (VectorRearrange src shuffle));
 8434   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8435   ins_encode %{
 8436     int vlen_enc = vector_length_encoding(this);
 8437     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8438   %}
 8439   ins_pipe( pipe_slow );
 8440 %}
 8441 
 8442 // LoadShuffle/Rearrange for Short
 8443 
 8444 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8445   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8446             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8447   match(Set dst (VectorLoadShuffle src));
 8448   effect(TEMP dst, TEMP vtmp);
 8449   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8450   ins_encode %{
 8451     // Create a byte shuffle mask from the short shuffle mask;
 8452     // only a byte shuffle instruction is available on these platforms.
 8453     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8454     if (UseAVX == 0) {
 8455       assert(vlen_in_bytes <= 16, "required");
 8456       // Multiply each shuffle by two to get byte index
 8457       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8458       __ psllw($vtmp$$XMMRegister, 1);
 8459 
 8460       // Duplicate to create 2 copies of byte index
 8461       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8462       __ psllw($dst$$XMMRegister, 8);
 8463       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8464 
 8465       // Add one to get alternate byte index
 8466       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8467       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8468     } else {
 8469       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8470       int vlen_enc = vector_length_encoding(this);
 8471       // Multiply each shuffle by two to get byte index
 8472       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8473       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8474 
 8475       // Duplicate to create 2 copies of byte index
 8476       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8477       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8478 
 8479       // Add one to get alternate byte index
 8480       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8481     }
 8482   %}
 8483   ins_pipe( pipe_slow );
 8484 %}
 8485 
 8486 instruct rearrangeS(vec dst, vec shuffle) %{
 8487   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8488             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8489   match(Set dst (VectorRearrange dst shuffle));
 8490   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8491   ins_encode %{
 8492     assert(UseSSE >= 4, "required");
 8493     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8494   %}
 8495   ins_pipe( pipe_slow );
 8496 %}
 8497 
 8498 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8499   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8500             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8501   match(Set dst (VectorRearrange src shuffle));
 8502   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8503   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8504   ins_encode %{
 8505     assert(UseAVX >= 2, "required");
 8506     // Swap src into vtmp1
 8507     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8508     // Shuffle swapped src to get entries from other 128 bit lane
 8509     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8510     // Shuffle original src to get entries from self 128 bit lane
 8511     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8512     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8513     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8514     // Perform the blend
 8515     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8516   %}
 8517   ins_pipe( pipe_slow );
 8518 %}
 8519 
 8520 instruct loadShuffleS_evex(vec dst, vec src) %{
 8521   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8522             VM_Version::supports_avx512bw());
 8523   match(Set dst (VectorLoadShuffle src));
 8524   format %{ "vector_load_shuffle $dst, $src" %}
 8525   ins_encode %{
 8526     int vlen_enc = vector_length_encoding(this);
 8527     if (!VM_Version::supports_avx512vl()) {
 8528       vlen_enc = Assembler::AVX_512bit;
 8529     }
 8530     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8531   %}
 8532   ins_pipe( pipe_slow );
 8533 %}
 8534 
 8535 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8536   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8537             VM_Version::supports_avx512bw());
 8538   match(Set dst (VectorRearrange src shuffle));
 8539   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8540   ins_encode %{
 8541     int vlen_enc = vector_length_encoding(this);
 8542     if (!VM_Version::supports_avx512vl()) {
 8543       vlen_enc = Assembler::AVX_512bit;
 8544     }
 8545     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8546   %}
 8547   ins_pipe( pipe_slow );
 8548 %}
 8549 
 8550 // LoadShuffle/Rearrange for Integer and Float
 8551 
 8552 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8553   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8554             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8555   match(Set dst (VectorLoadShuffle src));
 8556   effect(TEMP dst, TEMP vtmp);
 8557   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8558   ins_encode %{
 8559     assert(UseSSE >= 4, "required");
 8560 
 8561     // Create a byte shuffle mask from the int shuffle mask;
 8562     // only a byte shuffle instruction is available on these platforms.
 8563 
 8564     // Duplicate and multiply each shuffle by 4
 8565     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8566     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8567     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8568     __ psllw($vtmp$$XMMRegister, 2);
 8569 
 8570     // Duplicate again to create 4 copies of byte index
 8571     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8572     __ psllw($dst$$XMMRegister, 8);
 8573     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8574 
 8575     // Add 3,2,1,0 to get alternate byte index
 8576     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8577     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8578   %}
 8579   ins_pipe( pipe_slow );
 8580 %}
 8581 
 8582 instruct rearrangeI(vec dst, vec shuffle) %{
 8583   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8584             UseAVX == 0);
 8585   match(Set dst (VectorRearrange dst shuffle));
 8586   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8587   ins_encode %{
 8588     assert(UseSSE >= 4, "required");
 8589     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8590   %}
 8591   ins_pipe( pipe_slow );
 8592 %}
 8593 
 8594 instruct loadShuffleI_avx(vec dst, vec src) %{
 8595   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8596             UseAVX > 0);
 8597   match(Set dst (VectorLoadShuffle src));
 8598   format %{ "vector_load_shuffle $dst, $src" %}
 8599   ins_encode %{
 8600     int vlen_enc = vector_length_encoding(this);
 8601     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8602   %}
 8603   ins_pipe( pipe_slow );
 8604 %}
 8605 
 8606 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8607   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8608             UseAVX > 0);
 8609   match(Set dst (VectorRearrange src shuffle));
 8610   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8611   ins_encode %{
 8612     int vlen_enc = vector_length_encoding(this);
 8613     BasicType bt = Matcher::vector_element_basic_type(this);
 8614     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8615   %}
 8616   ins_pipe( pipe_slow );
 8617 %}
 8618 
 8619 // LoadShuffle/Rearrange for Long and Double
 8620 
 8621 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8622   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8623             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8624   match(Set dst (VectorLoadShuffle src));
 8625   effect(TEMP dst, TEMP vtmp);
 8626   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8627   ins_encode %{
 8628     assert(UseAVX >= 2, "required");
 8629 
 8630     int vlen_enc = vector_length_encoding(this);
 8631     // Create a double word shuffle mask from the long shuffle mask;
 8632     // only a double word shuffle instruction is available on these platforms.
 8633 
 8634     // Multiply each shuffle by two to get double word index
 8635     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8636     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8637 
 8638     // Duplicate each double word shuffle
 8639     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8640     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8641 
 8642     // Add one to get alternate double word index
 8643     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8644   %}
 8645   ins_pipe( pipe_slow );
 8646 %}
 8647 
 8648 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8649   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8650             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8651   match(Set dst (VectorRearrange src shuffle));
 8652   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8653   ins_encode %{
 8654     assert(UseAVX >= 2, "required");
 8655 
 8656     int vlen_enc = vector_length_encoding(this);
 8657     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8658   %}
 8659   ins_pipe( pipe_slow );
 8660 %}
 8661 
 8662 instruct loadShuffleL_evex(vec dst, vec src) %{
 8663   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8664             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8665   match(Set dst (VectorLoadShuffle src));
 8666   format %{ "vector_load_shuffle $dst, $src" %}
 8667   ins_encode %{
 8668     assert(UseAVX > 2, "required");
 8669 
 8670     int vlen_enc = vector_length_encoding(this);
 8671     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8672   %}
 8673   ins_pipe( pipe_slow );
 8674 %}
 8675 
 8676 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8677   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8678             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8679   match(Set dst (VectorRearrange src shuffle));
 8680   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8681   ins_encode %{
 8682     assert(UseAVX > 2, "required");
 8683 
 8684     int vlen_enc = vector_length_encoding(this);
 8685     if (vlen_enc == Assembler::AVX_128bit) {
 8686       vlen_enc = Assembler::AVX_256bit;
 8687     }
 8688     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8689   %}
 8690   ins_pipe( pipe_slow );
 8691 %}
 8692 
 8693 // --------------------------------- FMA --------------------------------------
 8694 // a * b + c
 8695 
 8696 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8697   match(Set c (FmaVF  c (Binary a b)));
 8698   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8699   ins_cost(150);
 8700   ins_encode %{
 8701     assert(UseFMA, "not enabled");
 8702     int vlen_enc = vector_length_encoding(this);
 8703     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8704   %}
 8705   ins_pipe( pipe_slow );
 8706 %}
 8707 
 8708 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8709   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8710   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8711   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8712   ins_cost(150);
 8713   ins_encode %{
 8714     assert(UseFMA, "not enabled");
 8715     int vlen_enc = vector_length_encoding(this);
 8716     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8717   %}
 8718   ins_pipe( pipe_slow );
 8719 %}
 8720 
 8721 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8722   match(Set c (FmaVD  c (Binary a b)));
 8723   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8724   ins_cost(150);
 8725   ins_encode %{
 8726     assert(UseFMA, "not enabled");
 8727     int vlen_enc = vector_length_encoding(this);
 8728     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8729   %}
 8730   ins_pipe( pipe_slow );
 8731 %}
 8732 
 8733 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8734   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8735   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8736   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8737   ins_cost(150);
 8738   ins_encode %{
 8739     assert(UseFMA, "not enabled");
 8740     int vlen_enc = vector_length_encoding(this);
 8741     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8742   %}
 8743   ins_pipe( pipe_slow );
 8744 %}
 8745 
 8746 // --------------------------------- Vector Multiply Add --------------------------------------
 8747 
 8748 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8749   predicate(UseAVX == 0);
 8750   match(Set dst (MulAddVS2VI dst src1));
 8751   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8752   ins_encode %{
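          // pmaddwd multiplies adjacent signed 16-bit pairs and adds each pair's
          // products into a 32-bit lane.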
 8753     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8754   %}
 8755   ins_pipe( pipe_slow );
 8756 %}
 8757 
 8758 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8759   predicate(UseAVX > 0);
 8760   match(Set dst (MulAddVS2VI src1 src2));
 8761   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8762   ins_encode %{
 8763     int vlen_enc = vector_length_encoding(this);
 8764     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8765   %}
 8766   ins_pipe( pipe_slow );
 8767 %}
 8768 
 8769 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8770 
 8771 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8772   predicate(VM_Version::supports_avx512_vnni());
 8773   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8774   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8775   ins_encode %{
 8776     assert(UseAVX > 2, "required");
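          // VNNI dot-product: multiply adjacent signed words, then add the pair sums
          // into the existing 32-bit accumulator lanes of $dst.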
 8777     int vlen_enc = vector_length_encoding(this);
 8778     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8779   %}
 8780   ins_pipe( pipe_slow );
 8781   ins_cost(10);
 8782 %}
 8783 
 8784 // --------------------------------- PopCount --------------------------------------
 8785 
 8786 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8787   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8788   match(Set dst (PopCountVI src));
 8789   match(Set dst (PopCountVL src));
 8790   format %{ "vector_popcount_integral $dst, $src" %}
 8791   ins_encode %{
 8792     int opcode = this->ideal_Opcode();
 8793     int vlen_enc = vector_length_encoding(this, $src);
 8794     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8795     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8796   %}
 8797   ins_pipe( pipe_slow );
 8798 %}
 8799 
 8800 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8801   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8802   match(Set dst (PopCountVI src mask));
 8803   match(Set dst (PopCountVL src mask));
 8804   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8805   ins_encode %{
 8806     int vlen_enc = vector_length_encoding(this, $src);
 8807     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8808     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8809     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8810   %}
 8811   ins_pipe( pipe_slow );
 8812 %}
 8813 
 8814 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8815   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8816   match(Set dst (PopCountVI src));
 8817   match(Set dst (PopCountVL src));
 8818   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8819   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8820   ins_encode %{
 8821     int opcode = this->ideal_Opcode();
 8822     int vlen_enc = vector_length_encoding(this, $src);
 8823     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8824     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8825                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8826   %}
 8827   ins_pipe( pipe_slow );
 8828 %}
 8829 
 8830 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
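       // Per-element trailing-zero count. The EVEX rules lean on AVX512CD leading-zero counting
       // (a common formulation being tzcnt(x) = W - 1 - lzcnt(x & -x) for x != 0); subword
       // element types and the AVX-only shape need the extra temporaries declared below.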
 8831 
 8832 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8833   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8834                                               Matcher::vector_length_in_bytes(n->in(1))));
 8835   match(Set dst (CountTrailingZerosV src));
 8836   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8837   ins_cost(400);
  8838   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8839   ins_encode %{
 8840     int vlen_enc = vector_length_encoding(this, $src);
 8841     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8842     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8843                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8844   %}
 8845   ins_pipe( pipe_slow );
 8846 %}
 8847 
 8848 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8849   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8850             VM_Version::supports_avx512cd() &&
 8851             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8852   match(Set dst (CountTrailingZerosV src));
 8853   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8854   ins_cost(400);
  8855   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8856   ins_encode %{
 8857     int vlen_enc = vector_length_encoding(this, $src);
 8858     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8859     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8860                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8861   %}
 8862   ins_pipe( pipe_slow );
 8863 %}
 8864 
 8865 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8866   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8867   match(Set dst (CountTrailingZerosV src));
 8868   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8869   ins_cost(400);
  8870   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8871   ins_encode %{
 8872     int vlen_enc = vector_length_encoding(this, $src);
 8873     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8874     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8875                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8876                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8877   %}
 8878   ins_pipe( pipe_slow );
 8879 %}
 8880 
 8881 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8882   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8883   match(Set dst (CountTrailingZerosV src));
 8884   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8885   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8886   ins_encode %{
 8887     int vlen_enc = vector_length_encoding(this, $src);
 8888     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8889     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8890                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8891   %}
 8892   ins_pipe( pipe_slow );
 8893 %}
 8894 
 8895 
 8896 // --------------------------------- Bitwise Ternary Logic ----------------------------------
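       // vpternlogd evaluates an arbitrary three-input boolean function per bit: the immediate
       // $func is an 8-entry truth table indexed by (dst_bit << 2) | (src2_bit << 1) | src3_bit.
       // For example, func == 0x96 gives a three-way XOR and func == 0xE8 a bitwise majority.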
 8897 
 8898 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8899   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8900   effect(TEMP dst);
 8901   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8902   ins_encode %{
 8903     int vector_len = vector_length_encoding(this);
 8904     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8905   %}
 8906   ins_pipe( pipe_slow );
 8907 %}
 8908 
 8909 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8910   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8911   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8912   effect(TEMP dst);
 8913   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8914   ins_encode %{
 8915     int vector_len = vector_length_encoding(this);
 8916     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8917   %}
 8918   ins_pipe( pipe_slow );
 8919 %}
 8920 
 8921 // --------------------------------- Rotation Operations ----------------------------------
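       // Element-wise rotate left/right, by an immediate or by a per-lane count; the ideal opcode
       // and element type are forwarded to the vprotate_imm / vprotate_var macro assembler
       // helpers, which emit the target's vector rotate sequence (vprold/vprolvd-style on EVEX).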
 8922 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8923   match(Set dst (RotateLeftV src shift));
 8924   match(Set dst (RotateRightV src shift));
 8925   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8926   ins_encode %{
 8927     int opcode      = this->ideal_Opcode();
 8928     int vector_len  = vector_length_encoding(this);
 8929     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8930     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8931   %}
 8932   ins_pipe( pipe_slow );
 8933 %}
 8934 
 8935 instruct vprorate(vec dst, vec src, vec shift) %{
 8936   match(Set dst (RotateLeftV src shift));
 8937   match(Set dst (RotateRightV src shift));
 8938   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8939   ins_encode %{
 8940     int opcode      = this->ideal_Opcode();
 8941     int vector_len  = vector_length_encoding(this);
 8942     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8943     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8944   %}
 8945   ins_pipe( pipe_slow );
 8946 %}
 8947 
 8948 // ---------------------------------- Masked Operations ------------------------------------
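       // Predicated loads and stores. The AVX rules keep the mask in an XMM/YMM register and go
       // through vmovmask (vmaskmov-style instructions); the EVEX rules take an opmask register
       // and use merge-masked evmovdqu moves, as matched below.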
 8949 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8950   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8951   match(Set dst (LoadVectorMasked mem mask));
 8952   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8953   ins_encode %{
 8954     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8955     int vlen_enc = vector_length_encoding(this);
 8956     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8957   %}
 8958   ins_pipe( pipe_slow );
 8959 %}
 8960 
 8961 
 8962 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8963   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8964   match(Set dst (LoadVectorMasked mem mask));
 8965   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8966   ins_encode %{
 8967     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 8968     int vector_len = vector_length_encoding(this);
 8969     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 8970   %}
 8971   ins_pipe( pipe_slow );
 8972 %}
 8973 
 8974 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 8975   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8976   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8977   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8978   ins_encode %{
 8979     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8980     int vlen_enc = vector_length_encoding(src_node);
 8981     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 8982     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8983   %}
 8984   ins_pipe( pipe_slow );
 8985 %}
 8986 
 8987 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 8988   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8989   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8990   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8991   ins_encode %{
 8992     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8993     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 8994     int vlen_enc = vector_length_encoding(src_node);
 8995     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 8996   %}
 8997   ins_pipe( pipe_slow );
 8998 %}
 8999 
 9000 #ifdef _LP64
 9001 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9002   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9003   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9004   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9005   ins_encode %{
 9006     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9007     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9008 
 9009     Label DONE;
 9010     int vlen_enc = vector_length_encoding(this, $src1);
 9011     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9012 
 9013     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9014     __ mov64($dst$$Register, -1L);
 9015     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9016     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9017     __ jccb(Assembler::carrySet, DONE);
 9018     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9019     __ notq($dst$$Register);
 9020     __ tzcntq($dst$$Register, $dst$$Register);
 9021     __ bind(DONE);
 9022   %}
 9023   ins_pipe( pipe_slow );
 9024 %}
 9025 
 9026 
 9027 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9028   match(Set dst (VectorMaskGen len));
 9029   effect(TEMP temp, KILL cr);
 9030   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9031   ins_encode %{
 9032     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9033   %}
 9034   ins_pipe( pipe_slow );
 9035 %}
 9036 
 9037 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9038   match(Set dst (VectorMaskGen len));
 9039   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9040   effect(TEMP temp);
 9041   ins_encode %{
  9042     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9043     __ kmovql($dst$$KRegister, $temp$$Register);
 9044   %}
 9045   ins_pipe( pipe_slow );
 9046 %}
 9047 
 9048 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9049   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9050   match(Set dst (VectorMaskToLong mask));
 9051   effect(TEMP dst, KILL cr);
 9052   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9053   ins_encode %{
 9054     int opcode = this->ideal_Opcode();
 9055     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9056     int mask_len = Matcher::vector_length(this, $mask);
 9057     int mask_size = mask_len * type2aelembytes(mbt);
 9058     int vlen_enc = vector_length_encoding(this, $mask);
 9059     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9060                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9061   %}
 9062   ins_pipe( pipe_slow );
 9063 %}
 9064 
 9065 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9066   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9067   match(Set dst (VectorMaskToLong mask));
 9068   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9069   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9070   ins_encode %{
 9071     int opcode = this->ideal_Opcode();
 9072     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9073     int mask_len = Matcher::vector_length(this, $mask);
 9074     int vlen_enc = vector_length_encoding(this, $mask);
 9075     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9076                              $dst$$Register, mask_len, mbt, vlen_enc);
 9077   %}
 9078   ins_pipe( pipe_slow );
 9079 %}
 9080 
 9081 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9082   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9083   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9084   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9085   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9086   ins_encode %{
 9087     int opcode = this->ideal_Opcode();
 9088     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9089     int mask_len = Matcher::vector_length(this, $mask);
 9090     int vlen_enc = vector_length_encoding(this, $mask);
 9091     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9092                              $dst$$Register, mask_len, mbt, vlen_enc);
 9093   %}
 9094   ins_pipe( pipe_slow );
 9095 %}
 9096 
 9097 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9098   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9099   match(Set dst (VectorMaskTrueCount mask));
 9100   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9101   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9102   ins_encode %{
 9103     int opcode = this->ideal_Opcode();
 9104     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9105     int mask_len = Matcher::vector_length(this, $mask);
 9106     int mask_size = mask_len * type2aelembytes(mbt);
 9107     int vlen_enc = vector_length_encoding(this, $mask);
 9108     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9109                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9110   %}
 9111   ins_pipe( pipe_slow );
 9112 %}
 9113 
 9114 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9115   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9116   match(Set dst (VectorMaskTrueCount mask));
 9117   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9118   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9119   ins_encode %{
 9120     int opcode = this->ideal_Opcode();
 9121     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9122     int mask_len = Matcher::vector_length(this, $mask);
 9123     int vlen_enc = vector_length_encoding(this, $mask);
 9124     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9125                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9126   %}
 9127   ins_pipe( pipe_slow );
 9128 %}
 9129 
 9130 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9131   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9132   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9133   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9134   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9135   ins_encode %{
 9136     int opcode = this->ideal_Opcode();
 9137     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9138     int mask_len = Matcher::vector_length(this, $mask);
 9139     int vlen_enc = vector_length_encoding(this, $mask);
 9140     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9141                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9142   %}
 9143   ins_pipe( pipe_slow );
 9144 %}
 9145 
 9146 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9147   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9148   match(Set dst (VectorMaskFirstTrue mask));
 9149   match(Set dst (VectorMaskLastTrue mask));
 9150   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9151   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9152   ins_encode %{
 9153     int opcode = this->ideal_Opcode();
 9154     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9155     int mask_len = Matcher::vector_length(this, $mask);
 9156     int mask_size = mask_len * type2aelembytes(mbt);
 9157     int vlen_enc = vector_length_encoding(this, $mask);
 9158     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9159                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9160   %}
 9161   ins_pipe( pipe_slow );
 9162 %}
 9163 
 9164 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9165   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9166   match(Set dst (VectorMaskFirstTrue mask));
 9167   match(Set dst (VectorMaskLastTrue mask));
 9168   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9169   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9170   ins_encode %{
 9171     int opcode = this->ideal_Opcode();
 9172     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9173     int mask_len = Matcher::vector_length(this, $mask);
 9174     int vlen_enc = vector_length_encoding(this, $mask);
 9175     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9176                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9177   %}
 9178   ins_pipe( pipe_slow );
 9179 %}
 9180 
 9181 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9182   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9183   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9184   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9185   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9186   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9187   ins_encode %{
 9188     int opcode = this->ideal_Opcode();
 9189     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9190     int mask_len = Matcher::vector_length(this, $mask);
 9191     int vlen_enc = vector_length_encoding(this, $mask);
 9192     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9193                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9194   %}
 9195   ins_pipe( pipe_slow );
 9196 %}
 9197 
 9198 // --------------------------------- Compress/Expand Operations ---------------------------
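       // AVX512 compress/expand. CompressV packs the mask-selected lanes of $src into the low
       // lanes of $dst; ExpandV is the inverse, scattering the low lanes of $src into the
       // mask-selected positions. CompressM compresses the opmask register itself.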
 9199 
 9200 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9201   match(Set dst (CompressV src mask));
 9202   match(Set dst (ExpandV src mask));
 9203   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9204   ins_encode %{
 9205     int opcode = this->ideal_Opcode();
 9206     int vector_len = vector_length_encoding(this);
 9207     BasicType bt  = Matcher::vector_element_basic_type(this);
 9208     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9209   %}
 9210   ins_pipe( pipe_slow );
 9211 %}
 9212 
 9213 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9214   match(Set dst (CompressM mask));
 9215   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9216   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9217   ins_encode %{
  9218     assert(this->in(1)->bottom_type()->isa_vectmask(), "vector mask input expected");
 9219     int mask_len = Matcher::vector_length(this);
 9220     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9221   %}
 9222   ins_pipe( pipe_slow );
 9223 %}
 9224 
 9225 #endif // _LP64
 9226 
 9227 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
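       // ReverseV reverses the bits within each element and ReverseBytesV reverses the bytes.
       // With GFNI, per-byte bit reversal is a single vgf2p8affineqb against the
       // 0x8040201008040201 bit-matrix constant materialized below; without GFNI the macro
       // assembler synthesizes it using the extra temporaries declared in the rule.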
 9228 
 9229 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9230   predicate(!VM_Version::supports_gfni());
 9231   match(Set dst (ReverseV src));
 9232   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  9233   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9234   ins_encode %{
 9235     int vec_enc = vector_length_encoding(this);
 9236     BasicType bt = Matcher::vector_element_basic_type(this);
 9237     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9238                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9239   %}
 9240   ins_pipe( pipe_slow );
 9241 %}
 9242 
 9243 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9244   predicate(VM_Version::supports_gfni());
 9245   match(Set dst (ReverseV src));
 9246   effect(TEMP dst, TEMP xtmp);
  9247   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9248   ins_encode %{
 9249     int vec_enc = vector_length_encoding(this);
 9250     BasicType bt  = Matcher::vector_element_basic_type(this);
 9251     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9252     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9253                                $xtmp$$XMMRegister);
 9254   %}
 9255   ins_pipe( pipe_slow );
 9256 %}
 9257 
 9258 instruct vreverse_byte_reg(vec dst, vec src) %{
 9259   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9260   match(Set dst (ReverseBytesV src));
 9261   effect(TEMP dst);
 9262   format %{ "vector_reverse_byte $dst, $src" %}
 9263   ins_encode %{
 9264     int vec_enc = vector_length_encoding(this);
 9265     BasicType bt = Matcher::vector_element_basic_type(this);
 9266     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9267   %}
 9268   ins_pipe( pipe_slow );
 9269 %}
 9270 
 9271 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9272   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9273   match(Set dst (ReverseBytesV src));
 9274   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  9275   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9276   ins_encode %{
 9277     int vec_enc = vector_length_encoding(this);
 9278     BasicType bt = Matcher::vector_element_basic_type(this);
 9279     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9280                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9281   %}
 9282   ins_pipe( pipe_slow );
 9283 %}
 9284 
 9285 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
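       // Per-element leading-zero count. The int/long EVEX shapes can use AVX512CD
       // vplzcntd/vplzcntq directly; short/byte and the pre-AVX512VL AVX shapes synthesize the
       // count with the temporaries declared in each rule.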
 9286 
 9287 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9288   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9289                                               Matcher::vector_length_in_bytes(n->in(1))));
 9290   match(Set dst (CountLeadingZerosV src));
 9291   format %{ "vector_count_leading_zeros $dst, $src" %}
 9292   ins_encode %{
 9293      int vlen_enc = vector_length_encoding(this, $src);
 9294      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9295      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9296                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9297   %}
 9298   ins_pipe( pipe_slow );
 9299 %}
 9300 
 9301 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9302   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9303                                               Matcher::vector_length_in_bytes(n->in(1))));
 9304   match(Set dst (CountLeadingZerosV src mask));
 9305   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9306   ins_encode %{
 9307     int vlen_enc = vector_length_encoding(this, $src);
 9308     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9309     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9310     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9311                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9312   %}
 9313   ins_pipe( pipe_slow );
 9314 %}
 9315 
 9316 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9317   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9318             VM_Version::supports_avx512cd() &&
 9319             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9320   match(Set dst (CountLeadingZerosV src));
 9321   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  9322   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9323   ins_encode %{
 9324     int vlen_enc = vector_length_encoding(this, $src);
 9325     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9326     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9327                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9328   %}
 9329   ins_pipe( pipe_slow );
 9330 %}
 9331 
 9332 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9333   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9334   match(Set dst (CountLeadingZerosV src));
 9335   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  9336   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9337   ins_encode %{
 9338     int vlen_enc = vector_length_encoding(this, $src);
 9339     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9340     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9341                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9342                                        $rtmp$$Register, true, vlen_enc);
 9343   %}
 9344   ins_pipe( pipe_slow );
 9345 %}
 9346 
 9347 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9348   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9349             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9350   match(Set dst (CountLeadingZerosV src));
 9351   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9352   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9353   ins_encode %{
 9354     int vlen_enc = vector_length_encoding(this, $src);
 9355     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9356     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9357                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9358   %}
 9359   ins_pipe( pipe_slow );
 9360 %}
 9361 
 9362 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9363   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9364             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9365   match(Set dst (CountLeadingZerosV src));
 9366   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9367   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9368   ins_encode %{
 9369     int vlen_enc = vector_length_encoding(this, $src);
 9370     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9371     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9372                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9373   %}
 9374   ins_pipe( pipe_slow );
 9375 %}
 9376 
 9377 // ---------------------------------- Vector Masked Operations ------------------------------------
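       // Opmask-predicated arithmetic. Each rule takes the mask as the last input and forwards
       // the ideal opcode to evmasked_op, which emits the EVEX-masked form of the operation; the
       // boolean argument appears to select merge masking, so inactive lanes keep $dst's value.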
 9378 
 9379 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9380   match(Set dst (AddVB (Binary dst src2) mask));
 9381   match(Set dst (AddVS (Binary dst src2) mask));
 9382   match(Set dst (AddVI (Binary dst src2) mask));
 9383   match(Set dst (AddVL (Binary dst src2) mask));
 9384   match(Set dst (AddVF (Binary dst src2) mask));
 9385   match(Set dst (AddVD (Binary dst src2) mask));
 9386   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9387   ins_encode %{
 9388     int vlen_enc = vector_length_encoding(this);
 9389     BasicType bt = Matcher::vector_element_basic_type(this);
 9390     int opc = this->ideal_Opcode();
 9391     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9392                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9393   %}
 9394   ins_pipe( pipe_slow );
 9395 %}
 9396 
 9397 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9398   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9399   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9400   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9401   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9402   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9403   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9404   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9405   ins_encode %{
 9406     int vlen_enc = vector_length_encoding(this);
 9407     BasicType bt = Matcher::vector_element_basic_type(this);
 9408     int opc = this->ideal_Opcode();
 9409     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9410                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9411   %}
 9412   ins_pipe( pipe_slow );
 9413 %}
 9414 
 9415 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9416   match(Set dst (XorV (Binary dst src2) mask));
 9417   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9418   ins_encode %{
 9419     int vlen_enc = vector_length_encoding(this);
 9420     BasicType bt = Matcher::vector_element_basic_type(this);
 9421     int opc = this->ideal_Opcode();
 9422     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9423                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9424   %}
 9425   ins_pipe( pipe_slow );
 9426 %}
 9427 
 9428 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9429   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9430   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9431   ins_encode %{
 9432     int vlen_enc = vector_length_encoding(this);
 9433     BasicType bt = Matcher::vector_element_basic_type(this);
 9434     int opc = this->ideal_Opcode();
 9435     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9436                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9437   %}
 9438   ins_pipe( pipe_slow );
 9439 %}
 9440 
 9441 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9442   match(Set dst (OrV (Binary dst src2) mask));
 9443   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9444   ins_encode %{
 9445     int vlen_enc = vector_length_encoding(this);
 9446     BasicType bt = Matcher::vector_element_basic_type(this);
 9447     int opc = this->ideal_Opcode();
 9448     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9449                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9450   %}
 9451   ins_pipe( pipe_slow );
 9452 %}
 9453 
 9454 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9455   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9456   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9457   ins_encode %{
 9458     int vlen_enc = vector_length_encoding(this);
 9459     BasicType bt = Matcher::vector_element_basic_type(this);
 9460     int opc = this->ideal_Opcode();
 9461     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9462                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9463   %}
 9464   ins_pipe( pipe_slow );
 9465 %}
 9466 
 9467 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9468   match(Set dst (AndV (Binary dst src2) mask));
 9469   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9470   ins_encode %{
 9471     int vlen_enc = vector_length_encoding(this);
 9472     BasicType bt = Matcher::vector_element_basic_type(this);
 9473     int opc = this->ideal_Opcode();
 9474     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9475                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9476   %}
 9477   ins_pipe( pipe_slow );
 9478 %}
 9479 
 9480 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9481   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9482   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9483   ins_encode %{
 9484     int vlen_enc = vector_length_encoding(this);
 9485     BasicType bt = Matcher::vector_element_basic_type(this);
 9486     int opc = this->ideal_Opcode();
 9487     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9488                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9489   %}
 9490   ins_pipe( pipe_slow );
 9491 %}
 9492 
 9493 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9494   match(Set dst (SubVB (Binary dst src2) mask));
 9495   match(Set dst (SubVS (Binary dst src2) mask));
 9496   match(Set dst (SubVI (Binary dst src2) mask));
 9497   match(Set dst (SubVL (Binary dst src2) mask));
 9498   match(Set dst (SubVF (Binary dst src2) mask));
 9499   match(Set dst (SubVD (Binary dst src2) mask));
 9500   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9501   ins_encode %{
 9502     int vlen_enc = vector_length_encoding(this);
 9503     BasicType bt = Matcher::vector_element_basic_type(this);
 9504     int opc = this->ideal_Opcode();
 9505     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9506                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9507   %}
 9508   ins_pipe( pipe_slow );
 9509 %}
 9510 
 9511 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9512   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9513   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9514   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9515   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9516   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9517   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9518   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9519   ins_encode %{
 9520     int vlen_enc = vector_length_encoding(this);
 9521     BasicType bt = Matcher::vector_element_basic_type(this);
 9522     int opc = this->ideal_Opcode();
 9523     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9524                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9525   %}
 9526   ins_pipe( pipe_slow );
 9527 %}
 9528 
 9529 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9530   match(Set dst (MulVS (Binary dst src2) mask));
 9531   match(Set dst (MulVI (Binary dst src2) mask));
 9532   match(Set dst (MulVL (Binary dst src2) mask));
 9533   match(Set dst (MulVF (Binary dst src2) mask));
 9534   match(Set dst (MulVD (Binary dst src2) mask));
 9535   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9536   ins_encode %{
 9537     int vlen_enc = vector_length_encoding(this);
 9538     BasicType bt = Matcher::vector_element_basic_type(this);
 9539     int opc = this->ideal_Opcode();
 9540     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9541                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9542   %}
 9543   ins_pipe( pipe_slow );
 9544 %}
 9545 
 9546 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9547   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9548   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9549   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9550   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9551   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9552   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9553   ins_encode %{
 9554     int vlen_enc = vector_length_encoding(this);
 9555     BasicType bt = Matcher::vector_element_basic_type(this);
 9556     int opc = this->ideal_Opcode();
 9557     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9558                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9559   %}
 9560   ins_pipe( pipe_slow );
 9561 %}
 9562 
 9563 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9564   match(Set dst (SqrtVF dst mask));
 9565   match(Set dst (SqrtVD dst mask));
 9566   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9567   ins_encode %{
 9568     int vlen_enc = vector_length_encoding(this);
 9569     BasicType bt = Matcher::vector_element_basic_type(this);
 9570     int opc = this->ideal_Opcode();
 9571     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9572                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9573   %}
 9574   ins_pipe( pipe_slow );
 9575 %}
 9576 
 9577 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9578   match(Set dst (DivVF (Binary dst src2) mask));
 9579   match(Set dst (DivVD (Binary dst src2) mask));
 9580   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9581   ins_encode %{
 9582     int vlen_enc = vector_length_encoding(this);
 9583     BasicType bt = Matcher::vector_element_basic_type(this);
 9584     int opc = this->ideal_Opcode();
 9585     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9586                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9587   %}
 9588   ins_pipe( pipe_slow );
 9589 %}
 9590 
 9591 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9592   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9593   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9594   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9595   ins_encode %{
 9596     int vlen_enc = vector_length_encoding(this);
 9597     BasicType bt = Matcher::vector_element_basic_type(this);
 9598     int opc = this->ideal_Opcode();
 9599     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9600                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9601   %}
 9602   ins_pipe( pipe_slow );
 9603 %}
 9604 
 9605 
 9606 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9607   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9608   match(Set dst (RotateRightV (Binary dst shift) mask));
 9609   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9610   ins_encode %{
 9611     int vlen_enc = vector_length_encoding(this);
 9612     BasicType bt = Matcher::vector_element_basic_type(this);
 9613     int opc = this->ideal_Opcode();
 9614     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9615                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9616   %}
 9617   ins_pipe( pipe_slow );
 9618 %}
 9619 
 9620 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9621   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9622   match(Set dst (RotateRightV (Binary dst src2) mask));
 9623   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9624   ins_encode %{
 9625     int vlen_enc = vector_length_encoding(this);
 9626     BasicType bt = Matcher::vector_element_basic_type(this);
 9627     int opc = this->ideal_Opcode();
 9628     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9629                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9630   %}
 9631   ins_pipe( pipe_slow );
 9632 %}
 9633 
 9634 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9635   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9636   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9637   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9638   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9639   ins_encode %{
 9640     int vlen_enc = vector_length_encoding(this);
 9641     BasicType bt = Matcher::vector_element_basic_type(this);
 9642     int opc = this->ideal_Opcode();
 9643     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9644                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9650   predicate(!n->as_ShiftV()->is_var_shift());
 9651   match(Set dst (LShiftVS (Binary dst src2) mask));
 9652   match(Set dst (LShiftVI (Binary dst src2) mask));
 9653   match(Set dst (LShiftVL (Binary dst src2) mask));
 9654   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9655   ins_encode %{
 9656     int vlen_enc = vector_length_encoding(this);
 9657     BasicType bt = Matcher::vector_element_basic_type(this);
 9658     int opc = this->ideal_Opcode();
 9659     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9660                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9661   %}
 9662   ins_pipe( pipe_slow );
 9663 %}
 9664 
 9665 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9666   predicate(n->as_ShiftV()->is_var_shift());
 9667   match(Set dst (LShiftVS (Binary dst src2) mask));
 9668   match(Set dst (LShiftVI (Binary dst src2) mask));
 9669   match(Set dst (LShiftVL (Binary dst src2) mask));
 9670   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9671   ins_encode %{
 9672     int vlen_enc = vector_length_encoding(this);
 9673     BasicType bt = Matcher::vector_element_basic_type(this);
 9674     int opc = this->ideal_Opcode();
 9675     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9676                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9677   %}
 9678   ins_pipe( pipe_slow );
 9679 %}
 9680 
 9681 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9682   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9683   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9684   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9685   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9686   ins_encode %{
 9687     int vlen_enc = vector_length_encoding(this);
 9688     BasicType bt = Matcher::vector_element_basic_type(this);
 9689     int opc = this->ideal_Opcode();
 9690     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9691                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9692   %}
 9693   ins_pipe( pipe_slow );
 9694 %}
 9695 
 9696 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9697   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9698   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9699   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9700   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9701   ins_encode %{
 9702     int vlen_enc = vector_length_encoding(this);
 9703     BasicType bt = Matcher::vector_element_basic_type(this);
 9704     int opc = this->ideal_Opcode();
 9705     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9706                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9707   %}
 9708   ins_pipe( pipe_slow );
 9709 %}
 9710 
 9711 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9712   predicate(!n->as_ShiftV()->is_var_shift());
 9713   match(Set dst (RShiftVS (Binary dst src2) mask));
 9714   match(Set dst (RShiftVI (Binary dst src2) mask));
 9715   match(Set dst (RShiftVL (Binary dst src2) mask));
 9716   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9717   ins_encode %{
 9718     int vlen_enc = vector_length_encoding(this);
 9719     BasicType bt = Matcher::vector_element_basic_type(this);
 9720     int opc = this->ideal_Opcode();
 9721     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9722                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9723   %}
 9724   ins_pipe( pipe_slow );
 9725 %}
 9726 
 9727 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9728   predicate(n->as_ShiftV()->is_var_shift());
 9729   match(Set dst (RShiftVS (Binary dst src2) mask));
 9730   match(Set dst (RShiftVI (Binary dst src2) mask));
 9731   match(Set dst (RShiftVL (Binary dst src2) mask));
 9732   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9733   ins_encode %{
 9734     int vlen_enc = vector_length_encoding(this);
 9735     BasicType bt = Matcher::vector_element_basic_type(this);
 9736     int opc = this->ideal_Opcode();
 9737     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9738                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9739   %}
 9740   ins_pipe( pipe_slow );
 9741 %}
 9742 
 9743 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9744   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9745   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9746   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9747   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9748   ins_encode %{
 9749     int vlen_enc = vector_length_encoding(this);
 9750     BasicType bt = Matcher::vector_element_basic_type(this);
 9751     int opc = this->ideal_Opcode();
 9752     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9753                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9754   %}
 9755   ins_pipe( pipe_slow );
 9756 %}
 9757 
 9758 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9759   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9760   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9761   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9762   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9763   ins_encode %{
 9764     int vlen_enc = vector_length_encoding(this);
 9765     BasicType bt = Matcher::vector_element_basic_type(this);
 9766     int opc = this->ideal_Opcode();
 9767     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9768                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9769   %}
 9770   ins_pipe( pipe_slow );
 9771 %}
 9772 
 9773 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9774   predicate(!n->as_ShiftV()->is_var_shift());
 9775   match(Set dst (URShiftVS (Binary dst src2) mask));
 9776   match(Set dst (URShiftVI (Binary dst src2) mask));
 9777   match(Set dst (URShiftVL (Binary dst src2) mask));
 9778   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9779   ins_encode %{
 9780     int vlen_enc = vector_length_encoding(this);
 9781     BasicType bt = Matcher::vector_element_basic_type(this);
 9782     int opc = this->ideal_Opcode();
 9783     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9784                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9785   %}
 9786   ins_pipe( pipe_slow );
 9787 %}
 9788 
 9789 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9790   predicate(n->as_ShiftV()->is_var_shift());
 9791   match(Set dst (URShiftVS (Binary dst src2) mask));
 9792   match(Set dst (URShiftVI (Binary dst src2) mask));
 9793   match(Set dst (URShiftVL (Binary dst src2) mask));
 9794   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9795   ins_encode %{
 9796     int vlen_enc = vector_length_encoding(this);
 9797     BasicType bt = Matcher::vector_element_basic_type(this);
 9798     int opc = this->ideal_Opcode();
 9799     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9800                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9801   %}
 9802   ins_pipe( pipe_slow );
 9803 %}
 9804 
 9805 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9806   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9807   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9808   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9809   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9810   ins_encode %{
 9811     int vlen_enc = vector_length_encoding(this);
 9812     BasicType bt = Matcher::vector_element_basic_type(this);
 9813     int opc = this->ideal_Opcode();
 9814     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9815                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9816   %}
 9817   ins_pipe( pipe_slow );
 9818 %}
 9819 
 9820 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9821   match(Set dst (MaxV (Binary dst src2) mask));
 9822   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9823   ins_encode %{
 9824     int vlen_enc = vector_length_encoding(this);
 9825     BasicType bt = Matcher::vector_element_basic_type(this);
 9826     int opc = this->ideal_Opcode();
 9827     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9828                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9829   %}
 9830   ins_pipe( pipe_slow );
 9831 %}
 9832 
 9833 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9834   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9835   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9836   ins_encode %{
 9837     int vlen_enc = vector_length_encoding(this);
 9838     BasicType bt = Matcher::vector_element_basic_type(this);
 9839     int opc = this->ideal_Opcode();
 9840     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9841                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9842   %}
 9843   ins_pipe( pipe_slow );
 9844 %}
 9845 
 9846 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9847   match(Set dst (MinV (Binary dst src2) mask));
 9848   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9849   ins_encode %{
 9850     int vlen_enc = vector_length_encoding(this);
 9851     BasicType bt = Matcher::vector_element_basic_type(this);
 9852     int opc = this->ideal_Opcode();
 9853     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9854                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9855   %}
 9856   ins_pipe( pipe_slow );
 9857 %}
 9858 
 9859 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9860   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9861   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9862   ins_encode %{
 9863     int vlen_enc = vector_length_encoding(this);
 9864     BasicType bt = Matcher::vector_element_basic_type(this);
 9865     int opc = this->ideal_Opcode();
 9866     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9867                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9868   %}
 9869   ins_pipe( pipe_slow );
 9870 %}
 9871 
 9872 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9873   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9874   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9875   ins_encode %{
 9876     int vlen_enc = vector_length_encoding(this);
 9877     BasicType bt = Matcher::vector_element_basic_type(this);
 9878     int opc = this->ideal_Opcode();
 9879     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9880                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9881   %}
 9882   ins_pipe( pipe_slow );
 9883 %}
 9884 
 9885 instruct vabs_masked(vec dst, kReg mask) %{
 9886   match(Set dst (AbsVB dst mask));
 9887   match(Set dst (AbsVS dst mask));
 9888   match(Set dst (AbsVI dst mask));
 9889   match(Set dst (AbsVL dst mask));
 9890   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
 9891   ins_encode %{
 9892     int vlen_enc = vector_length_encoding(this);
 9893     BasicType bt = Matcher::vector_element_basic_type(this);
 9894     int opc = this->ideal_Opcode();
 9895     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9896                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9897   %}
 9898   ins_pipe( pipe_slow );
 9899 %}
 9900 
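      // Masked fused multiply-add for float/double vectors, with register and memory variants; requires UseFMA.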
 9901 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
 9902   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
 9903   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
 9904   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9905   ins_encode %{
 9906     assert(UseFMA, "Needs FMA instruction support.");
 9907     int vlen_enc = vector_length_encoding(this);
 9908     BasicType bt = Matcher::vector_element_basic_type(this);
 9909     int opc = this->ideal_Opcode();
 9910     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9911                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
 9912   %}
 9913   ins_pipe( pipe_slow );
 9914 %}
 9915 
 9916 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
 9917   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
 9918   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
 9919   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9920   ins_encode %{
 9921     assert(UseFMA, "Needs FMA instruction support.");
 9922     int vlen_enc = vector_length_encoding(this);
 9923     BasicType bt = Matcher::vector_element_basic_type(this);
 9924     int opc = this->ideal_Opcode();
 9925     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9926                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
 9927   %}
 9928   ins_pipe( pipe_slow );
 9929 %}
 9930 
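      // Masked vector compare producing an opmask (k) register; the comparison condition is supplied as an immediate.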
 9931 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
 9932   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
 9933   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
 9934   ins_encode %{
 9935     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 9936     int vlen_enc = vector_length_encoding(this, $src1);
 9937     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9938 
 9939     // Dispatch the masked compare on the element type of src1.
 9940     switch (src1_elem_bt) {
 9941       case T_BYTE: {
 9942         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9943         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9944         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9945         break;
 9946       }
 9947       case T_SHORT: {
 9948         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9949         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9950         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9951         break;
 9952       }
 9953       case T_INT: {
 9954         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9955         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9956         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9957         break;
 9958       }
 9959       case T_LONG: {
 9960         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9961         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9962         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9963         break;
 9964       }
 9965       case T_FLOAT: {
 9966         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9967         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9968         break;
 9969       }
 9970       case T_DOUBLE: {
 9971         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9972         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9973         break;
 9974       }
 9975       default: assert(false, "%s", type2name(src1_elem_bt)); break;
 9976     }
 9977   %}
 9978   ins_pipe( pipe_slow );
 9979 %}
 9980 
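      // MaskAll: replicate the scalar boolean in src across the low mask_len bits of an opmask register (mask lengths up to 32).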
 9981 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
 9982   predicate(Matcher::vector_length(n) <= 32);
 9983   match(Set dst (MaskAll src));
 9984   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
 9985   ins_encode %{
 9986     int mask_len = Matcher::vector_length(this);
 9987     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
 9988   %}
 9989   ins_pipe( pipe_slow );
 9990 %}
 9991 
 9992 #ifdef _LP64
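      // Mask negation, matched as XOR with MaskAll(-1); the variants are split by mask length and available AVX-512 extensions.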
 9993 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
 9994   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
 9995   match(Set dst (XorVMask src (MaskAll cnt)));
 9996   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
 9997   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
 9998   ins_encode %{
 9999     uint masklen = Matcher::vector_length(this);
10000     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10001   %}
10002   ins_pipe( pipe_slow );
10003 %}
10004 
10005 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10006   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10007             (Matcher::vector_length(n) == 16) ||
10008             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10009   match(Set dst (XorVMask src (MaskAll cnt)));
10010   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10011   ins_encode %{
10012     uint masklen = Matcher::vector_length(this);
10013     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10014   %}
10015   ins_pipe( pipe_slow );
10016 %}
10017 
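      // VectorLongToMask: expand the bits of a long into a boolean vector (AVX variants) or into an opmask register (EVEX variant).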
10018 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10019   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10020   match(Set dst (VectorLongToMask src));
10021   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10022   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10023   ins_encode %{
10024     int mask_len = Matcher::vector_length(this);
10025     int vec_enc  = vector_length_encoding(mask_len);
10026     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10027                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10028   %}
10029   ins_pipe( pipe_slow );
10030 %}
10031 
10032 
10033 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10034   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10035   match(Set dst (VectorLongToMask src));
10036   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10037   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10038   ins_encode %{
10039     int mask_len = Matcher::vector_length(this);
10040     assert(mask_len <= 32, "invalid mask length");
10041     int vec_enc  = vector_length_encoding(mask_len);
10042     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10043                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10049   predicate(n->bottom_type()->isa_vectmask());
10050   match(Set dst (VectorLongToMask src));
10051   format %{ "long_to_mask_evex $dst, $src\t!" %}
10052   ins_encode %{
10053     __ kmov($dst$$KRegister, $src$$Register);
10054   %}
10055   ins_pipe( pipe_slow );
10056 %}
10057 #endif
10058 
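      // Bitwise AND/OR/XOR directly on opmask registers; without AVX512DQ the narrowest k-register operation is 16 bits, hence the masklen adjustment.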
10059 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10060   match(Set dst (AndVMask src1 src2));
10061   match(Set dst (OrVMask src1 src2));
10062   match(Set dst (XorVMask src1 src2));
10063   effect(TEMP kscratch);
10064   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10065   ins_encode %{
10066     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10067     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10068     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "mask types must match");
10069     uint masklen = Matcher::vector_length(this);
10070     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10071     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10072   %}
10073   ins_pipe( pipe_slow );
10074 %}
10075 
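      // Masked ternary logic (vpternlog): func is the 8-bit truth table applied to dst, src2 and src3.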
10076 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10077   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10078   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10079   ins_encode %{
10080     int vlen_enc = vector_length_encoding(this);
10081     BasicType bt = Matcher::vector_element_basic_type(this);
10082     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10083                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10084   %}
10085   ins_pipe( pipe_slow );
10086 %}
10087 
10088 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10089   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10090   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10091   ins_encode %{
10092     int vlen_enc = vector_length_encoding(this);
10093     BasicType bt = Matcher::vector_element_basic_type(this);
10094     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10095                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10096   %}
10097   ins_pipe( pipe_slow );
10098 %}
10099 
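      // CastVV nodes only carry type information; they emit no code and have zero cost.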
10100 instruct castMM(kReg dst)
10101 %{
10102   match(Set dst (CastVV dst));
10103 
10104   size(0);
10105   format %{ "# castVV of $dst" %}
10106   ins_encode(/* empty encoding */);
10107   ins_cost(0);
10108   ins_pipe(empty);
10109 %}
10110 
10111 instruct castVV(vec dst)
10112 %{
10113   match(Set dst (CastVV dst));
10114 
10115   size(0);
10116   format %{ "# castVV of $dst" %}
10117   ins_encode(/* empty encoding */);
10118   ins_cost(0);
10119   ins_pipe(empty);
10120 %}
10121 
10122 instruct castVVLeg(legVec dst)
10123 %{
10124   match(Set dst (CastVV dst));
10125 
10126   size(0);
10127   format %{ "# castVV of $dst" %}
10128   ins_encode(/* empty encoding */);
10129   ins_cost(0);
10130   ins_pipe(empty);
10131 %}
10132 
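      // IsInfinite checks via vfpclass: immediate 0x18 selects the positive and negative infinity classes; the resulting mask bit is moved into a GPR.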
10133 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10134 %{
10135   match(Set dst (IsInfiniteF src));
10136   effect(TEMP ktmp, KILL cr);
10137   format %{ "float_class_check $dst, $src" %}
10138   ins_encode %{
10139     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10140     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10141   %}
10142   ins_pipe(pipe_slow);
10143 %}
10144 
10145 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10146 %{
10147   match(Set dst (IsInfiniteD src));
10148   effect(TEMP ktmp, KILL cr);
10149   format %{ "double_class_check $dst, $src" %}
10150   ins_encode %{
10151     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10152     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10153   %}
10154   ins_pipe(pipe_slow);
10155 %}
10156 
10157