//
// Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters
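//
// As an example of how to read these entries: the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares word (a) of xmm0 as a save-on-call float slot with opcode
// encoding 0, and XMM0b-XMM0p cover the remaining fifteen 32-bit words
// of the same 512-bit register.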

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );
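// Dynamically select the EVEX register class when the CPU-feature predicate
// holds, otherwise fall back to the legacy (XMM0-XMM15) class.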
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
// The handler is emitted as a jump to the runtime exception blob.
 1309 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   C2_MacroAssembler _masm(&cbuf);
 1314   address base = __ start_a_stub(size_exception_handler());
 1315   if (base == nullptr) {
 1316     ciEnv::current()->record_failure("CodeCache is full");
 1317     return 0;  // CodeBuffer::expand failed
 1318   }
 1319   int offset = __ offset();
 1320   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1321   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1322   __ end_a_stub();
 1323   return offset;
 1324 }
 1325 
 1326 // Emit deopt handler code.
 1327 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1328 
 1329   // Note that the code buffer's insts_mark is always relative to insts.
 1330   // That's why we must use the macroassembler to generate a handler.
 1331   C2_MacroAssembler _masm(&cbuf);
 1332   address base = __ start_a_stub(size_deopt_handler());
 1333   if (base == nullptr) {
 1334     ciEnv::current()->record_failure("CodeCache is full");
 1335     return 0;  // CodeBuffer::expand failed
 1336   }
 1337   int offset = __ offset();
 1338 
 1339 #ifdef _LP64
 1340   address the_pc = (address) __ pc();
 1341   Label next;
 1342   // push a "the_pc" on the stack without destroying any registers
 1343   // as they all may be live.
 1344 
 1345   // push address of "next"
 1346   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1347   __ bind(next);
 1348   // adjust it so it matches "the_pc"
 1349   __ subptr(Address(rsp, 0), __ offset() - offset);
 1350 #else
 1351   InternalAddress here(__ pc());
 1352   __ pushptr(here.addr(), noreg);
 1353 #endif
 1354 
 1355   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1356   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1357   __ end_a_stub();
 1358   return offset;
 1359 }
 1360 
 1361 Assembler::Width widthForType(BasicType bt) {
 1362   if (bt == T_BYTE) {
 1363     return Assembler::B;
 1364   } else if (bt == T_SHORT) {
 1365     return Assembler::W;
 1366   } else if (bt == T_INT) {
 1367     return Assembler::D;
 1368   } else {
 1369     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1370     return Assembler::Q;
 1371   }
 1372 }
 1373 
 1374 //=============================================================================
 1375 
 1376   // Float masks come from different places depending on platform.
 1377 #ifdef _LP64
 1378   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1379   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1380   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1381   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1382 #else
 1383   static address float_signmask()  { return (address)float_signmask_pool; }
 1384   static address float_signflip()  { return (address)float_signflip_pool; }
 1385   static address double_signmask() { return (address)double_signmask_pool; }
 1386   static address double_signflip() { return (address)double_signflip_pool; }
 1387 #endif
 1388   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1389   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1390   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1391   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1392   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1393   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1394   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1395   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1396   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1397   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1398   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1399   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1400   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1401   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1402   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1403 
 1404 //=============================================================================
 1405 bool Matcher::match_rule_supported(int opcode) {
 1406   if (!has_match_rule(opcode)) {
 1407     return false; // no match rule present
 1408   }
 1409   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1410   switch (opcode) {
 1411     case Op_AbsVL:
 1412     case Op_StoreVectorScatter:
 1413       if (UseAVX < 3) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountI:
 1418     case Op_PopCountL:
 1419       if (!UsePopCountInstruction) {
 1420         return false;
 1421       }
 1422       break;
 1423     case Op_PopCountVI:
 1424       if (UseAVX < 2) {
 1425         return false;
 1426       }
 1427       break;
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
 1444       if (VM_Version::supports_avx512dq() == false) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
 1525       if (VM_Version::supports_on_spin_wait() == false) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
 1567       if (VM_Version::supports_avx() == false) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572       if (UseAVX < 2) {
 1573         return false;
 1574       }
 1575       break;
 1576     case Op_FmaF:
 1577     case Op_FmaD:
 1578     case Op_FmaVD:
 1579     case Op_FmaVF:
 1580       if (!UseFMA) {
 1581         return false;
 1582       }
 1583       break;
 1584     case Op_MacroLogicV:
 1585       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1586         return false;
 1587       }
 1588       break;
 1589 
 1590     case Op_VectorCmpMasked:
 1591     case Op_VectorMaskGen:
 1592       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1593         return false;
 1594       }
 1595       break;
 1596     case Op_VectorMaskFirstTrue:
 1597     case Op_VectorMaskLastTrue:
 1598     case Op_VectorMaskTrueCount:
 1599     case Op_VectorMaskToLong:
 1600       if (!is_LP64 || UseAVX < 1) {
 1601          return false;
 1602       }
 1603       break;
 1604     case Op_RoundF:
 1605     case Op_RoundD:
 1606       if (!is_LP64) {
 1607         return false;
 1608       }
 1609       break;
 1610     case Op_CopySignD:
 1611     case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64) {
 1613         return false;
 1614       }
 1615       if (!VM_Version::supports_avx512vl()) {
 1616         return false;
 1617       }
 1618       break;
 1619 #ifndef _LP64
 1620     case Op_AddReductionVF:
 1621     case Op_AddReductionVD:
 1622     case Op_MulReductionVF:
 1623     case Op_MulReductionVD:
 1624       if (UseSSE < 1) { // requires at least SSE
 1625         return false;
 1626       }
 1627       break;
 1628     case Op_MulAddVS2VI:
 1629     case Op_RShiftVL:
 1630     case Op_AbsVD:
 1631     case Op_NegVD:
 1632       if (UseSSE < 2) {
 1633         return false;
 1634       }
 1635       break;
 1636 #endif // !LP64
 1637     case Op_CompressBits:
 1638       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1639         return false;
 1640       }
 1641       break;
 1642     case Op_ExpandBits:
 1643       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1644         return false;
 1645       }
 1646       break;
 1647     case Op_SignumF:
 1648       if (UseSSE < 1) {
 1649         return false;
 1650       }
 1651       break;
 1652     case Op_SignumD:
 1653       if (UseSSE < 2) {
 1654         return false;
 1655       }
 1656       break;
 1657     case Op_CompressM:
 1658       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1659         return false;
 1660       }
 1661       break;
 1662     case Op_CompressV:
 1663     case Op_ExpandV:
 1664       if (!VM_Version::supports_avx512vl()) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtF:
 1669       if (UseSSE < 1) {
 1670         return false;
 1671       }
 1672       break;
 1673     case Op_SqrtD:
 1674 #ifdef _LP64
 1675       if (UseSSE < 2) {
 1676         return false;
 1677       }
 1678 #else
 1679       // x86_32.ad has a special match rule for SqrtD.
 1680       // Together with common x86 rules, this handles all UseSSE cases.
 1681 #endif
 1682       break;
 1683     case Op_ConvF2HF:
 1684     case Op_ConvHF2F:
 1685       if (!VM_Version::supports_float16()) {
 1686         return false;
 1687       }
 1688       break;
 1689     case Op_VectorCastF2HF:
 1690     case Op_VectorCastHF2F:
 1691       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1692         return false;
 1693       }
 1694       break;
 1695   }
 1696   return true;  // Match rules are supported by default.
 1697 }
 1698 
 1699 //------------------------------------------------------------------------
 1700 
 1701 static inline bool is_pop_count_instr_target(BasicType bt) {
 1702   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1703          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1704 }
 1705 
 1706 bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1707   return match_rule_supported_vector(opcode, vlen, bt);
 1708 }
 1709 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1712 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1713   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1714   if (!match_rule_supported(opcode)) {
 1715     return false;
 1716   }
 1717   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1718   //   * SSE2 supports 128bit vectors for all types;
 1719   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1720   //   * AVX2 supports 256bit vectors for all types;
 1721   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1722   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1723   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1724   // And MaxVectorSize is taken into account as well.
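  // For illustration (not exhaustive): with UseAVX == 1, an 8-element FLOAT vector
  // (256 bits) passes this size check, while an 8-element INT vector does not, since
  // AVX1 only provides 256bit vectors for FLOAT and DOUBLE.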
 1725   if (!vector_size_supported(bt, vlen)) {
 1726     return false;
 1727   }
 1728   // Special cases which require vector length follow:
 1729   //   * implementation limitations
 1730   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1731   //   * 128bit vroundpd instruction is present only in AVX1
 1732   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1733   switch (opcode) {
 1734     case Op_AbsVF:
 1735     case Op_NegVF:
 1736       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1737         return false; // 512bit vandps and vxorps are not available
 1738       }
 1739       break;
 1740     case Op_AbsVD:
 1741     case Op_NegVD:
 1742       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1743         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1744       }
 1745       break;
 1746     case Op_RotateRightV:
 1747     case Op_RotateLeftV:
 1748       if (bt != T_INT && bt != T_LONG) {
 1749         return false;
 1750       } // fallthrough
 1751     case Op_MacroLogicV:
 1752       if (!VM_Version::supports_evex() ||
 1753           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_ClearArray:
 1758     case Op_VectorMaskGen:
 1759     case Op_VectorCmpMasked:
 1760       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1761         return false;
 1762       }
 1763       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1764         return false;
 1765       }
 1766       break;
 1767     case Op_LoadVectorMasked:
 1768     case Op_StoreVectorMasked:
 1769       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1770         return false;
 1771       }
 1772       break;
 1773     case Op_MaxV:
 1774     case Op_MinV:
 1775       if (UseSSE < 4 && is_integral_type(bt)) {
 1776         return false;
 1777       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        // 512-bit Float/Double intrinsics need AVX512DQ.
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) {
          return false;
        }
      }
 1787       break;
 1788     case Op_CallLeafVector:
 1789       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1790         return false;
 1791       }
 1792       break;
 1793     case Op_AddReductionVI:
 1794       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1795         return false;
 1796       }
 1797       // fallthrough
 1798     case Op_AndReductionV:
 1799     case Op_OrReductionV:
 1800     case Op_XorReductionV:
 1801       if (is_subword_type(bt) && (UseSSE < 4)) {
 1802         return false;
 1803       }
 1804 #ifndef _LP64
 1805       if (bt == T_BYTE || bt == T_LONG) {
 1806         return false;
 1807       }
 1808 #endif
 1809       break;
 1810 #ifndef _LP64
 1811     case Op_VectorInsert:
 1812       if (bt == T_LONG || bt == T_DOUBLE) {
 1813         return false;
 1814       }
 1815       break;
 1816 #endif
 1817     case Op_MinReductionV:
 1818     case Op_MaxReductionV:
 1819       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1820         return false;
 1821       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1822         return false;
 1823       }
 1824       // Float/Double intrinsics enabled for AVX family.
 1825       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1826         return false;
 1827       }
 1828       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1829         return false;
 1830       }
 1831 #ifndef _LP64
 1832       if (bt == T_BYTE || bt == T_LONG) {
 1833         return false;
 1834       }
 1835 #endif
 1836       break;
 1837     case Op_VectorTest:
 1838       if (UseSSE < 4) {
 1839         return false; // Implementation limitation
 1840       } else if (size_in_bits < 32) {
 1841         return false; // Implementation limitation
 1842       }
 1843       break;
 1844     case Op_VectorLoadShuffle:
 1845     case Op_VectorRearrange:
      if (vlen == 2) {
 1847         return false; // Implementation limitation due to how shuffle is loaded
 1848       } else if (size_in_bits == 256 && UseAVX < 2) {
 1849         return false; // Implementation limitation
 1850       }
 1851       break;
 1852     case Op_VectorLoadMask:
 1853     case Op_VectorMaskCast:
 1854       if (size_in_bits == 256 && UseAVX < 2) {
 1855         return false; // Implementation limitation
 1856       }
 1857       // fallthrough
 1858     case Op_VectorStoreMask:
 1859       if (vlen == 2) {
 1860         return false; // Implementation limitation
 1861       }
 1862       break;
 1863     case Op_PopulateIndex:
 1864       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_VectorCastB2X:
 1869     case Op_VectorCastS2X:
 1870     case Op_VectorCastI2X:
 1871       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1872         return false;
 1873       }
 1874       break;
 1875     case Op_VectorCastL2X:
 1876       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1877         return false;
 1878       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1879         return false;
 1880       }
 1881       break;
 1882     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types happen after an
        // intermediate conversion to int, and the special handling code needs the AVX2
        // vpcmpeqd instruction for 256-bit vectors.
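        // For example, casting an 8-element float vector (256 bits) to a byte vector first
        // produces an 8-element int vector, so UseAVX >= 2 is required for that source size.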
 1886         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1887         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1888           return false;
 1889         }
 1890       }
 1891       // fallthrough
 1892     case Op_VectorCastD2X:
 1893       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1894         return false;
 1895       }
 1896       break;
 1897     case Op_VectorCastF2HF:
 1898     case Op_VectorCastHF2F:
 1899       if (!VM_Version::supports_f16c() &&
 1900          ((!VM_Version::supports_evex() ||
 1901          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1902         return false;
 1903       }
 1904       break;
 1905     case Op_RoundVD:
 1906       if (!VM_Version::supports_avx512dq()) {
 1907         return false;
 1908       }
 1909       break;
 1910     case Op_MulReductionVI:
 1911       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1912         return false;
 1913       }
 1914       break;
 1915     case Op_LoadVectorGatherMasked:
 1916     case Op_StoreVectorScatterMasked:
 1917     case Op_StoreVectorScatter:
 1918       if (is_subword_type(bt)) {
 1919         return false;
 1920       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1921         return false;
 1922       }
 1923       // fallthrough
 1924     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1926         return false;
 1927       }
 1928       break;
 1929     case Op_MaskAll:
 1930       if (!VM_Version::supports_evex()) {
 1931         return false;
 1932       }
 1933       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1934         return false;
 1935       }
 1936       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1937         return false;
 1938       }
 1939       break;
 1940     case Op_VectorMaskCmp:
 1941       if (vlen < 2 || size_in_bits < 32) {
 1942         return false;
 1943       }
 1944       break;
 1945     case Op_CompressM:
 1946       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1947         return false;
 1948       }
 1949       break;
 1950     case Op_CompressV:
 1951     case Op_ExpandV:
 1952       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1953         return false;
 1954       }
      if (size_in_bits < 128) {
 1956         return false;
 1957       }
 1958       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1959         return false;
 1960       }
 1961       break;
 1962     case Op_VectorLongToMask:
 1963       if (UseAVX < 1 || !is_LP64) {
 1964         return false;
 1965       }
 1966       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1967         return false;
 1968       }
 1969       break;
 1970     case Op_SignumVD:
 1971     case Op_SignumVF:
 1972       if (UseAVX < 1) {
 1973         return false;
 1974       }
 1975       break;
 1976     case Op_PopCountVI:
 1977     case Op_PopCountVL: {
 1978         if (!is_pop_count_instr_target(bt) &&
 1979             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1980           return false;
 1981         }
 1982       }
 1983       break;
 1984     case Op_ReverseV:
 1985     case Op_ReverseBytesV:
 1986       if (UseAVX < 2) {
 1987         return false;
 1988       }
 1989       break;
 1990     case Op_CountTrailingZerosV:
 1991     case Op_CountLeadingZerosV:
 1992       if (UseAVX < 2) {
 1993         return false;
 1994       }
 1995       break;
 1996   }
 1997   return true;  // Per default match rules are supported.
 1998 }
 1999 
 2000 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode alone. Most unary/binary/ternary masked operations share the
  // IR nodes of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine therefore does a strict check for masked operation patterns: it returns
  // false by default for all opcodes other than the ones whose masked instruction
  // patterns are defined in this file.
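  // For illustration: a masked AddVB on a 256-bit vector is supported only when both
  // AVX512VL (sub-512-bit masked operations, checked below) and AVX512BW (subword element
  // types) are available.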
 2007   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2008     return false;
 2009   }
 2010 
 2011   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2012   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2013   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2014     return false;
 2015   }
 2016   switch(opcode) {
 2017     // Unary masked operations
 2018     case Op_AbsVB:
 2019     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 2021         return false;  // Implementation limitation
 2022       }
 2023     case Op_AbsVI:
 2024     case Op_AbsVL:
 2025       return true;
 2026 
 2027     // Ternary masked operations
 2028     case Op_FmaVF:
 2029     case Op_FmaVD:
 2030       return true;
 2031 
 2032     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2034         return false;
 2035       }
 2036       return true;
 2037 
 2038     // Binary masked operations
 2039     case Op_AddVB:
 2040     case Op_AddVS:
 2041     case Op_SubVB:
 2042     case Op_SubVS:
 2043     case Op_MulVS:
 2044     case Op_LShiftVS:
 2045     case Op_RShiftVS:
 2046     case Op_URShiftVS:
 2047       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2048       if (!VM_Version::supports_avx512bw()) {
 2049         return false;  // Implementation limitation
 2050       }
 2051       return true;
 2052 
 2053     case Op_MulVL:
 2054       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2055       if (!VM_Version::supports_avx512dq()) {
 2056         return false;  // Implementation limitation
 2057       }
 2058       return true;
 2059 
 2060     case Op_AndV:
 2061     case Op_OrV:
 2062     case Op_XorV:
 2063     case Op_RotateRightV:
 2064     case Op_RotateLeftV:
 2065       if (bt != T_INT && bt != T_LONG) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_VectorLoadMask:
 2071       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2072       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_AddVI:
 2078     case Op_AddVL:
 2079     case Op_AddVF:
 2080     case Op_AddVD:
 2081     case Op_SubVI:
 2082     case Op_SubVL:
 2083     case Op_SubVF:
 2084     case Op_SubVD:
 2085     case Op_MulVI:
 2086     case Op_MulVF:
 2087     case Op_MulVD:
 2088     case Op_DivVF:
 2089     case Op_DivVD:
 2090     case Op_SqrtVF:
 2091     case Op_SqrtVD:
 2092     case Op_LShiftVI:
 2093     case Op_LShiftVL:
 2094     case Op_RShiftVI:
 2095     case Op_RShiftVL:
 2096     case Op_URShiftVI:
 2097     case Op_URShiftVL:
 2098     case Op_LoadVectorMasked:
 2099     case Op_StoreVectorMasked:
 2100     case Op_LoadVectorGatherMasked:
 2101     case Op_StoreVectorScatterMasked:
 2102       return true;
 2103 
 2104     case Op_MaxV:
 2105     case Op_MinV:
 2106       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2107         return false; // Implementation limitation
 2108       }
 2109       if (is_floating_point_type(bt)) {
 2110         return false; // Implementation limitation
 2111       }
 2112       return true;
 2113 
 2114     case Op_VectorMaskCmp:
 2115       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2116         return false; // Implementation limitation
 2117       }
 2118       return true;
 2119 
 2120     case Op_VectorRearrange:
 2121       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2122         return false; // Implementation limitation
 2123       }
 2124       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2125         return false; // Implementation limitation
 2126       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2127         return false; // Implementation limitation
 2128       }
 2129       return true;
 2130 
 2131     // Binary Logical operations
 2132     case Op_AndVMask:
 2133     case Op_OrVMask:
 2134     case Op_XorVMask:
 2135       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2136         return false; // Implementation limitation
 2137       }
 2138       return true;
 2139 
 2140     case Op_PopCountVI:
 2141     case Op_PopCountVL:
 2142       if (!is_pop_count_instr_target(bt)) {
 2143         return false;
 2144       }
 2145       return true;
 2146 
 2147     case Op_MaskAll:
 2148       return true;
 2149 
 2150     case Op_CountLeadingZerosV:
 2151       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2152         return true;
 2153       }
 2154     default:
 2155       return false;
 2156   }
 2157 }
 2158 
 2159 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2160   return false;
 2161 }
 2162 
 2163 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2164   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2165   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2166   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2167       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2168     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2169     return new legVecZOper();
 2170   }
 2171   if (legacy) {
 2172     switch (ideal_reg) {
 2173       case Op_VecS: return new legVecSOper();
 2174       case Op_VecD: return new legVecDOper();
 2175       case Op_VecX: return new legVecXOper();
 2176       case Op_VecY: return new legVecYOper();
 2177       case Op_VecZ: return new legVecZOper();
 2178     }
 2179   } else {
 2180     switch (ideal_reg) {
 2181       case Op_VecS: return new vecSOper();
 2182       case Op_VecD: return new vecDOper();
 2183       case Op_VecX: return new vecXOper();
 2184       case Op_VecY: return new vecYOper();
 2185       case Op_VecZ: return new vecZOper();
 2186     }
 2187   }
 2188   ShouldNotReachHere();
 2189   return nullptr;
 2190 }
 2191 
 2192 bool Matcher::is_reg2reg_move(MachNode* m) {
 2193   switch (m->rule()) {
 2194     case MoveVec2Leg_rule:
 2195     case MoveLeg2Vec_rule:
 2196     case MoveF2VL_rule:
 2197     case MoveF2LEG_rule:
 2198     case MoveVL2F_rule:
 2199     case MoveLEG2F_rule:
 2200     case MoveD2VL_rule:
 2201     case MoveD2LEG_rule:
 2202     case MoveVL2D_rule:
 2203     case MoveLEG2D_rule:
 2204       return true;
 2205     default:
 2206       return false;
 2207   }
 2208 }
 2209 
 2210 bool Matcher::is_generic_vector(MachOper* opnd) {
 2211   switch (opnd->opcode()) {
 2212     case VEC:
 2213     case LEGVEC:
 2214       return true;
 2215     default:
 2216       return false;
 2217   }
 2218 }
 2219 
 2220 //------------------------------------------------------------------------
 2221 
 2222 const RegMask* Matcher::predicate_reg_mask(void) {
 2223   return &_VECTMASK_REG_mask;
 2224 }
 2225 
 2226 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2227   return new TypeVectMask(elemTy, length);
 2228 }
 2229 
 2230 // Max vector size in bytes. 0 if not supported.
 2231 int Matcher::vector_width_in_bytes(BasicType bt) {
 2232   assert(is_java_primitive(bt), "only primitive type vectors");
 2233   if (UseSSE < 2) return 0;
  // SSE2 supports 128bit vectors for all types.
  // AVX2 supports 256bit vectors for all types.
  // AVX512 (EVEX) supports 512bit vectors for all types.
 2237   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2238   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2239   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2240     size = (UseAVX > 2) ? 64 : 32;
 2241   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2242     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
  // Use the MaxVectorSize flag to limit vector size.
  size = MIN2(size, (int)MaxVectorSize);
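  // For illustration: with UseAVX == 2 and MaxVectorSize == 32, T_INT vectors are 32 bytes;
  // with AVX-512 and avx512bw support, T_BYTE vectors can be 64 bytes.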
 2245   // Minimum 2 values in vector (or 4 for bytes).
 2246   switch (bt) {
 2247   case T_DOUBLE:
 2248   case T_LONG:
 2249     if (size < 16) return 0;
 2250     break;
 2251   case T_FLOAT:
 2252   case T_INT:
 2253     if (size < 8) return 0;
 2254     break;
 2255   case T_BOOLEAN:
 2256     if (size < 4) return 0;
 2257     break;
 2258   case T_CHAR:
 2259     if (size < 4) return 0;
 2260     break;
 2261   case T_BYTE:
 2262     if (size < 4) return 0;
 2263     break;
 2264   case T_SHORT:
 2265     if (size < 4) return 0;
 2266     break;
 2267   default:
 2268     ShouldNotReachHere();
 2269   }
 2270   return size;
 2271 }
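
       // Worked example (illustrative only, based on the checks above): with UseAVX == 2
       // the base width is (1 << 2) * 8 = 32 bytes; with UseAVX == 3 it is 64 bytes, but
       // T_BYTE/T_SHORT/T_CHAR fall back to 32 bytes unless AVX512BW is available. The
       // result is then clamped by MaxVectorSize, and anything below the per-type minimum
       // (e.g. fewer than two T_LONG elements, i.e. under 16 bytes) returns 0 ("not supported").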
 2272 
 2273 // Limits on vector size (number of elements) loaded into vector.
 2274 int Matcher::max_vector_size(const BasicType bt) {
 2275   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2276 }
 2277 int Matcher::min_vector_size(const BasicType bt) {
 2278   int max_size = max_vector_size(bt);
 2279   // Minimum of 2 elements per vector (4 for byte-sized types), i.e. at least 4 bytes.
 2280   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2281   // Support calling SVML with single-element (double64) vectors.
 2282   if (bt == T_DOUBLE) {
 2283     size = 1;
 2284   }
 2285   return MIN2(size,max_size);
 2286 }
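
       // Illustration (derived from the logic above): byte-sized element types start at
       // 4 elements, every wider type at 2 elements, and T_DOUBLE may shrink to a single
       // element for the SVML case; the result never exceeds max_vector_size(bt).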
 2287 
 2288 int Matcher::superword_max_vector_size(const BasicType bt) {
 2289   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2290   // by default on Cascade Lake
 2291   if (VM_Version::is_default_intel_cascade_lake()) {
 2292     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2293   }
 2294   return Matcher::max_vector_size(bt);
 2295 }
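
       // Illustration (assuming AVX-512 with MaxVectorSize == 64 on a default-configured
       // Cascade Lake): max_vector_size(T_INT) is 16 elements, but SuperWord
       // auto-vectorization is capped at 32 / 4 = 8 elements; callers of
       // max_vector_size() itself are not affected by this cap.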
 2296 
 2297 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2298   return -1;
 2299 }
 2300 
 2301 // Vector ideal reg corresponding to specified size in bytes
 2302 uint Matcher::vector_ideal_reg(int size) {
 2303   assert(MaxVectorSize >= size, "");
 2304   switch(size) {
 2305     case  4: return Op_VecS;
 2306     case  8: return Op_VecD;
 2307     case 16: return Op_VecX;
 2308     case 32: return Op_VecY;
 2309     case 64: return Op_VecZ;
 2310   }
 2311   ShouldNotReachHere();
 2312   return 0;
 2313 }
 2314 
 2315 // Check for shift by small constant as well
 2316 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2317   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2318       shift->in(2)->get_int() <= 3 &&
 2319       // Are there other uses besides address expressions?
 2320       !matcher->is_visited(shift)) {
 2321     address_visited.set(shift->_idx); // Flag as address_visited
 2322     mstack.push(shift->in(2), Matcher::Visit);
 2323     Node *conv = shift->in(1);
 2324 #ifdef _LP64
 2325     // Allow the Matcher to match the rule which bypasses the
 2326     // ConvI2L operation for an array index on LP64
 2327     // if the index value is known to be non-negative.
 2328     if (conv->Opcode() == Op_ConvI2L &&
 2329         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2330         // Are there other uses besides address expressions?
 2331         !matcher->is_visited(conv)) {
 2332       address_visited.set(conv->_idx); // Flag as address_visited
 2333       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2334     } else
 2335 #endif
 2336       mstack.push(conv, Matcher::Pre_Visit);
 2337     return true;
 2338   }
 2339   return false;
 2340 }
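
       // Illustration (hypothetical shape, mirroring the checks above): for an address
       // sub-graph like (AddP base ptr (LShiftX (ConvI2L idx) 3)), the shift (and, on
       // LP64, a ConvI2L proven non-negative) is cloned into the address so the matcher
       // can fold it into a scaled addressing mode such as [ptr + idx*8] instead of
       // materializing the scaled index in a separate register.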
 2341 
 2342 // This function identifies sub-graphs in which a 'load' node is
 2343 // an input to two different nodes, such that the sub-graph can be
 2344 // matched with BMI instructions like blsi, blsr, etc.
 2345 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2346 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where both LoadL*
 2347 // occurrences refer to the same node.
 2348 //
 2349 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2350 // This is a temporary solution until we make DAGs expressible in ADL.
 2351 template<typename ConType>
 2352 class FusedPatternMatcher {
 2353   Node* _op1_node;
 2354   Node* _mop_node;
 2355   int _con_op;
 2356 
 2357   static int match_next(Node* n, int next_op, int next_op_idx) {
 2358     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2359       return -1;
 2360     }
 2361 
 2362     if (next_op_idx == -1) { // n is commutative, try rotations
 2363       if (n->in(1)->Opcode() == next_op) {
 2364         return 1;
 2365       } else if (n->in(2)->Opcode() == next_op) {
 2366         return 2;
 2367       }
 2368     } else {
 2369       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2370       if (n->in(next_op_idx)->Opcode() == next_op) {
 2371         return next_op_idx;
 2372       }
 2373     }
 2374     return -1;
 2375   }
 2376 
 2377  public:
 2378   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2379     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2380 
 2381   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2382              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2383              typename ConType::NativeType con_value) {
 2384     if (_op1_node->Opcode() != op1) {
 2385       return false;
 2386     }
 2387     if (_mop_node->outcnt() > 2) {
 2388       return false;
 2389     }
 2390     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2391     if (op1_op2_idx == -1) {
 2392       return false;
 2393     }
 2394     // Memory operation must be the other edge: (idx & 1) + 1 maps 1 -> 2 and 2 -> 1.
 2395     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2396 
 2397     // Check that the mop node is really what we want
 2398     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2399       Node* op2_node = _op1_node->in(op1_op2_idx);
 2400       if (op2_node->outcnt() > 1) {
 2401         return false;
 2402       }
 2403       assert(op2_node->Opcode() == op2, "Should be");
 2404       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2405       if (op2_con_idx == -1) {
 2406         return false;
 2407       }
 2408       // Memory operation must be the other edge
 2409       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2410       // Check that the memory operation is the same node
 2411       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2412         // Now check the constant
 2413         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2414         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2415           return true;
 2416         }
 2417       }
 2418     }
 2419     return false;
 2420   }
 2421 };
 2422 
 2423 static bool is_bmi_pattern(Node* n, Node* m) {
 2424   assert(UseBMI1Instructions, "sanity");
 2425   if (n != nullptr && m != nullptr) {
 2426     if (m->Opcode() == Op_LoadI) {
 2427       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2428       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2429              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2430              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2431     } else if (m->Opcode() == Op_LoadL) {
 2432       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2433       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2434              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2435              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2436     }
 2437   }
 2438   return false;
 2439 }
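
       // The three shapes accepted above correspond to BMI1 instructions (32-bit forms
       // shown; the long variants are analogous):
       //   (AndI (SubI 0 load) load)   -> blsi   r32, m32   // -x & x    : isolate lowest set bit
       //   (AndI (AddI load -1) load)  -> blsr   r32, m32   // (x-1) & x : clear lowest set bit
       //   (XorI (AddI load -1) load)  -> blsmsk r32, m32   // (x-1) ^ x : mask up to lowest set bit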
 2440 
 2441 // Should the matcher clone input 'm' of node 'n'?
 2442 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2443   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2444   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2445     mstack.push(m, Visit);
 2446     return true;
 2447   }
 2448   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2449     mstack.push(m, Visit);           // m = ShiftCntV
 2450     return true;
 2451   }
 2452   return false;
 2453 }
 2454 
 2455 // Should the Matcher clone shifts on addressing modes, expecting them
 2456 // to be subsumed into complex addressing expressions or compute them
 2457 // into registers?
 2458 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2459   Node *off = m->in(AddPNode::Offset);
 2460   if (off->is_Con()) {
 2461     address_visited.test_set(m->_idx); // Flag as address_visited
 2462     Node *adr = m->in(AddPNode::Address);
 2463 
 2464     // Intel can handle 2 adds in addressing mode
 2465     // AtomicAdd is not an addressing expression.
 2466     // Cheap to find it by looking for screwy base.
 2467     if (adr->is_AddP() &&
 2468         !adr->in(AddPNode::Base)->is_top() &&
 2469         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2470         // Are there other uses besides address expressions?
 2471         !is_visited(adr)) {
 2472       address_visited.set(adr->_idx); // Flag as address_visited
 2473       Node *shift = adr->in(AddPNode::Offset);
 2474       if (!clone_shift(shift, this, mstack, address_visited)) {
 2475         mstack.push(shift, Pre_Visit);
 2476       }
 2477       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2478       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2479     } else {
 2480       mstack.push(adr, Pre_Visit);
 2481     }
 2482 
 2483     // Clone X+offset as it also folds into most addressing expressions
 2484     mstack.push(off, Visit);
 2485     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2486     return true;
 2487   } else if (clone_shift(off, this, mstack, address_visited)) {
 2488     address_visited.test_set(m->_idx); // Flag as address_visited
 2489     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2490     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2491     return true;
 2492   }
 2493   return false;
 2494 }
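
       // Illustration (hypothetical, following the cases above): for
       // (AddP base (AddP base adr (LShiftX idx 2)) con), the inner AddP and its shift are
       // cloned into the address expression so a single operand of the form
       // [adr + idx*4 + con] can be matched; the two adds, the scaled index and the imm32
       // displacement all fold into one x86 addressing mode.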
 2495 
 2496 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2497   switch (bt) {
 2498     case BoolTest::eq:
 2499       return Assembler::eq;
 2500     case BoolTest::ne:
 2501       return Assembler::neq;
 2502     case BoolTest::le:
 2503     case BoolTest::ule:
 2504       return Assembler::le;
 2505     case BoolTest::ge:
 2506     case BoolTest::uge:
 2507       return Assembler::nlt;
 2508     case BoolTest::lt:
 2509     case BoolTest::ult:
 2510       return Assembler::lt;
 2511     case BoolTest::gt:
 2512     case BoolTest::ugt:
 2513       return Assembler::nle;
 2514     default : ShouldNotReachHere(); return Assembler::_false;
 2515   }
 2516 }
 2517 
 2518 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2519   switch (bt) {
 2520   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2521   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2522   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2523   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2524   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2525   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2526   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2527   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2528   }
 2529 }
 2530 
 2531 // Helper methods for MachSpillCopyNode::implementation().
 2532 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2533                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2534   assert(ireg == Op_VecS || // 32bit vector
 2535          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2536          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 2537          "no non-adjacent vector moves" );
 2538   if (cbuf) {
 2539     C2_MacroAssembler _masm(cbuf);
 2540     switch (ireg) {
 2541     case Op_VecS: // copy whole register
 2542     case Op_VecD:
 2543     case Op_VecX:
 2544 #ifndef _LP64
 2545       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2546 #else
 2547       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2548         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2549       } else {
 2550         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2551       }
 2552 #endif
 2553       break;
 2554     case Op_VecY:
 2555 #ifndef _LP64
 2556       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2557 #else
 2558       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2559         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2560       } else {
 2561         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2562       }
 2563 #endif
 2564       break;
 2565     case Op_VecZ:
 2566       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2567       break;
 2568     default:
 2569       ShouldNotReachHere();
 2570     }
 2571 #ifndef PRODUCT
 2572   } else {
 2573     switch (ireg) {
 2574     case Op_VecS:
 2575     case Op_VecD:
 2576     case Op_VecX:
 2577       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2578       break;
 2579     case Op_VecY:
 2580     case Op_VecZ:
 2581       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2582       break;
 2583     default:
 2584       ShouldNotReachHere();
 2585     }
 2586 #endif
 2587   }
 2588 }
 2589 
 2590 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2591                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2592   if (cbuf) {
 2593     C2_MacroAssembler _masm(cbuf);
 2594     if (is_load) {
 2595       switch (ireg) {
 2596       case Op_VecS:
 2597         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2598         break;
 2599       case Op_VecD:
 2600         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2601         break;
 2602       case Op_VecX:
 2603 #ifndef _LP64
 2604         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2605 #else
 2606         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2607           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2608         } else {
 2609           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2610           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2611         }
 2612 #endif
 2613         break;
 2614       case Op_VecY:
 2615 #ifndef _LP64
 2616         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2617 #else
 2618         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2619           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2620         } else {
 2621           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2622           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2623         }
 2624 #endif
 2625         break;
 2626       case Op_VecZ:
 2627         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2628         break;
 2629       default:
 2630         ShouldNotReachHere();
 2631       }
 2632     } else { // store
 2633       switch (ireg) {
 2634       case Op_VecS:
 2635         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2636         break;
 2637       case Op_VecD:
 2638         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2639         break;
 2640       case Op_VecX:
 2641 #ifndef _LP64
 2642         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2643 #else
 2644         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2645           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2646         } else {
 2648           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2649         }
 2650 #endif
 2651         break;
 2652       case Op_VecY:
 2653 #ifndef _LP64
 2654         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2655 #else
 2656         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2657           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2658         } else {
 2660           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2661         }
 2662 #endif
 2663         break;
 2664       case Op_VecZ:
 2665         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2666         break;
 2667       default:
 2668         ShouldNotReachHere();
 2669       }
 2670     }
 2671 #ifndef PRODUCT
 2672   } else {
 2673     if (is_load) {
 2674       switch (ireg) {
 2675       case Op_VecS:
 2676         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2677         break;
 2678       case Op_VecD:
 2679         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2680         break;
 2681       case Op_VecX:
 2682         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2683         break;
 2684       case Op_VecY:
 2685       case Op_VecZ:
 2686         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2687         break;
 2688       default:
 2689         ShouldNotReachHere();
 2690       }
 2691     } else { // store
 2692       switch (ireg) {
 2693       case Op_VecS:
 2694         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2695         break;
 2696       case Op_VecD:
 2697         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2698         break;
 2699       case Op_VecX:
 2700         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2701         break;
 2702       case Op_VecY:
 2703       case Op_VecZ:
 2704         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2705         break;
 2706       default:
 2707         ShouldNotReachHere();
 2708       }
 2709     }
 2710 #endif
 2711   }
 2712 }
 2713 
 2714 template <class T>
 2715 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2716   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2717   jvalue ele;
 2718   switch (bt) {
 2719     case T_BYTE:   ele.b = con; break;
 2720     case T_SHORT:  ele.s = con; break;
 2721     case T_INT:    ele.i = con; break;
 2722     case T_LONG:   ele.j = con; break;
 2723     case T_FLOAT:  ele.f = con; break;
 2724     case T_DOUBLE: ele.d = con; break;
 2725     default: ShouldNotReachHere();
 2726   }
 2727   for (int i = 0; i < len; i++) {
 2728     val->append(ele);
 2729   }
 2730   return val;
 2731 }
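
       // Usage sketch (hypothetical values): vreplicate_imm<jint>(T_INT, 1, 4) returns a
       // 4-entry GrowableArray<jvalue> whose elements all have .i == 1, i.e. the broadcast
       // constant {1, 1, 1, 1} for a replicated immediate.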
 2732 
 2733 static inline jlong high_bit_set(BasicType bt) {
 2734   switch (bt) {
 2735     case T_BYTE:  return 0x8080808080808080;
 2736     case T_SHORT: return 0x8000800080008000;
 2737     case T_INT:   return 0x8000000080000000;
 2738     case T_LONG:  return 0x8000000000000000;
 2739     default:
 2740       ShouldNotReachHere();
 2741       return 0;
 2742   }
 2743 }
 2744 
 2745 #ifndef PRODUCT
 2746   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2747     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2748   }
 2749 #endif
 2750 
 2751   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2752     C2_MacroAssembler _masm(&cbuf);
 2753     __ nop(_count);
 2754   }
 2755 
 2756   uint MachNopNode::size(PhaseRegAlloc*) const {
 2757     return _count;
 2758   }
 2759 
 2760 #ifndef PRODUCT
 2761   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2762     st->print("# breakpoint");
 2763   }
 2764 #endif
 2765 
 2766   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2767     C2_MacroAssembler _masm(&cbuf);
 2768     __ int3();
 2769   }
 2770 
 2771   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2772     return MachNode::size(ra_);
 2773   }
 2774 
 2775 %}
 2776 
 2777 encode %{
 2778 
 2779   enc_class call_epilog %{
 2780     C2_MacroAssembler _masm(&cbuf);
 2781     if (VerifyStackAtCalls) {
 2782       // Check that stack depth is unchanged: find majik cookie on stack
 2783       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2784       Label L;
 2785       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2786       __ jccb(Assembler::equal, L);
 2787       // Die if stack mismatch
 2788       __ int3();
 2789       __ bind(L);
 2790     }
 2791     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2792       C2_MacroAssembler _masm(&cbuf);
 2793       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
 2794       // Search for the corresponding projection, get the register and emit code that initializes it.
 2795       uint con = (tf()->range_cc()->cnt() - 1);
 2796       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2797         ProjNode* proj = fast_out(i)->as_Proj();
 2798         if (proj->_con == con) {
 2799           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2800           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2801           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2802           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2803           __ testq(rax, rax);
 2804           __ setb(Assembler::notZero, toReg);
 2805           __ movzbl(toReg, toReg);
 2806           if (reg->is_stack()) {
 2807             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2808             __ movq(Address(rsp, st_off), toReg);
 2809           }
 2810           break;
 2811         }
 2812       }
 2813       if (return_value_is_used()) {
 2814         // An inline type is returned as fields in multiple registers.
 2815         // Rax contains either an oop (if the inline type is buffered) or a pointer
 2816         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2817         // when the lowest bit is set so that C2 can use the oop after null checking.
 2818         // rax &= (rax & 1) - 1
 2819         __ movptr(rscratch1, rax);
 2820         __ andptr(rscratch1, 0x1);
 2821         __ subptr(rscratch1, 0x1);
 2822         __ andptr(rax, rscratch1);
 2823       }
 2824     }
 2825   %}
 2826 
 2827 %}
 2828 
 2829 // Operands for bound floating point register arguments
 2830 operand rxmm0() %{
 2831   constraint(ALLOC_IN_RC(xmm0_reg));
 2832   match(VecX);
 2833   format %{ %}
 2834   interface(REG_INTER);
 2835 %}
 2836 
 2837 //----------OPERANDS-----------------------------------------------------------
 2838 // Operand definitions must precede instruction definitions for correct parsing
 2839 // in the ADLC because operands constitute user-defined types which are used in
 2840 // instruction definitions.
 2841 
 2842 // Vectors
 2843 
 2844 // Dummy generic vector class. Should be used for all vector operands.
 2845 // Replaced with vec[SDXYZ] during post-selection pass.
 2846 operand vec() %{
 2847   constraint(ALLOC_IN_RC(dynamic));
 2848   match(VecX);
 2849   match(VecY);
 2850   match(VecZ);
 2851   match(VecS);
 2852   match(VecD);
 2853 
 2854   format %{ %}
 2855   interface(REG_INTER);
 2856 %}
 2857 
 2858 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2859 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2860 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2861 // runtime code generation via reg_class_dynamic.
 2862 operand legVec() %{
 2863   constraint(ALLOC_IN_RC(dynamic));
 2864   match(VecX);
 2865   match(VecY);
 2866   match(VecZ);
 2867   match(VecS);
 2868   match(VecD);
 2869 
 2870   format %{ %}
 2871   interface(REG_INTER);
 2872 %}
 2873 
 2874 // Replaces vec during post-selection cleanup. See above.
 2875 operand vecS() %{
 2876   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2877   match(VecS);
 2878 
 2879   format %{ %}
 2880   interface(REG_INTER);
 2881 %}
 2882 
 2883 // Replaces legVec during post-selection cleanup. See above.
 2884 operand legVecS() %{
 2885   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2886   match(VecS);
 2887 
 2888   format %{ %}
 2889   interface(REG_INTER);
 2890 %}
 2891 
 2892 // Replaces vec during post-selection cleanup. See above.
 2893 operand vecD() %{
 2894   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2895   match(VecD);
 2896 
 2897   format %{ %}
 2898   interface(REG_INTER);
 2899 %}
 2900 
 2901 // Replaces legVec during post-selection cleanup. See above.
 2902 operand legVecD() %{
 2903   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2904   match(VecD);
 2905 
 2906   format %{ %}
 2907   interface(REG_INTER);
 2908 %}
 2909 
 2910 // Replaces vec during post-selection cleanup. See above.
 2911 operand vecX() %{
 2912   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2913   match(VecX);
 2914 
 2915   format %{ %}
 2916   interface(REG_INTER);
 2917 %}
 2918 
 2919 // Replaces legVec during post-selection cleanup. See above.
 2920 operand legVecX() %{
 2921   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2922   match(VecX);
 2923 
 2924   format %{ %}
 2925   interface(REG_INTER);
 2926 %}
 2927 
 2928 // Replaces vec during post-selection cleanup. See above.
 2929 operand vecY() %{
 2930   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2931   match(VecY);
 2932 
 2933   format %{ %}
 2934   interface(REG_INTER);
 2935 %}
 2936 
 2937 // Replaces legVec during post-selection cleanup. See above.
 2938 operand legVecY() %{
 2939   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2940   match(VecY);
 2941 
 2942   format %{ %}
 2943   interface(REG_INTER);
 2944 %}
 2945 
 2946 // Replaces vec during post-selection cleanup. See above.
 2947 operand vecZ() %{
 2948   constraint(ALLOC_IN_RC(vectorz_reg));
 2949   match(VecZ);
 2950 
 2951   format %{ %}
 2952   interface(REG_INTER);
 2953 %}
 2954 
 2955 // Replaces legVec during post-selection cleanup. See above.
 2956 operand legVecZ() %{
 2957   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2958   match(VecZ);
 2959 
 2960   format %{ %}
 2961   interface(REG_INTER);
 2962 %}
 2963 
 2964 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2965 
 2966 // ============================================================================
 2967 
 2968 instruct ShouldNotReachHere() %{
 2969   match(Halt);
 2970   format %{ "stop\t# ShouldNotReachHere" %}
 2971   ins_encode %{
 2972     if (is_reachable()) {
 2973       __ stop(_halt_reason);
 2974     }
 2975   %}
 2976   ins_pipe(pipe_slow);
 2977 %}
 2978 
 2979 // ============================================================================
 2980 
 2981 instruct addF_reg(regF dst, regF src) %{
 2982   predicate((UseSSE>=1) && (UseAVX == 0));
 2983   match(Set dst (AddF dst src));
 2984 
 2985   format %{ "addss   $dst, $src" %}
 2986   ins_cost(150);
 2987   ins_encode %{
 2988     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2989   %}
 2990   ins_pipe(pipe_slow);
 2991 %}
 2992 
 2993 instruct addF_mem(regF dst, memory src) %{
 2994   predicate((UseSSE>=1) && (UseAVX == 0));
 2995   match(Set dst (AddF dst (LoadF src)));
 2996 
 2997   format %{ "addss   $dst, $src" %}
 2998   ins_cost(150);
 2999   ins_encode %{
 3000     __ addss($dst$$XMMRegister, $src$$Address);
 3001   %}
 3002   ins_pipe(pipe_slow);
 3003 %}
 3004 
 3005 instruct addF_imm(regF dst, immF con) %{
 3006   predicate((UseSSE>=1) && (UseAVX == 0));
 3007   match(Set dst (AddF dst con));
 3008   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3009   ins_cost(150);
 3010   ins_encode %{
 3011     __ addss($dst$$XMMRegister, $constantaddress($con));
 3012   %}
 3013   ins_pipe(pipe_slow);
 3014 %}
 3015 
 3016 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3017   predicate(UseAVX > 0);
 3018   match(Set dst (AddF src1 src2));
 3019 
 3020   format %{ "vaddss  $dst, $src1, $src2" %}
 3021   ins_cost(150);
 3022   ins_encode %{
 3023     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3024   %}
 3025   ins_pipe(pipe_slow);
 3026 %}
 3027 
 3028 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3029   predicate(UseAVX > 0);
 3030   match(Set dst (AddF src1 (LoadF src2)));
 3031 
 3032   format %{ "vaddss  $dst, $src1, $src2" %}
 3033   ins_cost(150);
 3034   ins_encode %{
 3035     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3036   %}
 3037   ins_pipe(pipe_slow);
 3038 %}
 3039 
 3040 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3041   predicate(UseAVX > 0);
 3042   match(Set dst (AddF src con));
 3043 
 3044   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3045   ins_cost(150);
 3046   ins_encode %{
 3047     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3048   %}
 3049   ins_pipe(pipe_slow);
 3050 %}
 3051 
 3052 instruct addD_reg(regD dst, regD src) %{
 3053   predicate((UseSSE>=2) && (UseAVX == 0));
 3054   match(Set dst (AddD dst src));
 3055 
 3056   format %{ "addsd   $dst, $src" %}
 3057   ins_cost(150);
 3058   ins_encode %{
 3059     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3060   %}
 3061   ins_pipe(pipe_slow);
 3062 %}
 3063 
 3064 instruct addD_mem(regD dst, memory src) %{
 3065   predicate((UseSSE>=2) && (UseAVX == 0));
 3066   match(Set dst (AddD dst (LoadD src)));
 3067 
 3068   format %{ "addsd   $dst, $src" %}
 3069   ins_cost(150);
 3070   ins_encode %{
 3071     __ addsd($dst$$XMMRegister, $src$$Address);
 3072   %}
 3073   ins_pipe(pipe_slow);
 3074 %}
 3075 
 3076 instruct addD_imm(regD dst, immD con) %{
 3077   predicate((UseSSE>=2) && (UseAVX == 0));
 3078   match(Set dst (AddD dst con));
 3079   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3080   ins_cost(150);
 3081   ins_encode %{
 3082     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3083   %}
 3084   ins_pipe(pipe_slow);
 3085 %}
 3086 
 3087 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3088   predicate(UseAVX > 0);
 3089   match(Set dst (AddD src1 src2));
 3090 
 3091   format %{ "vaddsd  $dst, $src1, $src2" %}
 3092   ins_cost(150);
 3093   ins_encode %{
 3094     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3095   %}
 3096   ins_pipe(pipe_slow);
 3097 %}
 3098 
 3099 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3100   predicate(UseAVX > 0);
 3101   match(Set dst (AddD src1 (LoadD src2)));
 3102 
 3103   format %{ "vaddsd  $dst, $src1, $src2" %}
 3104   ins_cost(150);
 3105   ins_encode %{
 3106     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3107   %}
 3108   ins_pipe(pipe_slow);
 3109 %}
 3110 
 3111 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3112   predicate(UseAVX > 0);
 3113   match(Set dst (AddD src con));
 3114 
 3115   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3116   ins_cost(150);
 3117   ins_encode %{
 3118     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3119   %}
 3120   ins_pipe(pipe_slow);
 3121 %}
 3122 
 3123 instruct subF_reg(regF dst, regF src) %{
 3124   predicate((UseSSE>=1) && (UseAVX == 0));
 3125   match(Set dst (SubF dst src));
 3126 
 3127   format %{ "subss   $dst, $src" %}
 3128   ins_cost(150);
 3129   ins_encode %{
 3130     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3131   %}
 3132   ins_pipe(pipe_slow);
 3133 %}
 3134 
 3135 instruct subF_mem(regF dst, memory src) %{
 3136   predicate((UseSSE>=1) && (UseAVX == 0));
 3137   match(Set dst (SubF dst (LoadF src)));
 3138 
 3139   format %{ "subss   $dst, $src" %}
 3140   ins_cost(150);
 3141   ins_encode %{
 3142     __ subss($dst$$XMMRegister, $src$$Address);
 3143   %}
 3144   ins_pipe(pipe_slow);
 3145 %}
 3146 
 3147 instruct subF_imm(regF dst, immF con) %{
 3148   predicate((UseSSE>=1) && (UseAVX == 0));
 3149   match(Set dst (SubF dst con));
 3150   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3151   ins_cost(150);
 3152   ins_encode %{
 3153     __ subss($dst$$XMMRegister, $constantaddress($con));
 3154   %}
 3155   ins_pipe(pipe_slow);
 3156 %}
 3157 
 3158 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3159   predicate(UseAVX > 0);
 3160   match(Set dst (SubF src1 src2));
 3161 
 3162   format %{ "vsubss  $dst, $src1, $src2" %}
 3163   ins_cost(150);
 3164   ins_encode %{
 3165     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3166   %}
 3167   ins_pipe(pipe_slow);
 3168 %}
 3169 
 3170 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3171   predicate(UseAVX > 0);
 3172   match(Set dst (SubF src1 (LoadF src2)));
 3173 
 3174   format %{ "vsubss  $dst, $src1, $src2" %}
 3175   ins_cost(150);
 3176   ins_encode %{
 3177     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3178   %}
 3179   ins_pipe(pipe_slow);
 3180 %}
 3181 
 3182 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3183   predicate(UseAVX > 0);
 3184   match(Set dst (SubF src con));
 3185 
 3186   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3187   ins_cost(150);
 3188   ins_encode %{
 3189     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3190   %}
 3191   ins_pipe(pipe_slow);
 3192 %}
 3193 
 3194 instruct subD_reg(regD dst, regD src) %{
 3195   predicate((UseSSE>=2) && (UseAVX == 0));
 3196   match(Set dst (SubD dst src));
 3197 
 3198   format %{ "subsd   $dst, $src" %}
 3199   ins_cost(150);
 3200   ins_encode %{
 3201     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3202   %}
 3203   ins_pipe(pipe_slow);
 3204 %}
 3205 
 3206 instruct subD_mem(regD dst, memory src) %{
 3207   predicate((UseSSE>=2) && (UseAVX == 0));
 3208   match(Set dst (SubD dst (LoadD src)));
 3209 
 3210   format %{ "subsd   $dst, $src" %}
 3211   ins_cost(150);
 3212   ins_encode %{
 3213     __ subsd($dst$$XMMRegister, $src$$Address);
 3214   %}
 3215   ins_pipe(pipe_slow);
 3216 %}
 3217 
 3218 instruct subD_imm(regD dst, immD con) %{
 3219   predicate((UseSSE>=2) && (UseAVX == 0));
 3220   match(Set dst (SubD dst con));
 3221   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3222   ins_cost(150);
 3223   ins_encode %{
 3224     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3225   %}
 3226   ins_pipe(pipe_slow);
 3227 %}
 3228 
 3229 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3230   predicate(UseAVX > 0);
 3231   match(Set dst (SubD src1 src2));
 3232 
 3233   format %{ "vsubsd  $dst, $src1, $src2" %}
 3234   ins_cost(150);
 3235   ins_encode %{
 3236     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3237   %}
 3238   ins_pipe(pipe_slow);
 3239 %}
 3240 
 3241 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3242   predicate(UseAVX > 0);
 3243   match(Set dst (SubD src1 (LoadD src2)));
 3244 
 3245   format %{ "vsubsd  $dst, $src1, $src2" %}
 3246   ins_cost(150);
 3247   ins_encode %{
 3248     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3249   %}
 3250   ins_pipe(pipe_slow);
 3251 %}
 3252 
 3253 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3254   predicate(UseAVX > 0);
 3255   match(Set dst (SubD src con));
 3256 
 3257   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3258   ins_cost(150);
 3259   ins_encode %{
 3260     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3261   %}
 3262   ins_pipe(pipe_slow);
 3263 %}
 3264 
 3265 instruct mulF_reg(regF dst, regF src) %{
 3266   predicate((UseSSE>=1) && (UseAVX == 0));
 3267   match(Set dst (MulF dst src));
 3268 
 3269   format %{ "mulss   $dst, $src" %}
 3270   ins_cost(150);
 3271   ins_encode %{
 3272     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3273   %}
 3274   ins_pipe(pipe_slow);
 3275 %}
 3276 
 3277 instruct mulF_mem(regF dst, memory src) %{
 3278   predicate((UseSSE>=1) && (UseAVX == 0));
 3279   match(Set dst (MulF dst (LoadF src)));
 3280 
 3281   format %{ "mulss   $dst, $src" %}
 3282   ins_cost(150);
 3283   ins_encode %{
 3284     __ mulss($dst$$XMMRegister, $src$$Address);
 3285   %}
 3286   ins_pipe(pipe_slow);
 3287 %}
 3288 
 3289 instruct mulF_imm(regF dst, immF con) %{
 3290   predicate((UseSSE>=1) && (UseAVX == 0));
 3291   match(Set dst (MulF dst con));
 3292   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3293   ins_cost(150);
 3294   ins_encode %{
 3295     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3296   %}
 3297   ins_pipe(pipe_slow);
 3298 %}
 3299 
 3300 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3301   predicate(UseAVX > 0);
 3302   match(Set dst (MulF src1 src2));
 3303 
 3304   format %{ "vmulss  $dst, $src1, $src2" %}
 3305   ins_cost(150);
 3306   ins_encode %{
 3307     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3308   %}
 3309   ins_pipe(pipe_slow);
 3310 %}
 3311 
 3312 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3313   predicate(UseAVX > 0);
 3314   match(Set dst (MulF src1 (LoadF src2)));
 3315 
 3316   format %{ "vmulss  $dst, $src1, $src2" %}
 3317   ins_cost(150);
 3318   ins_encode %{
 3319     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3320   %}
 3321   ins_pipe(pipe_slow);
 3322 %}
 3323 
 3324 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3325   predicate(UseAVX > 0);
 3326   match(Set dst (MulF src con));
 3327 
 3328   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3329   ins_cost(150);
 3330   ins_encode %{
 3331     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3332   %}
 3333   ins_pipe(pipe_slow);
 3334 %}
 3335 
 3336 instruct mulD_reg(regD dst, regD src) %{
 3337   predicate((UseSSE>=2) && (UseAVX == 0));
 3338   match(Set dst (MulD dst src));
 3339 
 3340   format %{ "mulsd   $dst, $src" %}
 3341   ins_cost(150);
 3342   ins_encode %{
 3343     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3344   %}
 3345   ins_pipe(pipe_slow);
 3346 %}
 3347 
 3348 instruct mulD_mem(regD dst, memory src) %{
 3349   predicate((UseSSE>=2) && (UseAVX == 0));
 3350   match(Set dst (MulD dst (LoadD src)));
 3351 
 3352   format %{ "mulsd   $dst, $src" %}
 3353   ins_cost(150);
 3354   ins_encode %{
 3355     __ mulsd($dst$$XMMRegister, $src$$Address);
 3356   %}
 3357   ins_pipe(pipe_slow);
 3358 %}
 3359 
 3360 instruct mulD_imm(regD dst, immD con) %{
 3361   predicate((UseSSE>=2) && (UseAVX == 0));
 3362   match(Set dst (MulD dst con));
 3363   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3364   ins_cost(150);
 3365   ins_encode %{
 3366     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3367   %}
 3368   ins_pipe(pipe_slow);
 3369 %}
 3370 
 3371 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3372   predicate(UseAVX > 0);
 3373   match(Set dst (MulD src1 src2));
 3374 
 3375   format %{ "vmulsd  $dst, $src1, $src2" %}
 3376   ins_cost(150);
 3377   ins_encode %{
 3378     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3379   %}
 3380   ins_pipe(pipe_slow);
 3381 %}
 3382 
 3383 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3384   predicate(UseAVX > 0);
 3385   match(Set dst (MulD src1 (LoadD src2)));
 3386 
 3387   format %{ "vmulsd  $dst, $src1, $src2" %}
 3388   ins_cost(150);
 3389   ins_encode %{
 3390     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3391   %}
 3392   ins_pipe(pipe_slow);
 3393 %}
 3394 
 3395 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3396   predicate(UseAVX > 0);
 3397   match(Set dst (MulD src con));
 3398 
 3399   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3400   ins_cost(150);
 3401   ins_encode %{
 3402     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3403   %}
 3404   ins_pipe(pipe_slow);
 3405 %}
 3406 
 3407 instruct divF_reg(regF dst, regF src) %{
 3408   predicate((UseSSE>=1) && (UseAVX == 0));
 3409   match(Set dst (DivF dst src));
 3410 
 3411   format %{ "divss   $dst, $src" %}
 3412   ins_cost(150);
 3413   ins_encode %{
 3414     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3415   %}
 3416   ins_pipe(pipe_slow);
 3417 %}
 3418 
 3419 instruct divF_mem(regF dst, memory src) %{
 3420   predicate((UseSSE>=1) && (UseAVX == 0));
 3421   match(Set dst (DivF dst (LoadF src)));
 3422 
 3423   format %{ "divss   $dst, $src" %}
 3424   ins_cost(150);
 3425   ins_encode %{
 3426     __ divss($dst$$XMMRegister, $src$$Address);
 3427   %}
 3428   ins_pipe(pipe_slow);
 3429 %}
 3430 
 3431 instruct divF_imm(regF dst, immF con) %{
 3432   predicate((UseSSE>=1) && (UseAVX == 0));
 3433   match(Set dst (DivF dst con));
 3434   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3435   ins_cost(150);
 3436   ins_encode %{
 3437     __ divss($dst$$XMMRegister, $constantaddress($con));
 3438   %}
 3439   ins_pipe(pipe_slow);
 3440 %}
 3441 
 3442 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3443   predicate(UseAVX > 0);
 3444   match(Set dst (DivF src1 src2));
 3445 
 3446   format %{ "vdivss  $dst, $src1, $src2" %}
 3447   ins_cost(150);
 3448   ins_encode %{
 3449     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3450   %}
 3451   ins_pipe(pipe_slow);
 3452 %}
 3453 
 3454 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3455   predicate(UseAVX > 0);
 3456   match(Set dst (DivF src1 (LoadF src2)));
 3457 
 3458   format %{ "vdivss  $dst, $src1, $src2" %}
 3459   ins_cost(150);
 3460   ins_encode %{
 3461     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3462   %}
 3463   ins_pipe(pipe_slow);
 3464 %}
 3465 
 3466 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3467   predicate(UseAVX > 0);
 3468   match(Set dst (DivF src con));
 3469 
 3470   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3471   ins_cost(150);
 3472   ins_encode %{
 3473     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3474   %}
 3475   ins_pipe(pipe_slow);
 3476 %}
 3477 
 3478 instruct divD_reg(regD dst, regD src) %{
 3479   predicate((UseSSE>=2) && (UseAVX == 0));
 3480   match(Set dst (DivD dst src));
 3481 
 3482   format %{ "divsd   $dst, $src" %}
 3483   ins_cost(150);
 3484   ins_encode %{
 3485     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3486   %}
 3487   ins_pipe(pipe_slow);
 3488 %}
 3489 
 3490 instruct divD_mem(regD dst, memory src) %{
 3491   predicate((UseSSE>=2) && (UseAVX == 0));
 3492   match(Set dst (DivD dst (LoadD src)));
 3493 
 3494   format %{ "divsd   $dst, $src" %}
 3495   ins_cost(150);
 3496   ins_encode %{
 3497     __ divsd($dst$$XMMRegister, $src$$Address);
 3498   %}
 3499   ins_pipe(pipe_slow);
 3500 %}
 3501 
 3502 instruct divD_imm(regD dst, immD con) %{
 3503   predicate((UseSSE>=2) && (UseAVX == 0));
 3504   match(Set dst (DivD dst con));
 3505   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3506   ins_cost(150);
 3507   ins_encode %{
 3508     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3509   %}
 3510   ins_pipe(pipe_slow);
 3511 %}
 3512 
 3513 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3514   predicate(UseAVX > 0);
 3515   match(Set dst (DivD src1 src2));
 3516 
 3517   format %{ "vdivsd  $dst, $src1, $src2" %}
 3518   ins_cost(150);
 3519   ins_encode %{
 3520     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3521   %}
 3522   ins_pipe(pipe_slow);
 3523 %}
 3524 
 3525 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3526   predicate(UseAVX > 0);
 3527   match(Set dst (DivD src1 (LoadD src2)));
 3528 
 3529   format %{ "vdivsd  $dst, $src1, $src2" %}
 3530   ins_cost(150);
 3531   ins_encode %{
 3532     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3533   %}
 3534   ins_pipe(pipe_slow);
 3535 %}
 3536 
 3537 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3538   predicate(UseAVX > 0);
 3539   match(Set dst (DivD src con));
 3540 
 3541   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3542   ins_cost(150);
 3543   ins_encode %{
 3544     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3545   %}
 3546   ins_pipe(pipe_slow);
 3547 %}
 3548 
 3549 instruct absF_reg(regF dst) %{
 3550   predicate((UseSSE>=1) && (UseAVX == 0));
 3551   match(Set dst (AbsF dst));
 3552   ins_cost(150);
 3553   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3554   ins_encode %{
 3555     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3556   %}
 3557   ins_pipe(pipe_slow);
 3558 %}
 3559 
 3560 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3561   predicate(UseAVX > 0);
 3562   match(Set dst (AbsF src));
 3563   ins_cost(150);
 3564   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3565   ins_encode %{
 3566     int vlen_enc = Assembler::AVX_128bit;
 3567     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3568               ExternalAddress(float_signmask()), vlen_enc);
 3569   %}
 3570   ins_pipe(pipe_slow);
 3571 %}
 3572 
 3573 instruct absD_reg(regD dst) %{
 3574   predicate((UseSSE>=2) && (UseAVX == 0));
 3575   match(Set dst (AbsD dst));
 3576   ins_cost(150);
 3577   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3578             "# abs double by sign masking" %}
 3579   ins_encode %{
 3580     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3581   %}
 3582   ins_pipe(pipe_slow);
 3583 %}
 3584 
 3585 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3586   predicate(UseAVX > 0);
 3587   match(Set dst (AbsD src));
 3588   ins_cost(150);
 3589   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3590             "# abs double by sign masking" %}
 3591   ins_encode %{
 3592     int vlen_enc = Assembler::AVX_128bit;
 3593     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3594               ExternalAddress(double_signmask()), vlen_enc);
 3595   %}
 3596   ins_pipe(pipe_slow);
 3597 %}
 3598 
 3599 instruct negF_reg(regF dst) %{
 3600   predicate((UseSSE>=1) && (UseAVX == 0));
 3601   match(Set dst (NegF dst));
 3602   ins_cost(150);
 3603   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3604   ins_encode %{
 3605     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3606   %}
 3607   ins_pipe(pipe_slow);
 3608 %}
 3609 
 3610 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3611   predicate(UseAVX > 0);
 3612   match(Set dst (NegF src));
 3613   ins_cost(150);
 3614   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3615   ins_encode %{
 3616     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3617                  ExternalAddress(float_signflip()));
 3618   %}
 3619   ins_pipe(pipe_slow);
 3620 %}
 3621 
 3622 instruct negD_reg(regD dst) %{
 3623   predicate((UseSSE>=2) && (UseAVX == 0));
 3624   match(Set dst (NegD dst));
 3625   ins_cost(150);
 3626   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3627             "# neg double by sign flipping" %}
 3628   ins_encode %{
 3629     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3630   %}
 3631   ins_pipe(pipe_slow);
 3632 %}
 3633 
 3634 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3635   predicate(UseAVX > 0);
 3636   match(Set dst (NegD src));
 3637   ins_cost(150);
 3638   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3639             "# neg double by sign flipping" %}
 3640   ins_encode %{
 3641     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3642                  ExternalAddress(double_signflip()));
 3643   %}
 3644   ins_pipe(pipe_slow);
 3645 %}
 3646 
 3647 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3648 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3649 instruct sqrtF_reg(regF dst) %{
 3650   predicate(UseSSE>=1);
 3651   match(Set dst (SqrtF dst));
 3652   format %{ "sqrtss  $dst, $dst" %}
 3653   ins_encode %{
 3654     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3655   %}
 3656   ins_pipe(pipe_slow);
 3657 %}
 3658 
 3659 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3660 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3661 instruct sqrtD_reg(regD dst) %{
 3662   predicate(UseSSE>=2);
 3663   match(Set dst (SqrtD dst));
 3664   format %{ "sqrtsd  $dst, $dst" %}
 3665   ins_encode %{
 3666     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3667   %}
 3668   ins_pipe(pipe_slow);
 3669 %}
 3670 
 3671 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3672   effect(TEMP tmp);
 3673   match(Set dst (ConvF2HF src));
 3674   ins_cost(125);
 3675   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3676   ins_encode %{
 3677     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3678   %}
 3679   ins_pipe( pipe_slow );
 3680 %}
 3681 
 3682 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3683   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3684   effect(TEMP ktmp, TEMP rtmp);
 3685   match(Set mem (StoreC mem (ConvF2HF src)));
 3686   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3687   ins_encode %{
 3688     __ movl($rtmp$$Register, 0x1);
 3689     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3690     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3691   %}
 3692   ins_pipe( pipe_slow );
 3693 %}
 3694 
 3695 instruct vconvF2HF(vec dst, vec src) %{
 3696   match(Set dst (VectorCastF2HF src));
 3697   format %{ "vector_conv_F2HF $dst $src" %}
 3698   ins_encode %{
 3699     int vlen_enc = vector_length_encoding(this, $src);
 3700     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3701   %}
 3702   ins_pipe( pipe_slow );
 3703 %}
 3704 
 3705 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3706   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3707   format %{ "vcvtps2ph $mem,$src" %}
 3708   ins_encode %{
 3709     int vlen_enc = vector_length_encoding(this, $src);
 3710     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3711   %}
 3712   ins_pipe( pipe_slow );
 3713 %}
 3714 
 3715 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3716   match(Set dst (ConvHF2F src));
 3717   format %{ "vcvtph2ps $dst,$src" %}
 3718   ins_encode %{
 3719     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3720   %}
 3721   ins_pipe( pipe_slow );
 3722 %}
 3723 
 3724 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3725   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3726   format %{ "vcvtph2ps $dst,$mem" %}
 3727   ins_encode %{
 3728     int vlen_enc = vector_length_encoding(this);
 3729     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3730   %}
 3731   ins_pipe( pipe_slow );
 3732 %}
 3733 
 3734 instruct vconvHF2F(vec dst, vec src) %{
 3735   match(Set dst (VectorCastHF2F src));
 3736   ins_cost(125);
 3737   format %{ "vector_conv_HF2F $dst,$src" %}
 3738   ins_encode %{
 3739     int vlen_enc = vector_length_encoding(this);
 3740     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3741   %}
 3742   ins_pipe( pipe_slow );
 3743 %}
 3744 
 3745 // ---------------------------------------- VectorReinterpret ------------------------------------
 3746 instruct reinterpret_mask(kReg dst) %{
 3747   predicate(n->bottom_type()->isa_vectmask() &&
 3748             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3749   match(Set dst (VectorReinterpret dst));
 3750   ins_cost(125);
 3751   format %{ "vector_reinterpret $dst\t!" %}
 3752   ins_encode %{
 3753     // empty
 3754   %}
 3755   ins_pipe( pipe_slow );
 3756 %}
 3757 
 3758 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3759   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3760             n->bottom_type()->isa_vectmask() &&
 3761             n->in(1)->bottom_type()->isa_vectmask() &&
 3762             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3763             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3764   match(Set dst (VectorReinterpret src));
 3765   effect(TEMP xtmp);
 3766   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3767   ins_encode %{
 3768      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3769      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3770      assert(src_sz == dst_sz , "src and dst size mismatch");
 3771      int vlen_enc = vector_length_encoding(src_sz);
 3772      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3773      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3774   %}
 3775   ins_pipe( pipe_slow );
 3776 %}
 3777 
 3778 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3779   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3780             n->bottom_type()->isa_vectmask() &&
 3781             n->in(1)->bottom_type()->isa_vectmask() &&
 3782             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3783              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3784             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3785   match(Set dst (VectorReinterpret src));
 3786   effect(TEMP xtmp);
 3787   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3788   ins_encode %{
 3789      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3790      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3791      assert(src_sz == dst_sz , "src and dst size mismatch");
 3792      int vlen_enc = vector_length_encoding(src_sz);
 3793      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3794      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3795   %}
 3796   ins_pipe( pipe_slow );
 3797 %}
 3798 
 3799 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3800   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3801             n->bottom_type()->isa_vectmask() &&
 3802             n->in(1)->bottom_type()->isa_vectmask() &&
 3803             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3804              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3805             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3806   match(Set dst (VectorReinterpret src));
 3807   effect(TEMP xtmp);
 3808   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3809   ins_encode %{
 3810      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3811      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3812      assert(src_sz == dst_sz , "src and dst size mismatch");
 3813      int vlen_enc = vector_length_encoding(src_sz);
 3814      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3815      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3816   %}
 3817   ins_pipe( pipe_slow );
 3818 %}
 3819 
 3820 instruct reinterpret(vec dst) %{
 3821   predicate(!n->bottom_type()->isa_vectmask() &&
 3822             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3823   match(Set dst (VectorReinterpret dst));
 3824   ins_cost(125);
 3825   format %{ "vector_reinterpret $dst\t!" %}
 3826   ins_encode %{
 3827     // empty
 3828   %}
 3829   ins_pipe( pipe_slow );
 3830 %}
 3831 
 3832 instruct reinterpret_expand(vec dst, vec src) %{
 3833   predicate(UseAVX == 0 &&
 3834             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3835   match(Set dst (VectorReinterpret src));
 3836   ins_cost(125);
 3837   effect(TEMP dst);
 3838   format %{ "vector_reinterpret_expand $dst,$src" %}
 3839   ins_encode %{
 3840     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3841     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3842 
 3843     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3844     if (src_vlen_in_bytes == 4) {
 3845       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3846     } else {
 3847       assert(src_vlen_in_bytes == 8, "");
 3848       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3849     }
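          // AND with the 4/8-byte mask keeps only the low src bytes and zeroes the upper
          // lanes of the wider destination.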
 3850     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3851   %}
 3852   ins_pipe( pipe_slow );
 3853 %}
 3854 
 3855 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3856   predicate(UseAVX > 0 &&
 3857             !n->bottom_type()->isa_vectmask() &&
 3858             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3859             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3860   match(Set dst (VectorReinterpret src));
 3861   ins_cost(125);
 3862   format %{ "vector_reinterpret_expand $dst,$src" %}
 3863   ins_encode %{
 3864     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3865   %}
 3866   ins_pipe( pipe_slow );
 3867 %}
 3868 
 3870 instruct vreinterpret_expand(legVec dst, vec src) %{
 3871   predicate(UseAVX > 0 &&
 3872             !n->bottom_type()->isa_vectmask() &&
 3873             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3874             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3875   match(Set dst (VectorReinterpret src));
 3876   ins_cost(125);
 3877   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3878   ins_encode %{
 3879     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3880       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3881       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3882       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3883       default: ShouldNotReachHere();
 3884     }
 3885   %}
 3886   ins_pipe( pipe_slow );
 3887 %}
 3888 
 3889 instruct reinterpret_shrink(vec dst, legVec src) %{
 3890   predicate(!n->bottom_type()->isa_vectmask() &&
 3891             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3892   match(Set dst (VectorReinterpret src));
 3893   ins_cost(125);
 3894   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3895   ins_encode %{
 3896     switch (Matcher::vector_length_in_bytes(this)) {
 3897       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3898       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3899       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3900       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3901       default: ShouldNotReachHere();
 3902     }
 3903   %}
 3904   ins_pipe( pipe_slow );
 3905 %}
 3906 
 3907 // ----------------------------------------------------------------------------------------------------
 3908 
 3909 #ifdef _LP64
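      // The $rmode immediate below is the SSE4.1/AVX rounding-control encoding shared by
      // roundsd, vroundpd and vrndscalepd: 0 = nearest-even, 1 = floor, 2 = ceil, 3 = trunc.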
 3910 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3911   match(Set dst (RoundDoubleMode src rmode));
 3912   format %{ "roundsd $dst,$src" %}
 3913   ins_cost(150);
 3914   ins_encode %{
 3915     assert(UseSSE >= 4, "required");
 3916     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3917   %}
 3918   ins_pipe(pipe_slow);
 3919 %}
 3920 
 3921 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3922   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3923   format %{ "roundsd $dst,$src" %}
 3924   ins_cost(150);
 3925   ins_encode %{
 3926     assert(UseSSE >= 4, "required");
 3927     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3928   %}
 3929   ins_pipe(pipe_slow);
 3930 %}
 3931 
 3932 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3933   match(Set dst (RoundDoubleMode con rmode));
 3934   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3935   ins_cost(150);
 3936   ins_encode %{
 3937     assert(UseSSE >= 4, "required");
 3938     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3939   %}
 3940   ins_pipe(pipe_slow);
 3941 %}
 3942 
 3943 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3944   predicate(Matcher::vector_length(n) < 8);
 3945   match(Set dst (RoundDoubleModeV src rmode));
 3946   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3947   ins_encode %{
 3948     assert(UseAVX > 0, "required");
 3949     int vlen_enc = vector_length_encoding(this);
 3950     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3951   %}
 3952   ins_pipe( pipe_slow );
 3953 %}
 3954 
 3955 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3956   predicate(Matcher::vector_length(n) == 8);
 3957   match(Set dst (RoundDoubleModeV src rmode));
 3958   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3959   ins_encode %{
 3960     assert(UseAVX > 2, "required");
 3961     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3962   %}
 3963   ins_pipe( pipe_slow );
 3964 %}
 3965 
 3966 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3967   predicate(Matcher::vector_length(n) < 8);
 3968   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3969   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3970   ins_encode %{
 3971     assert(UseAVX > 0, "required");
 3972     int vlen_enc = vector_length_encoding(this);
 3973     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3974   %}
 3975   ins_pipe( pipe_slow );
 3976 %}
 3977 
 3978 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3979   predicate(Matcher::vector_length(n) == 8);
 3980   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3981   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3982   ins_encode %{
 3983     assert(UseAVX > 2, "required");
 3984     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3985   %}
 3986   ins_pipe( pipe_slow );
 3987 %}
 3988 #endif // _LP64
 3989 
 3990 instruct onspinwait() %{
 3991   match(OnSpinWait);
 3992   ins_cost(200);
 3993 
 3994   format %{
 3995     $$template
 3996     $$emit$$"pause\t! membar_onspinwait"
 3997   %}
 3998   ins_encode %{
 3999     __ pause();
 4000   %}
 4001   ins_pipe(pipe_slow);
 4002 %}
 4003 
 4004 // a * b + c
 4005 instruct fmaD_reg(regD a, regD b, regD c) %{
 4006   match(Set c (FmaD  c (Binary a b)));
 4007   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4008   ins_cost(150);
 4009   ins_encode %{
 4010     assert(UseFMA, "Needs FMA instruction support.");
 4011     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4012   %}
 4013   ins_pipe( pipe_slow );
 4014 %}
 4015 
 4016 // a * b + c
 4017 instruct fmaF_reg(regF a, regF b, regF c) %{
 4018   match(Set c (FmaF  c (Binary a b)));
 4019   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4020   ins_cost(150);
 4021   ins_encode %{
 4022     assert(UseFMA, "Needs FMA instruction support.");
 4023     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4024   %}
 4025   ins_pipe( pipe_slow );
 4026 %}
 4027 
 4028 // ====================VECTOR INSTRUCTIONS=====================================
 4029 
 4030 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4031 instruct MoveVec2Leg(legVec dst, vec src) %{
 4032   match(Set dst src);
 4033   format %{ "" %}
 4034   ins_encode %{
 4035     ShouldNotReachHere();
 4036   %}
 4037   ins_pipe( fpu_reg_reg );
 4038 %}
 4039 
 4040 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4041   match(Set dst src);
 4042   format %{ "" %}
 4043   ins_encode %{
 4044     ShouldNotReachHere();
 4045   %}
 4046   ins_pipe( fpu_reg_reg );
 4047 %}
 4048 
 4049 // ============================================================================
 4050 
 4051 // Load vectors generic operand pattern
 4052 instruct loadV(vec dst, memory mem) %{
 4053   match(Set dst (LoadVector mem));
 4054   ins_cost(125);
 4055   format %{ "load_vector $dst,$mem" %}
 4056   ins_encode %{
 4057     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4058   %}
 4059   ins_pipe( pipe_slow );
 4060 %}
 4061 
 4062 // Store vectors generic operand pattern.
 4063 instruct storeV(memory mem, vec src) %{
 4064   match(Set mem (StoreVector mem src));
 4065   ins_cost(145);
 4066   format %{ "store_vector $mem,$src\n\t" %}
 4067   ins_encode %{
 4068     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4069       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4070       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4071       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4072       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4073       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4074       default: ShouldNotReachHere();
 4075     }
 4076   %}
 4077   ins_pipe( pipe_slow );
 4078 %}
 4079 
 4080 // ---------------------------------------- Gather ------------------------------------
 4081 
 4082 // Gather INT, LONG, FLOAT, DOUBLE
 4083 
 4084 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4085   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4086   match(Set dst (LoadVectorGather mem idx));
 4087   effect(TEMP dst, TEMP tmp, TEMP mask);
 4088   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4089   ins_encode %{
 4090     assert(UseAVX >= 2, "sanity");
 4091 
 4092     int vlen_enc = vector_length_encoding(this);
 4093     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4094 
 4095     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4096     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4097 
 4098     if (vlen_enc == Assembler::AVX_128bit) {
 4099       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4100     } else {
 4101       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4102     }
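          // AVX2 vgather needs the base address in a GPR and a vector mask whose set lanes
          // are loaded; the all-ones mask loads every lane.  The instruction clears the mask
          // as it completes, which is why $mask is declared TEMP.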
 4103     __ lea($tmp$$Register, $mem$$Address);
 4104     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4105   %}
 4106   ins_pipe( pipe_slow );
 4107 %}
 4108 
 4109 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4110   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4111   match(Set dst (LoadVectorGather mem idx));
 4112   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4113   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4114   ins_encode %{
 4115     assert(UseAVX > 2, "sanity");
 4116 
 4117     int vlen_enc = vector_length_encoding(this);
 4118     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4119 
 4120     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4121 
 4122     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4123     __ lea($tmp$$Register, $mem$$Address);
 4124     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4125   %}
 4126   ins_pipe( pipe_slow );
 4127 %}
 4128 
 4129 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4130   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4131   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4132   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4133   ins_encode %{
 4134     assert(UseAVX > 2, "sanity");
 4135     int vlen_enc = vector_length_encoding(this);
 4136     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4137     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4138     // Note: The gather instruction partially updates the opmask register used
 4139     // for predication, hence the mask operand is first copied to a temporary.
 4140     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
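          // Zero dst up front so lanes not selected by the mask read as zero after the
          // merge-masked gather.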
 4141     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4142     __ lea($tmp$$Register, $mem$$Address);
 4143     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4144   %}
 4145   ins_pipe( pipe_slow );
 4146 %}
 4147 // ====================Scatter=======================================
 4148 
 4149 // Scatter INT, LONG, FLOAT, DOUBLE
 4150 
 4151 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4152   predicate(UseAVX > 2);
 4153   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4154   effect(TEMP tmp, TEMP ktmp);
 4155   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4156   ins_encode %{
 4157     int vlen_enc = vector_length_encoding(this, $src);
 4158     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4159 
 4160     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4161     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4162 
 4163     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4164     __ lea($tmp$$Register, $mem$$Address);
 4165     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4166   %}
 4167   ins_pipe( pipe_slow );
 4168 %}
 4169 
 4170 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4171   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4172   effect(TEMP tmp, TEMP ktmp);
 4173   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4174   ins_encode %{
 4175     int vlen_enc = vector_length_encoding(this, $src);
 4176     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4177     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4178     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4179     // Note: The scatter instruction partially updates the opmask register used
 4180     // for predication, hence the mask operand is first copied to a temporary.
 4181     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4182     __ lea($tmp$$Register, $mem$$Address);
 4183     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4184   %}
 4185   ins_pipe( pipe_slow );
 4186 %}
 4187 
 4188 // ====================REPLICATE=======================================
 4189 
 4190 // Replicate byte scalar to be vector
 4191 instruct vReplB_reg(vec dst, rRegI src) %{
 4192   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4193   match(Set dst (Replicate src));
 4194   format %{ "replicateB $dst,$src" %}
 4195   ins_encode %{
 4196     uint vlen = Matcher::vector_length(this);
 4197     if (UseAVX >= 2) {
 4198       int vlen_enc = vector_length_encoding(this);
 4199       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4200         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4201         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4202       } else {
 4203         __ movdl($dst$$XMMRegister, $src$$Register);
 4204         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4205       }
 4206     } else {
 4207       assert(UseAVX < 2, "");
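            // SSE fallback: duplicate the byte into the low word (punpcklbw), broadcast that
            // word through the low quadword (pshuflw), and mirror it into the high quadword
            // for 16-byte vectors (punpcklqdq).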
 4208       __ movdl($dst$$XMMRegister, $src$$Register);
 4209       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4210       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4211       if (vlen >= 16) {
 4212         assert(vlen == 16, "");
 4213         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4214       }
 4215     }
 4216   %}
 4217   ins_pipe( pipe_slow );
 4218 %}
 4219 
 4220 instruct ReplB_mem(vec dst, memory mem) %{
 4221   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4222   match(Set dst (Replicate (LoadB mem)));
 4223   format %{ "replicateB $dst,$mem" %}
 4224   ins_encode %{
 4225     int vlen_enc = vector_length_encoding(this);
 4226     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4227   %}
 4228   ins_pipe( pipe_slow );
 4229 %}
 4230 
 4231 // ====================ReplicateS=======================================
 4232 
 4233 instruct vReplS_reg(vec dst, rRegI src) %{
 4234   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4235   match(Set dst (Replicate src));
 4236   format %{ "replicateS $dst,$src" %}
 4237   ins_encode %{
 4238     uint vlen = Matcher::vector_length(this);
 4239     int vlen_enc = vector_length_encoding(this);
 4240     if (UseAVX >= 2) {
 4241       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4242         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4243         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4244       } else {
 4245         __ movdl($dst$$XMMRegister, $src$$Register);
 4246         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4247       }
 4248     } else {
 4249       assert(UseAVX < 2, "");
 4250       __ movdl($dst$$XMMRegister, $src$$Register);
 4251       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4252       if (vlen >= 8) {
 4253         assert(vlen == 8, "");
 4254         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4255       }
 4256     }
 4257   %}
 4258   ins_pipe( pipe_slow );
 4259 %}
 4260 
 4261 instruct ReplS_mem(vec dst, memory mem) %{
 4262   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4263   match(Set dst (Replicate (LoadS mem)));
 4264   format %{ "replicateS $dst,$mem" %}
 4265   ins_encode %{
 4266     int vlen_enc = vector_length_encoding(this);
 4267     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4268   %}
 4269   ins_pipe( pipe_slow );
 4270 %}
 4271 
 4272 // ====================ReplicateI=======================================
 4273 
 4274 instruct ReplI_reg(vec dst, rRegI src) %{
 4275   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4276   match(Set dst (Replicate src));
 4277   format %{ "replicateI $dst,$src" %}
 4278   ins_encode %{
 4279     uint vlen = Matcher::vector_length(this);
 4280     int vlen_enc = vector_length_encoding(this);
 4281     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4282       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4283     } else if (VM_Version::supports_avx2()) {
 4284       __ movdl($dst$$XMMRegister, $src$$Register);
 4285       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4286     } else {
 4287       __ movdl($dst$$XMMRegister, $src$$Register);
 4288       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4289     }
 4290   %}
 4291   ins_pipe( pipe_slow );
 4292 %}
 4293 
 4294 instruct ReplI_mem(vec dst, memory mem) %{
 4295   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4296   match(Set dst (Replicate (LoadI mem)));
 4297   format %{ "replicateI $dst,$mem" %}
 4298   ins_encode %{
 4299     int vlen_enc = vector_length_encoding(this);
 4300     if (VM_Version::supports_avx2()) {
 4301       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4302     } else if (VM_Version::supports_avx()) {
 4303       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4304     } else {
 4305       __ movdl($dst$$XMMRegister, $mem$$Address);
 4306       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4307     }
 4308   %}
 4309   ins_pipe( pipe_slow );
 4310 %}
 4311 
 4312 instruct ReplI_imm(vec dst, immI con) %{
 4313   predicate(Matcher::is_non_long_integral_vector(n));
 4314   match(Set dst (Replicate con));
 4315   format %{ "replicateI $dst,$con" %}
 4316   ins_encode %{
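          // Build a constant-table entry just wide enough for the broadcast load (4 bytes
          // with AVX, 8 bytes otherwise); load_constant_vector then broadcasts it to the
          // full vector width.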
 4317     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4318         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4319             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4320                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4321     BasicType bt = Matcher::vector_element_basic_type(this);
 4322     int vlen = Matcher::vector_length_in_bytes(this);
 4323     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4324   %}
 4325   ins_pipe( pipe_slow );
 4326 %}
 4327 
 4328 // Replicate scalar zero to be vector
 4329 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4330   predicate(Matcher::is_non_long_integral_vector(n));
 4331   match(Set dst (Replicate zero));
 4332   format %{ "replicateI $dst,$zero" %}
 4333   ins_encode %{
 4334     int vlen_enc = vector_length_encoding(this);
 4335     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4336       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4337     } else {
 4338       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4339     }
 4340   %}
 4341   ins_pipe( fpu_reg_reg );
 4342 %}
 4343 
 4344 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4345   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4346   match(Set dst (Replicate con));
 4347   format %{ "vallones $dst" %}
 4348   ins_encode %{
 4349     int vector_len = vector_length_encoding(this);
 4350     __ vallones($dst$$XMMRegister, vector_len);
 4351   %}
 4352   ins_pipe( pipe_slow );
 4353 %}
 4354 
 4355 // ====================ReplicateL=======================================
 4356 
 4357 #ifdef _LP64
 4358 // Replicate long (8 byte) scalar to be vector
 4359 instruct ReplL_reg(vec dst, rRegL src) %{
 4360   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4361   match(Set dst (Replicate src));
 4362   format %{ "replicateL $dst,$src" %}
 4363   ins_encode %{
 4364     int vlen = Matcher::vector_length(this);
 4365     int vlen_enc = vector_length_encoding(this);
 4366     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4367       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4368     } else if (VM_Version::supports_avx2()) {
 4369       __ movdq($dst$$XMMRegister, $src$$Register);
 4370       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4371     } else {
 4372       __ movdq($dst$$XMMRegister, $src$$Register);
 4373       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4374     }
 4375   %}
 4376   ins_pipe( pipe_slow );
 4377 %}
 4378 #else // _LP64
 4379 // Replicate long (8 byte) scalar to be vector
 4380 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4381   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4382   match(Set dst (Replicate src));
 4383   effect(TEMP dst, USE src, TEMP tmp);
 4384   format %{ "replicateL $dst,$src" %}
 4385   ins_encode %{
 4386     uint vlen = Matcher::vector_length(this);
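          // On 32-bit the long arrives as a register pair; movdl/punpckldq reassemble the
          // two halves into the low quadword before it is broadcast.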
 4387     if (vlen == 2) {
 4388       __ movdl($dst$$XMMRegister, $src$$Register);
 4389       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4390       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4391       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4392     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4393       int vlen_enc = Assembler::AVX_256bit;
 4394       __ movdl($dst$$XMMRegister, $src$$Register);
 4395       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4396       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4397       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4398     } else {
 4399       __ movdl($dst$$XMMRegister, $src$$Register);
 4400       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4401       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4402       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4403       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4404     }
 4405   %}
 4406   ins_pipe( pipe_slow );
 4407 %}
 4408 
 4409 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4410   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4411   match(Set dst (Replicate src));
 4412   effect(TEMP dst, USE src, TEMP tmp);
 4413   format %{ "replicateL $dst,$src" %}
 4414   ins_encode %{
 4415     if (VM_Version::supports_avx512vl()) {
 4416       __ movdl($dst$$XMMRegister, $src$$Register);
 4417       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4418       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4419       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4420       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4421       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4422     } else {
 4423       int vlen_enc = Assembler::AVX_512bit;
 4424       __ movdl($dst$$XMMRegister, $src$$Register);
 4425       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4426       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4427       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4428     }
 4429   %}
 4430   ins_pipe( pipe_slow );
 4431 %}
 4432 #endif // _LP64
 4433 
 4434 instruct ReplL_mem(vec dst, memory mem) %{
 4435   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4436   match(Set dst (Replicate (LoadL mem)));
 4437   format %{ "replicateL $dst,$mem" %}
 4438   ins_encode %{
 4439     int vlen_enc = vector_length_encoding(this);
 4440     if (VM_Version::supports_avx2()) {
 4441       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4442     } else if (VM_Version::supports_sse3()) {
 4443       __ movddup($dst$$XMMRegister, $mem$$Address);
 4444     } else {
 4445       __ movq($dst$$XMMRegister, $mem$$Address);
 4446       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4447     }
 4448   %}
 4449   ins_pipe( pipe_slow );
 4450 %}
 4451 
 4452 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4453 instruct ReplL_imm(vec dst, immL con) %{
 4454   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4455   match(Set dst (Replicate con));
 4456   format %{ "replicateL $dst,$con" %}
 4457   ins_encode %{
 4458     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4459     int vlen = Matcher::vector_length_in_bytes(this);
 4460     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4461   %}
 4462   ins_pipe( pipe_slow );
 4463 %}
 4464 
 4465 instruct ReplL_zero(vec dst, immL0 zero) %{
 4466   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4467   match(Set dst (Replicate zero));
 4468   format %{ "replicateL $dst,$zero" %}
 4469   ins_encode %{
 4470     int vlen_enc = vector_length_encoding(this);
 4471     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4472       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4473     } else {
 4474       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4475     }
 4476   %}
 4477   ins_pipe( fpu_reg_reg );
 4478 %}
 4479 
 4480 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4481   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4482   match(Set dst (Replicate con));
 4483   format %{ "vallones $dst" %}
 4484   ins_encode %{
 4485     int vector_len = vector_length_encoding(this);
 4486     __ vallones($dst$$XMMRegister, vector_len);
 4487   %}
 4488   ins_pipe( pipe_slow );
 4489 %}
 4490 
 4491 // ====================ReplicateF=======================================
 4492 
 4493 instruct vReplF_reg(vec dst, vlRegF src) %{
 4494   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4495   match(Set dst (Replicate src));
 4496   format %{ "replicateF $dst,$src" %}
 4497   ins_encode %{
 4498     uint vlen = Matcher::vector_length(this);
 4499     int vlen_enc = vector_length_encoding(this);
 4500     if (vlen <= 4) {
 4501       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4502     } else if (VM_Version::supports_avx2()) {
 4503       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4504     } else {
 4505       assert(vlen == 8, "sanity");
 4506       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4507       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4508     }
 4509   %}
 4510   ins_pipe( pipe_slow );
 4511 %}
 4512 
 4513 instruct ReplF_reg(vec dst, vlRegF src) %{
 4514   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4515   match(Set dst (Replicate src));
 4516   format %{ "replicateF $dst,$src" %}
 4517   ins_encode %{
 4518     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4519   %}
 4520   ins_pipe( pipe_slow );
 4521 %}
 4522 
 4523 instruct ReplF_mem(vec dst, memory mem) %{
 4524   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4525   match(Set dst (Replicate (LoadF mem)));
 4526   format %{ "replicateF $dst,$mem" %}
 4527   ins_encode %{
 4528     int vlen_enc = vector_length_encoding(this);
 4529     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4530   %}
 4531   ins_pipe( pipe_slow );
 4532 %}
 4533 
 4534 // Replicate float scalar immediate to be vector by loading from const table.
 4535 instruct ReplF_imm(vec dst, immF con) %{
 4536   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4537   match(Set dst (Replicate con));
 4538   format %{ "replicateF $dst,$con" %}
 4539   ins_encode %{
 4540     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4541         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4542     int vlen = Matcher::vector_length_in_bytes(this);
 4543     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4544   %}
 4545   ins_pipe( pipe_slow );
 4546 %}
 4547 
 4548 instruct ReplF_zero(vec dst, immF0 zero) %{
 4549   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4550   match(Set dst (Replicate zero));
 4551   format %{ "replicateF $dst,$zero" %}
 4552   ins_encode %{
 4553     int vlen_enc = vector_length_encoding(this);
 4554     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4555       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4556     } else {
 4557       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4558     }
 4559   %}
 4560   ins_pipe( fpu_reg_reg );
 4561 %}
 4562 
 4563 // ====================ReplicateD=======================================
 4564 
 4565 // Replicate double (8 bytes) scalar to be vector
 4566 instruct vReplD_reg(vec dst, vlRegD src) %{
 4567   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4568   match(Set dst (Replicate src));
 4569   format %{ "replicateD $dst,$src" %}
 4570   ins_encode %{
 4571     uint vlen = Matcher::vector_length(this);
 4572     int vlen_enc = vector_length_encoding(this);
 4573     if (vlen <= 2) {
 4574       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4575     } else if (VM_Version::supports_avx2()) {
 4576       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4577     } else {
 4578       assert(vlen == 4, "sanity");
 4579       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4580       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4581     }
 4582   %}
 4583   ins_pipe( pipe_slow );
 4584 %}
 4585 
 4586 instruct ReplD_reg(vec dst, vlRegD src) %{
 4587   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4588   match(Set dst (Replicate src));
 4589   format %{ "replicateD $dst,$src" %}
 4590   ins_encode %{
 4591     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4592   %}
 4593   ins_pipe( pipe_slow );
 4594 %}
 4595 
 4596 instruct ReplD_mem(vec dst, memory mem) %{
 4597   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4598   match(Set dst (Replicate (LoadD mem)));
 4599   format %{ "replicateD $dst,$mem" %}
 4600   ins_encode %{
 4601     if (Matcher::vector_length(this) >= 4) {
 4602       int vlen_enc = vector_length_encoding(this);
 4603       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4604     } else {
 4605       __ movddup($dst$$XMMRegister, $mem$$Address);
 4606     }
 4607   %}
 4608   ins_pipe( pipe_slow );
 4609 %}
 4610 
 4611 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4612 instruct ReplD_imm(vec dst, immD con) %{
 4613   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4614   match(Set dst (Replicate con));
 4615   format %{ "replicateD $dst,$con" %}
 4616   ins_encode %{
 4617     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4618     int vlen = Matcher::vector_length_in_bytes(this);
 4619     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4620   %}
 4621   ins_pipe( pipe_slow );
 4622 %}
 4623 
 4624 instruct ReplD_zero(vec dst, immD0 zero) %{
 4625   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4626   match(Set dst (Replicate zero));
 4627   format %{ "replicateD $dst,$zero" %}
 4628   ins_encode %{
 4629     int vlen_enc = vector_length_encoding(this);
 4630     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4631       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4632     } else {
 4633       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4634     }
 4635   %}
 4636   ins_pipe( fpu_reg_reg );
 4637 %}
 4638 
 4639 // ====================VECTOR INSERT=======================================
 4640 
 4641 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4642   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4643   match(Set dst (VectorInsert (Binary dst val) idx));
 4644   format %{ "vector_insert $dst,$val,$idx" %}
 4645   ins_encode %{
 4646     assert(UseSSE >= 4, "required");
 4647     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4648 
 4649     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4650 
 4651     assert(is_integral_type(elem_bt), "");
 4652     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4653 
 4654     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4655   %}
 4656   ins_pipe( pipe_slow );
 4657 %}
 4658 
 4659 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4660   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4661   match(Set dst (VectorInsert (Binary src val) idx));
 4662   effect(TEMP vtmp);
 4663   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4664   ins_encode %{
 4665     int vlen_enc = Assembler::AVX_256bit;
 4666     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4667     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4668     int log2epr = log2(elem_per_lane);
 4669 
 4670     assert(is_integral_type(elem_bt), "sanity");
 4671     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4672 
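          // y_idx selects the 128-bit lane of the 256-bit vector, x_idx the element within
          // that lane; the lane is extracted, patched with $val, and reinserted.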
 4673     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4674     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4675     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4676     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4677     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4678   %}
 4679   ins_pipe( pipe_slow );
 4680 %}
 4681 
 4682 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4683   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4684   match(Set dst (VectorInsert (Binary src val) idx));
 4685   effect(TEMP vtmp);
 4686   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4687   ins_encode %{
 4688     assert(UseAVX > 2, "sanity");
 4689 
 4690     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4691     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4692     int log2epr = log2(elem_per_lane);
 4693 
 4694     assert(is_integral_type(elem_bt), "");
 4695     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4696 
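          // Same lane-patching scheme as insert32, but a 512-bit vector has four 128-bit
          // lanes, so y_idx is two bits wide.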
 4697     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4698     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4699     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4700     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4701     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4702   %}
 4703   ins_pipe( pipe_slow );
 4704 %}
 4705 
 4706 #ifdef _LP64
 4707 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4708   predicate(Matcher::vector_length(n) == 2);
 4709   match(Set dst (VectorInsert (Binary dst val) idx));
 4710   format %{ "vector_insert $dst,$val,$idx" %}
 4711   ins_encode %{
 4712     assert(UseSSE >= 4, "required");
 4713     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4714     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4715 
 4716     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4717   %}
 4718   ins_pipe( pipe_slow );
 4719 %}
 4720 
 4721 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4722   predicate(Matcher::vector_length(n) == 4);
 4723   match(Set dst (VectorInsert (Binary src val) idx));
 4724   effect(TEMP vtmp);
 4725   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4726   ins_encode %{
 4727     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4728     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4729 
 4730     uint x_idx = $idx$$constant & right_n_bits(1);
 4731     uint y_idx = ($idx$$constant >> 1) & 1;
 4732     int vlen_enc = Assembler::AVX_256bit;
 4733     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4734     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4735     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4736   %}
 4737   ins_pipe( pipe_slow );
 4738 %}
 4739 
 4740 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4741   predicate(Matcher::vector_length(n) == 8);
 4742   match(Set dst (VectorInsert (Binary src val) idx));
 4743   effect(TEMP vtmp);
 4744   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4745   ins_encode %{
 4746     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4747     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4748 
 4749     uint x_idx = $idx$$constant & right_n_bits(1);
 4750     uint y_idx = ($idx$$constant >> 1) & 3;
 4751     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4752     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4753     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4754   %}
 4755   ins_pipe( pipe_slow );
 4756 %}
 4757 #endif
 4758 
 4759 instruct insertF(vec dst, regF val, immU8 idx) %{
 4760   predicate(Matcher::vector_length(n) < 8);
 4761   match(Set dst (VectorInsert (Binary dst val) idx));
 4762   format %{ "vector_insert $dst,$val,$idx" %}
 4763   ins_encode %{
 4764     assert(UseSSE >= 4, "sanity");
 4765 
 4766     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4767     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4768 
 4769     uint x_idx = $idx$$constant & right_n_bits(2);
 4770     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4771   %}
 4772   ins_pipe( pipe_slow );
 4773 %}
 4774 
 4775 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4776   predicate(Matcher::vector_length(n) >= 8);
 4777   match(Set dst (VectorInsert (Binary src val) idx));
 4778   effect(TEMP vtmp);
 4779   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4780   ins_encode %{
 4781     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4782     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4783 
 4784     int vlen = Matcher::vector_length(this);
 4785     uint x_idx = $idx$$constant & right_n_bits(2);
 4786     if (vlen == 8) {
 4787       uint y_idx = ($idx$$constant >> 2) & 1;
 4788       int vlen_enc = Assembler::AVX_256bit;
 4789       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4790       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4791       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4792     } else {
 4793       assert(vlen == 16, "sanity");
 4794       uint y_idx = ($idx$$constant >> 2) & 3;
 4795       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4796       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4797       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4798     }
 4799   %}
 4800   ins_pipe( pipe_slow );
 4801 %}
 4802 
 4803 #ifdef _LP64
 4804 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4805   predicate(Matcher::vector_length(n) == 2);
 4806   match(Set dst (VectorInsert (Binary dst val) idx));
 4807   effect(TEMP tmp);
 4808   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4809   ins_encode %{
 4810     assert(UseSSE >= 4, "sanity");
 4811     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4812     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4813 
 4814     __ movq($tmp$$Register, $val$$XMMRegister);
 4815     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4816   %}
 4817   ins_pipe( pipe_slow );
 4818 %}
 4819 
 4820 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4821   predicate(Matcher::vector_length(n) == 4);
 4822   match(Set dst (VectorInsert (Binary src val) idx));
 4823   effect(TEMP vtmp, TEMP tmp);
 4824   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4825   ins_encode %{
 4826     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4827     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4828 
 4829     uint x_idx = $idx$$constant & right_n_bits(1);
 4830     uint y_idx = ($idx$$constant >> 1) & 1;
 4831     int vlen_enc = Assembler::AVX_256bit;
 4832     __ movq($tmp$$Register, $val$$XMMRegister);
 4833     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4834     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4835     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4836   %}
 4837   ins_pipe( pipe_slow );
 4838 %}
 4839 
 4840 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4841   predicate(Matcher::vector_length(n) == 8);
 4842   match(Set dst (VectorInsert (Binary src val) idx));
 4843   effect(TEMP tmp, TEMP vtmp);
 4844   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4845   ins_encode %{
 4846     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4847     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4848 
 4849     uint x_idx = $idx$$constant & right_n_bits(1);
 4850     uint y_idx = ($idx$$constant >> 1) & 3;
 4851     __ movq($tmp$$Register, $val$$XMMRegister);
 4852     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4853     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4854     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4855   %}
 4856   ins_pipe( pipe_slow );
 4857 %}
 4858 #endif
 4859 
 4860 // ====================REDUCTION ARITHMETIC=======================================
 4861 
 4862 // =======================Int Reduction==========================================
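      // A reduction folds all lanes of the vector src2 into a scalar and combines that
      // scalar with the accumulator src1 using the node's operation.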
 4863 
 4864 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4865   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4866   match(Set dst (AddReductionVI src1 src2));
 4867   match(Set dst (MulReductionVI src1 src2));
 4868   match(Set dst (AndReductionV  src1 src2));
 4869   match(Set dst ( OrReductionV  src1 src2));
 4870   match(Set dst (XorReductionV  src1 src2));
 4871   match(Set dst (MinReductionV  src1 src2));
 4872   match(Set dst (MaxReductionV  src1 src2));
 4873   effect(TEMP vtmp1, TEMP vtmp2);
 4874   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4875   ins_encode %{
 4876     int opcode = this->ideal_Opcode();
 4877     int vlen = Matcher::vector_length(this, $src2);
 4878     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4879   %}
 4880   ins_pipe( pipe_slow );
 4881 %}
 4882 
 4883 // =======================Long Reduction==========================================
 4884 
 4885 #ifdef _LP64
 4886 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4887   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4888   match(Set dst (AddReductionVL src1 src2));
 4889   match(Set dst (MulReductionVL src1 src2));
 4890   match(Set dst (AndReductionV  src1 src2));
 4891   match(Set dst ( OrReductionV  src1 src2));
 4892   match(Set dst (XorReductionV  src1 src2));
 4893   match(Set dst (MinReductionV  src1 src2));
 4894   match(Set dst (MaxReductionV  src1 src2));
 4895   effect(TEMP vtmp1, TEMP vtmp2);
 4896   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4897   ins_encode %{
 4898     int opcode = this->ideal_Opcode();
 4899     int vlen = Matcher::vector_length(this, $src2);
 4900     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4901   %}
 4902   ins_pipe( pipe_slow );
 4903 %}
 4904 
 4905 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4906   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4907   match(Set dst (AddReductionVL src1 src2));
 4908   match(Set dst (MulReductionVL src1 src2));
 4909   match(Set dst (AndReductionV  src1 src2));
 4910   match(Set dst ( OrReductionV  src1 src2));
 4911   match(Set dst (XorReductionV  src1 src2));
 4912   match(Set dst (MinReductionV  src1 src2));
 4913   match(Set dst (MaxReductionV  src1 src2));
 4914   effect(TEMP vtmp1, TEMP vtmp2);
 4915   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4916   ins_encode %{
 4917     int opcode = this->ideal_Opcode();
 4918     int vlen = Matcher::vector_length(this, $src2);
 4919     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4920   %}
 4921   ins_pipe( pipe_slow );
 4922 %}
 4923 #endif // _LP64
 4924 
 4925 // =======================Float Reduction==========================================
 4926 
 4927 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4928   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4929   match(Set dst (AddReductionVF dst src));
 4930   match(Set dst (MulReductionVF dst src));
 4931   effect(TEMP dst, TEMP vtmp);
 4932   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4933   ins_encode %{
 4934     int opcode = this->ideal_Opcode();
 4935     int vlen = Matcher::vector_length(this, $src);
 4936     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4937   %}
 4938   ins_pipe( pipe_slow );
 4939 %}
 4940 
 4941 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4942   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4943   match(Set dst (AddReductionVF dst src));
 4944   match(Set dst (MulReductionVF dst src));
 4945   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4946   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4947   ins_encode %{
 4948     int opcode = this->ideal_Opcode();
 4949     int vlen = Matcher::vector_length(this, $src);
 4950     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4951   %}
 4952   ins_pipe( pipe_slow );
 4953 %}
 4954 
 4955 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4956   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4957   match(Set dst (AddReductionVF dst src));
 4958   match(Set dst (MulReductionVF dst src));
 4959   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4960   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4961   ins_encode %{
 4962     int opcode = this->ideal_Opcode();
 4963     int vlen = Matcher::vector_length(this, $src);
 4964     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4965   %}
 4966   ins_pipe( pipe_slow );
 4967 %}
 4968 
 4969 // =======================Double Reduction==========================================
 4970 
 4971 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4972   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4973   match(Set dst (AddReductionVD dst src));
 4974   match(Set dst (MulReductionVD dst src));
 4975   effect(TEMP dst, TEMP vtmp);
 4976   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4977   ins_encode %{
 4978     int opcode = this->ideal_Opcode();
 4979     int vlen = Matcher::vector_length(this, $src);
 4980     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4981   %}
 4982   ins_pipe( pipe_slow );
 4983 %}
 4984 
 4985 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4986   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4987   match(Set dst (AddReductionVD dst src));
 4988   match(Set dst (MulReductionVD dst src));
 4989   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4990   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4991   ins_encode %{
 4992     int opcode = this->ideal_Opcode();
 4993     int vlen = Matcher::vector_length(this, $src);
 4994     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4995   %}
 4996   ins_pipe( pipe_slow );
 4997 %}
 4998 
 4999 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5000   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 5001   match(Set dst (AddReductionVD dst src));
 5002   match(Set dst (MulReductionVD dst src));
 5003   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5004   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5005   ins_encode %{
 5006     int opcode = this->ideal_Opcode();
 5007     int vlen = Matcher::vector_length(this, $src);
 5008     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5009   %}
 5010   ins_pipe( pipe_slow );
 5011 %}
 5012 
 5013 // =======================Byte Reduction==========================================
 5014 
 5015 #ifdef _LP64
 5016 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5017   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5018   match(Set dst (AddReductionVI src1 src2));
 5019   match(Set dst (AndReductionV  src1 src2));
 5020   match(Set dst ( OrReductionV  src1 src2));
 5021   match(Set dst (XorReductionV  src1 src2));
 5022   match(Set dst (MinReductionV  src1 src2));
 5023   match(Set dst (MaxReductionV  src1 src2));
 5024   effect(TEMP vtmp1, TEMP vtmp2);
 5025   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5026   ins_encode %{
 5027     int opcode = this->ideal_Opcode();
 5028     int vlen = Matcher::vector_length(this, $src2);
 5029     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5030   %}
 5031   ins_pipe( pipe_slow );
 5032 %}
 5033 
 5034 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5035   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5036   match(Set dst (AddReductionVI src1 src2));
 5037   match(Set dst (AndReductionV  src1 src2));
 5038   match(Set dst ( OrReductionV  src1 src2));
 5039   match(Set dst (XorReductionV  src1 src2));
 5040   match(Set dst (MinReductionV  src1 src2));
 5041   match(Set dst (MaxReductionV  src1 src2));
 5042   effect(TEMP vtmp1, TEMP vtmp2);
 5043   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5044   ins_encode %{
 5045     int opcode = this->ideal_Opcode();
 5046     int vlen = Matcher::vector_length(this, $src2);
 5047     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5048   %}
 5049   ins_pipe( pipe_slow );
 5050 %}
 5051 #endif
 5052 
 5053 // =======================Short Reduction==========================================
 5054 
 5055 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5056   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5057   match(Set dst (AddReductionVI src1 src2));
 5058   match(Set dst (MulReductionVI src1 src2));
 5059   match(Set dst (AndReductionV  src1 src2));
 5060   match(Set dst ( OrReductionV  src1 src2));
 5061   match(Set dst (XorReductionV  src1 src2));
 5062   match(Set dst (MinReductionV  src1 src2));
 5063   match(Set dst (MaxReductionV  src1 src2));
 5064   effect(TEMP vtmp1, TEMP vtmp2);
 5065   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5066   ins_encode %{
 5067     int opcode = this->ideal_Opcode();
 5068     int vlen = Matcher::vector_length(this, $src2);
 5069     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5070   %}
 5071   ins_pipe( pipe_slow );
 5072 %}
 5073 
 5074 // =======================Mul Reduction==========================================
 5075 
 5076 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5077   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5078             Matcher::vector_length(n->in(2)) <= 32); // src2
 5079   match(Set dst (MulReductionVI src1 src2));
 5080   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5081   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5082   ins_encode %{
 5083     int opcode = this->ideal_Opcode();
 5084     int vlen = Matcher::vector_length(this, $src2);
 5085     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5086   %}
 5087   ins_pipe( pipe_slow );
 5088 %}
 5089 
 5090 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5091   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5092             Matcher::vector_length(n->in(2)) == 64); // src2
 5093   match(Set dst (MulReductionVI src1 src2));
 5094   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5095   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5096   ins_encode %{
 5097     int opcode = this->ideal_Opcode();
 5098     int vlen = Matcher::vector_length(this, $src2);
 5099     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5100   %}
 5101   ins_pipe( pipe_slow );
 5102 %}
 5103 
 5104 //--------------------Min/Max Float Reduction --------------------
 5105 // Float Min/Max Reduction
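      // These patterns require the scalar input to be the reduction's identity value
      // (+Inf for min, -Inf for max, enforced by the predicate), so $src1 is ignored
      // by the generated code.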
 5106 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5107                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5108   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5109             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5110              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5111             Matcher::vector_length(n->in(2)) == 2);
 5112   match(Set dst (MinReductionV src1 src2));
 5113   match(Set dst (MaxReductionV src1 src2));
 5114   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5115   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5116   ins_encode %{
 5117     assert(UseAVX > 0, "sanity");
 5118 
 5119     int opcode = this->ideal_Opcode();
 5120     int vlen = Matcher::vector_length(this, $src2);
 5121     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5122                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5123   %}
 5124   ins_pipe( pipe_slow );
 5125 %}
 5126 
 5127 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5128                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5129   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5130             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5131              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5132             Matcher::vector_length(n->in(2)) >= 4);
 5133   match(Set dst (MinReductionV src1 src2));
 5134   match(Set dst (MaxReductionV src1 src2));
 5135   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5136   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5137   ins_encode %{
 5138     assert(UseAVX > 0, "sanity");
 5139 
 5140     int opcode = this->ideal_Opcode();
 5141     int vlen = Matcher::vector_length(this, $src2);
 5142     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5143                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5144   %}
 5145   ins_pipe( pipe_slow );
 5146 %}
 5147 
 5148 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5149                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5150   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5151             Matcher::vector_length(n->in(2)) == 2);
 5152   match(Set dst (MinReductionV dst src));
 5153   match(Set dst (MaxReductionV dst src));
 5154   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5155   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5156   ins_encode %{
 5157     assert(UseAVX > 0, "sanity");
 5158 
 5159     int opcode = this->ideal_Opcode();
 5160     int vlen = Matcher::vector_length(this, $src);
 5161     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5162                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5163   %}
 5164   ins_pipe( pipe_slow );
 5165 %}
 5166 
 5167 
 5168 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5169                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5170   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5171             Matcher::vector_length(n->in(2)) >= 4);
 5172   match(Set dst (MinReductionV dst src));
 5173   match(Set dst (MaxReductionV dst src));
 5174   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5175   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5176   ins_encode %{
 5177     assert(UseAVX > 0, "sanity");
 5178 
 5179     int opcode = this->ideal_Opcode();
 5180     int vlen = Matcher::vector_length(this, $src);
 5181     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5182                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5183   %}
 5184   ins_pipe( pipe_slow );
 5185 %}
 5186 
 5187 
 5188 //--------------------Min/Max Double Reduction --------------------
 5189 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5190                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5191                             rFlagsReg cr) %{
 5192   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5193             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5194              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5195             Matcher::vector_length(n->in(2)) == 2);
 5196   match(Set dst (MinReductionV src1 src2));
 5197   match(Set dst (MaxReductionV src1 src2));
 5198   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5199   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5200   ins_encode %{
 5201     assert(UseAVX > 0, "sanity");
 5202 
 5203     int opcode = this->ideal_Opcode();
 5204     int vlen = Matcher::vector_length(this, $src2);
 5205     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5206                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5207   %}
 5208   ins_pipe( pipe_slow );
 5209 %}
 5210 
 5211 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5212                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5213                            rFlagsReg cr) %{
 5214   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5215             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5216              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5217             Matcher::vector_length(n->in(2)) >= 4);
 5218   match(Set dst (MinReductionV src1 src2));
 5219   match(Set dst (MaxReductionV src1 src2));
 5220   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5221   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5222   ins_encode %{
 5223     assert(UseAVX > 0, "sanity");
 5224 
 5225     int opcode = this->ideal_Opcode();
 5226     int vlen = Matcher::vector_length(this, $src2);
 5227     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5228                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5229   %}
 5230   ins_pipe( pipe_slow );
 5231 %}
 5232 
 5233 
 5234 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5235                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5236                                rFlagsReg cr) %{
 5237   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5238             Matcher::vector_length(n->in(2)) == 2);
 5239   match(Set dst (MinReductionV dst src));
 5240   match(Set dst (MaxReductionV dst src));
 5241   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5242   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5243   ins_encode %{
 5244     assert(UseAVX > 0, "sanity");
 5245 
 5246     int opcode = this->ideal_Opcode();
 5247     int vlen = Matcher::vector_length(this, $src);
 5248     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5249                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5250   %}
 5251   ins_pipe( pipe_slow );
 5252 %}
 5253 
 5254 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5255                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5256                               rFlagsReg cr) %{
 5257   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5258             Matcher::vector_length(n->in(2)) >= 4);
 5259   match(Set dst (MinReductionV dst src));
 5260   match(Set dst (MaxReductionV dst src));
 5261   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5262   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5263   ins_encode %{
 5264     assert(UseAVX > 0, "sanity");
 5265 
 5266     int opcode = this->ideal_Opcode();
 5267     int vlen = Matcher::vector_length(this, $src);
 5268     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5269                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5270   %}
 5271   ins_pipe( pipe_slow );
 5272 %}
 5273 
 5274 // ====================VECTOR ARITHMETIC=======================================
 5275 
 5276 // --------------------------------- ADD --------------------------------------
 5277 
 5278 // Bytes vector add
 5279 instruct vaddB(vec dst, vec src) %{
 5280   predicate(UseAVX == 0);
 5281   match(Set dst (AddVB dst src));
 5282   format %{ "paddb   $dst,$src\t! add packedB" %}
 5283   ins_encode %{
 5284     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5285   %}
 5286   ins_pipe( pipe_slow );
 5287 %}
 5288 
 5289 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5290   predicate(UseAVX > 0);
 5291   match(Set dst (AddVB src1 src2));
 5292   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5293   ins_encode %{
 5294     int vlen_enc = vector_length_encoding(this);
 5295     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5296   %}
 5297   ins_pipe( pipe_slow );
 5298 %}
 5299 
 5300 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5301   predicate((UseAVX > 0) &&
 5302             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5303   match(Set dst (AddVB src (LoadVector mem)));
 5304   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5305   ins_encode %{
 5306     int vlen_enc = vector_length_encoding(this);
 5307     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5308   %}
 5309   ins_pipe( pipe_slow );
 5310 %}
 5311 
 5312 // Shorts/Chars vector add
 5313 instruct vaddS(vec dst, vec src) %{
 5314   predicate(UseAVX == 0);
 5315   match(Set dst (AddVS dst src));
 5316   format %{ "paddw   $dst,$src\t! add packedS" %}
 5317   ins_encode %{
 5318     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5319   %}
 5320   ins_pipe( pipe_slow );
 5321 %}
 5322 
 5323 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5324   predicate(UseAVX > 0);
 5325   match(Set dst (AddVS src1 src2));
 5326   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5327   ins_encode %{
 5328     int vlen_enc = vector_length_encoding(this);
 5329     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5330   %}
 5331   ins_pipe( pipe_slow );
 5332 %}
 5333 
 5334 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5335   predicate((UseAVX > 0) &&
 5336             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5337   match(Set dst (AddVS src (LoadVector mem)));
 5338   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5339   ins_encode %{
 5340     int vlen_enc = vector_length_encoding(this);
 5341     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5342   %}
 5343   ins_pipe( pipe_slow );
 5344 %}
 5345 
 5346 // Integers vector add
 5347 instruct vaddI(vec dst, vec src) %{
 5348   predicate(UseAVX == 0);
 5349   match(Set dst (AddVI dst src));
 5350   format %{ "paddd   $dst,$src\t! add packedI" %}
 5351   ins_encode %{
 5352     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5353   %}
 5354   ins_pipe( pipe_slow );
 5355 %}
 5356 
 5357 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5358   predicate(UseAVX > 0);
 5359   match(Set dst (AddVI src1 src2));
 5360   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5361   ins_encode %{
 5362     int vlen_enc = vector_length_encoding(this);
 5363     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5364   %}
 5365   ins_pipe( pipe_slow );
 5366 %}
 5367 
 5368 
 5369 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5370   predicate((UseAVX > 0) &&
 5371             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5372   match(Set dst (AddVI src (LoadVector mem)));
 5373   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5374   ins_encode %{
 5375     int vlen_enc = vector_length_encoding(this);
 5376     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5377   %}
 5378   ins_pipe( pipe_slow );
 5379 %}
 5380 
 5381 // Longs vector add
 5382 instruct vaddL(vec dst, vec src) %{
 5383   predicate(UseAVX == 0);
 5384   match(Set dst (AddVL dst src));
 5385   format %{ "paddq   $dst,$src\t! add packedL" %}
 5386   ins_encode %{
 5387     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5388   %}
 5389   ins_pipe( pipe_slow );
 5390 %}
 5391 
 5392 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5393   predicate(UseAVX > 0);
 5394   match(Set dst (AddVL src1 src2));
 5395   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5396   ins_encode %{
 5397     int vlen_enc = vector_length_encoding(this);
 5398     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5399   %}
 5400   ins_pipe( pipe_slow );
 5401 %}
 5402 
 5403 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5404   predicate((UseAVX > 0) &&
 5405             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5406   match(Set dst (AddVL src (LoadVector mem)));
 5407   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5408   ins_encode %{
 5409     int vlen_enc = vector_length_encoding(this);
 5410     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5411   %}
 5412   ins_pipe( pipe_slow );
 5413 %}
 5414 
 5415 // Floats vector add
 5416 instruct vaddF(vec dst, vec src) %{
 5417   predicate(UseAVX == 0);
 5418   match(Set dst (AddVF dst src));
 5419   format %{ "addps   $dst,$src\t! add packedF" %}
 5420   ins_encode %{
 5421     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5422   %}
 5423   ins_pipe( pipe_slow );
 5424 %}
 5425 
 5426 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5427   predicate(UseAVX > 0);
 5428   match(Set dst (AddVF src1 src2));
 5429   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5430   ins_encode %{
 5431     int vlen_enc = vector_length_encoding(this);
 5432     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5433   %}
 5434   ins_pipe( pipe_slow );
 5435 %}
 5436 
 5437 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5438   predicate((UseAVX > 0) &&
 5439             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5440   match(Set dst (AddVF src (LoadVector mem)));
 5441   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5442   ins_encode %{
 5443     int vlen_enc = vector_length_encoding(this);
 5444     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 // Doubles vector add
 5450 instruct vaddD(vec dst, vec src) %{
 5451   predicate(UseAVX == 0);
 5452   match(Set dst (AddVD dst src));
 5453   format %{ "addpd   $dst,$src\t! add packedD" %}
 5454   ins_encode %{
 5455     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5456   %}
 5457   ins_pipe( pipe_slow );
 5458 %}
 5459 
 5460 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5461   predicate(UseAVX > 0);
 5462   match(Set dst (AddVD src1 src2));
 5463   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5464   ins_encode %{
 5465     int vlen_enc = vector_length_encoding(this);
 5466     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5467   %}
 5468   ins_pipe( pipe_slow );
 5469 %}
 5470 
 5471 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5472   predicate((UseAVX > 0) &&
 5473             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5474   match(Set dst (AddVD src (LoadVector mem)));
 5475   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5476   ins_encode %{
 5477     int vlen_enc = vector_length_encoding(this);
 5478     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5479   %}
 5480   ins_pipe( pipe_slow );
 5481 %}
 5482 
 5483 // --------------------------------- SUB --------------------------------------
 5484 
 5485 // Bytes vector sub
 5486 instruct vsubB(vec dst, vec src) %{
 5487   predicate(UseAVX == 0);
 5488   match(Set dst (SubVB dst src));
 5489   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5490   ins_encode %{
 5491     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5492   %}
 5493   ins_pipe( pipe_slow );
 5494 %}
 5495 
 5496 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5497   predicate(UseAVX > 0);
 5498   match(Set dst (SubVB src1 src2));
 5499   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5500   ins_encode %{
 5501     int vlen_enc = vector_length_encoding(this);
 5502     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5503   %}
 5504   ins_pipe( pipe_slow );
 5505 %}
 5506 
 5507 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5508   predicate((UseAVX > 0) &&
 5509             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5510   match(Set dst (SubVB src (LoadVector mem)));
 5511   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5512   ins_encode %{
 5513     int vlen_enc = vector_length_encoding(this);
 5514     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5515   %}
 5516   ins_pipe( pipe_slow );
 5517 %}
 5518 
 5519 // Shorts/Chars vector sub
 5520 instruct vsubS(vec dst, vec src) %{
 5521   predicate(UseAVX == 0);
 5522   match(Set dst (SubVS dst src));
 5523   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5524   ins_encode %{
 5525     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5526   %}
 5527   ins_pipe( pipe_slow );
 5528 %}
 5529 
 5530 
 5531 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5532   predicate(UseAVX > 0);
 5533   match(Set dst (SubVS src1 src2));
 5534   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5535   ins_encode %{
 5536     int vlen_enc = vector_length_encoding(this);
 5537     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5543   predicate((UseAVX > 0) &&
 5544             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5545   match(Set dst (SubVS src (LoadVector mem)));
 5546   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5547   ins_encode %{
 5548     int vlen_enc = vector_length_encoding(this);
 5549     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5550   %}
 5551   ins_pipe( pipe_slow );
 5552 %}
 5553 
 5554 // Integers vector sub
 5555 instruct vsubI(vec dst, vec src) %{
 5556   predicate(UseAVX == 0);
 5557   match(Set dst (SubVI dst src));
 5558   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5559   ins_encode %{
 5560     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5561   %}
 5562   ins_pipe( pipe_slow );
 5563 %}
 5564 
 5565 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5566   predicate(UseAVX > 0);
 5567   match(Set dst (SubVI src1 src2));
 5568   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5569   ins_encode %{
 5570     int vlen_enc = vector_length_encoding(this);
 5571     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5572   %}
 5573   ins_pipe( pipe_slow );
 5574 %}
 5575 
 5576 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5577   predicate((UseAVX > 0) &&
 5578             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5579   match(Set dst (SubVI src (LoadVector mem)));
 5580   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5581   ins_encode %{
 5582     int vlen_enc = vector_length_encoding(this);
 5583     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5584   %}
 5585   ins_pipe( pipe_slow );
 5586 %}
 5587 
 5588 // Longs vector sub
 5589 instruct vsubL(vec dst, vec src) %{
 5590   predicate(UseAVX == 0);
 5591   match(Set dst (SubVL dst src));
 5592   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5593   ins_encode %{
 5594     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5595   %}
 5596   ins_pipe( pipe_slow );
 5597 %}
 5598 
 5599 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5600   predicate(UseAVX > 0);
 5601   match(Set dst (SubVL src1 src2));
 5602   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5603   ins_encode %{
 5604     int vlen_enc = vector_length_encoding(this);
 5605     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5606   %}
 5607   ins_pipe( pipe_slow );
 5608 %}
 5609 
 5610 
 5611 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5612   predicate((UseAVX > 0) &&
 5613             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5614   match(Set dst (SubVL src (LoadVector mem)));
 5615   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5616   ins_encode %{
 5617     int vlen_enc = vector_length_encoding(this);
 5618     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5619   %}
 5620   ins_pipe( pipe_slow );
 5621 %}
 5622 
 5623 // Floats vector sub
 5624 instruct vsubF(vec dst, vec src) %{
 5625   predicate(UseAVX == 0);
 5626   match(Set dst (SubVF dst src));
 5627   format %{ "subps   $dst,$src\t! sub packedF" %}
 5628   ins_encode %{
 5629     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5630   %}
 5631   ins_pipe( pipe_slow );
 5632 %}
 5633 
 5634 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5635   predicate(UseAVX > 0);
 5636   match(Set dst (SubVF src1 src2));
 5637   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5638   ins_encode %{
 5639     int vlen_enc = vector_length_encoding(this);
 5640     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5646   predicate((UseAVX > 0) &&
 5647             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5648   match(Set dst (SubVF src (LoadVector mem)));
 5649   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5650   ins_encode %{
 5651     int vlen_enc = vector_length_encoding(this);
 5652     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5653   %}
 5654   ins_pipe( pipe_slow );
 5655 %}
 5656 
 5657 // Doubles vector sub
 5658 instruct vsubD(vec dst, vec src) %{
 5659   predicate(UseAVX == 0);
 5660   match(Set dst (SubVD dst src));
 5661   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5662   ins_encode %{
 5663     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5664   %}
 5665   ins_pipe( pipe_slow );
 5666 %}
 5667 
 5668 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5669   predicate(UseAVX > 0);
 5670   match(Set dst (SubVD src1 src2));
 5671   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5672   ins_encode %{
 5673     int vlen_enc = vector_length_encoding(this);
 5674     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5675   %}
 5676   ins_pipe( pipe_slow );
 5677 %}
 5678 
 5679 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5680   predicate((UseAVX > 0) &&
 5681             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5682   match(Set dst (SubVD src (LoadVector mem)));
 5683   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5684   ins_encode %{
 5685     int vlen_enc = vector_length_encoding(this);
 5686     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5687   %}
 5688   ins_pipe( pipe_slow );
 5689 %}
 5690 
 5691 // --------------------------------- MUL --------------------------------------
 5692 
 5693 // Byte vector mul
 5694 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5695   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5696   match(Set dst (MulVB src1 src2));
 5697   effect(TEMP dst, TEMP xtmp);
 5698   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5699   ins_encode %{
 5700     assert(UseSSE > 3, "required");
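          // Widen the bytes to words, multiply as words, then clear the high byte of
          // each 16-bit product and pack the low bytes back together.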
 5701     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5702     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5703     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5704     __ psllw($dst$$XMMRegister, 8);
 5705     __ psrlw($dst$$XMMRegister, 8);
 5706     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5707   %}
 5708   ins_pipe( pipe_slow );
 5709 %}
 5710 
 5711 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5712   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5713   match(Set dst (MulVB src1 src2));
 5714   effect(TEMP dst, TEMP xtmp);
 5715   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5716   ins_encode %{
 5717     assert(UseSSE > 3, "required");
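          // There is no byte multiply instruction, so multiply the odd and even byte
          // lanes separately as 16-bit words and merge the low bytes of the products.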
 5718     // Odd-index elements
 5719     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5720     __ psrlw($dst$$XMMRegister, 8);
 5721     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5722     __ psrlw($xtmp$$XMMRegister, 8);
 5723     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5724     __ psllw($dst$$XMMRegister, 8);
 5725     // Even-index elements
 5726     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5727     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5728     __ psllw($xtmp$$XMMRegister, 8);
 5729     __ psrlw($xtmp$$XMMRegister, 8);
 5730     // Combine
 5731     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5732   %}
 5733   ins_pipe( pipe_slow );
 5734 %}
 5735 
 5736 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5737   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5738   match(Set dst (MulVB src1 src2));
 5739   effect(TEMP xtmp1, TEMP xtmp2);
 5740   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5741   ins_encode %{
 5742     int vlen_enc = vector_length_encoding(this);
 5743     // Odd-index elements
 5744     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5745     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5746     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5747     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5748     // Even-index elements
 5749     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5750     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5751     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5752     // Combine
 5753     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5754   %}
 5755   ins_pipe( pipe_slow );
 5756 %}
 5757 
 5758 // Shorts/Chars vector mul
 5759 instruct vmulS(vec dst, vec src) %{
 5760   predicate(UseAVX == 0);
 5761   match(Set dst (MulVS dst src));
 5762   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5763   ins_encode %{
 5764     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5765   %}
 5766   ins_pipe( pipe_slow );
 5767 %}
 5768 
 5769 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5770   predicate(UseAVX > 0);
 5771   match(Set dst (MulVS src1 src2));
 5772   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5773   ins_encode %{
 5774     int vlen_enc = vector_length_encoding(this);
 5775     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5776   %}
 5777   ins_pipe( pipe_slow );
 5778 %}
 5779 
 5780 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5781   predicate((UseAVX > 0) &&
 5782             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5783   match(Set dst (MulVS src (LoadVector mem)));
 5784   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5785   ins_encode %{
 5786     int vlen_enc = vector_length_encoding(this);
 5787     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5788   %}
 5789   ins_pipe( pipe_slow );
 5790 %}
 5791 
 5792 // Integers vector mul
 5793 instruct vmulI(vec dst, vec src) %{
 5794   predicate(UseAVX == 0);
 5795   match(Set dst (MulVI dst src));
 5796   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5797   ins_encode %{
 5798     assert(UseSSE > 3, "required");
 5799     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5800   %}
 5801   ins_pipe( pipe_slow );
 5802 %}
 5803 
 5804 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5805   predicate(UseAVX > 0);
 5806   match(Set dst (MulVI src1 src2));
 5807   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5808   ins_encode %{
 5809     int vlen_enc = vector_length_encoding(this);
 5810     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5811   %}
 5812   ins_pipe( pipe_slow );
 5813 %}
 5814 
 5815 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5816   predicate((UseAVX > 0) &&
 5817             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5818   match(Set dst (MulVI src (LoadVector mem)));
 5819   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5820   ins_encode %{
 5821     int vlen_enc = vector_length_encoding(this);
 5822     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5823   %}
 5824   ins_pipe( pipe_slow );
 5825 %}
 5826 
 5827 // Longs vector mul
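      // A direct 64-bit element multiply (evpmullq) requires AVX512DQ; without it the
      // product is synthesized from 32-bit multiplies (see vmulL/vmulL_reg below).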
 5828 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5829   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5830              VM_Version::supports_avx512dq()) ||
 5831             VM_Version::supports_avx512vldq());
 5832   match(Set dst (MulVL src1 src2));
 5833   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5834   ins_encode %{
 5835     assert(UseAVX > 2, "required");
 5836     int vlen_enc = vector_length_encoding(this);
 5837     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5843   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5844              VM_Version::supports_avx512dq()) ||
 5845             (Matcher::vector_length_in_bytes(n) > 8 &&
 5846              VM_Version::supports_avx512vldq()));
 5847   match(Set dst (MulVL src (LoadVector mem)));
 5848   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5849   ins_encode %{
 5850     assert(UseAVX > 2, "required");
 5851     int vlen_enc = vector_length_encoding(this);
 5852     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5853   %}
 5854   ins_pipe( pipe_slow );
 5855 %}
 5856 
 5857 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5858   predicate(UseAVX == 0);
 5859   match(Set dst (MulVL src1 src2));
 5860   effect(TEMP dst, TEMP xtmp);
 5861   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5862   ins_encode %{
 5863     assert(VM_Version::supports_sse4_1(), "required");
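          // 64x64->64-bit multiply from 32-bit halves:
          //   a*b mod 2^64 = ((a_hi*b_lo + a_lo*b_hi) << 32) + a_lo*b_lo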
 5864     // Get the lo-hi cross products; only their lower 32 bits are needed
 5865     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5866     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5867     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5868     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5869     __ psllq($dst$$XMMRegister, 32);
 5870     // Get the lo-lo products
 5871     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5872     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5873     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5874   %}
 5875   ins_pipe( pipe_slow );
 5876 %}
 5877 
 5878 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5879   predicate(UseAVX > 0 &&
 5880             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5881               !VM_Version::supports_avx512dq()) ||
 5882              (Matcher::vector_length_in_bytes(n) < 64 &&
 5883               !VM_Version::supports_avx512vldq())));
 5884   match(Set dst (MulVL src1 src2));
 5885   effect(TEMP xtmp1, TEMP xtmp2);
 5886   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5887   ins_encode %{
 5888     int vlen_enc = vector_length_encoding(this);
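          // Same 32-bit decomposition as vmulL above, using the three-operand AVX forms.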
 5889     // Get the lo-hi cross products; only their lower 32 bits are needed
 5890     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5891     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5892     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5893     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5894     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5895     // Get the lo-lo products
 5896     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5897     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5898   %}
 5899   ins_pipe( pipe_slow );
 5900 %}
 5901 
 5902 // Floats vector mul
 5903 instruct vmulF(vec dst, vec src) %{
 5904   predicate(UseAVX == 0);
 5905   match(Set dst (MulVF dst src));
 5906   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5907   ins_encode %{
 5908     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5909   %}
 5910   ins_pipe( pipe_slow );
 5911 %}
 5912 
 5913 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5914   predicate(UseAVX > 0);
 5915   match(Set dst (MulVF src1 src2));
 5916   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5917   ins_encode %{
 5918     int vlen_enc = vector_length_encoding(this);
 5919     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5920   %}
 5921   ins_pipe( pipe_slow );
 5922 %}
 5923 
 5924 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5925   predicate((UseAVX > 0) &&
 5926             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5927   match(Set dst (MulVF src (LoadVector mem)));
 5928   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5929   ins_encode %{
 5930     int vlen_enc = vector_length_encoding(this);
 5931     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5932   %}
 5933   ins_pipe( pipe_slow );
 5934 %}
 5935 
 5936 // Doubles vector mul
 5937 instruct vmulD(vec dst, vec src) %{
 5938   predicate(UseAVX == 0);
 5939   match(Set dst (MulVD dst src));
 5940   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5941   ins_encode %{
 5942     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5943   %}
 5944   ins_pipe( pipe_slow );
 5945 %}
 5946 
 5947 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5948   predicate(UseAVX > 0);
 5949   match(Set dst (MulVD src1 src2));
 5950   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5951   ins_encode %{
 5952     int vlen_enc = vector_length_encoding(this);
 5953     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5954   %}
 5955   ins_pipe( pipe_slow );
 5956 %}
 5957 
 5958 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5959   predicate((UseAVX > 0) &&
 5960             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5961   match(Set dst (MulVD src (LoadVector mem)));
 5962   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5963   ins_encode %{
 5964     int vlen_enc = vector_length_encoding(this);
 5965     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5966   %}
 5967   ins_pipe( pipe_slow );
 5968 %}
 5969 
 5970 // --------------------------------- DIV --------------------------------------
 5971 
 5972 // Floats vector div
 5973 instruct vdivF(vec dst, vec src) %{
 5974   predicate(UseAVX == 0);
 5975   match(Set dst (DivVF dst src));
 5976   format %{ "divps   $dst,$src\t! div packedF" %}
 5977   ins_encode %{
 5978     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5979   %}
 5980   ins_pipe( pipe_slow );
 5981 %}
 5982 
 5983 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5984   predicate(UseAVX > 0);
 5985   match(Set dst (DivVF src1 src2));
 5986   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5987   ins_encode %{
 5988     int vlen_enc = vector_length_encoding(this);
 5989     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5990   %}
 5991   ins_pipe( pipe_slow );
 5992 %}
 5993 
 5994 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 5995   predicate((UseAVX > 0) &&
 5996             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5997   match(Set dst (DivVF src (LoadVector mem)));
 5998   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 5999   ins_encode %{
 6000     int vlen_enc = vector_length_encoding(this);
 6001     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6002   %}
 6003   ins_pipe( pipe_slow );
 6004 %}
 6005 
 6006 // Doubles vector div
 6007 instruct vdivD(vec dst, vec src) %{
 6008   predicate(UseAVX == 0);
 6009   match(Set dst (DivVD dst src));
 6010   format %{ "divpd   $dst,$src\t! div packedD" %}
 6011   ins_encode %{
 6012     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6013   %}
 6014   ins_pipe( pipe_slow );
 6015 %}
 6016 
 6017 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6018   predicate(UseAVX > 0);
 6019   match(Set dst (DivVD src1 src2));
 6020   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6021   ins_encode %{
 6022     int vlen_enc = vector_length_encoding(this);
 6023     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6024   %}
 6025   ins_pipe( pipe_slow );
 6026 %}
 6027 
 6028 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6029   predicate((UseAVX > 0) &&
 6030             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6031   match(Set dst (DivVD src (LoadVector mem)));
 6032   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6033   ins_encode %{
 6034     int vlen_enc = vector_length_encoding(this);
 6035     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6036   %}
 6037   ins_pipe( pipe_slow );
 6038 %}
 6039 
 6040 // ------------------------------ MinMax ---------------------------------------
 6041 
 6042 // Byte, Short, Int vector Min/Max
 6043 instruct minmax_reg_sse(vec dst, vec src) %{
 6044   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6045             UseAVX == 0);
 6046   match(Set dst (MinV dst src));
 6047   match(Set dst (MaxV dst src));
 6048   format %{ "vector_minmax  $dst,$src\t!  " %}
 6049   ins_encode %{
 6050     assert(UseSSE >= 4, "required");
 6051 
 6052     int opcode = this->ideal_Opcode();
 6053     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6054     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6055   %}
 6056   ins_pipe( pipe_slow );
 6057 %}
 6058 
 6059 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6060   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6061             UseAVX > 0);
 6062   match(Set dst (MinV src1 src2));
 6063   match(Set dst (MaxV src1 src2));
 6064   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6065   ins_encode %{
 6066     int opcode = this->ideal_Opcode();
 6067     int vlen_enc = vector_length_encoding(this);
 6068     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6069 
 6070     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6071   %}
 6072   ins_pipe( pipe_slow );
 6073 %}
 6074 
 6075 // Long vector Min/Max
 6076 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6077   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6078             UseAVX == 0);
 6079   match(Set dst (MinV dst src));
 6080   match(Set dst (MaxV src dst));
 6081   effect(TEMP dst, TEMP tmp);
 6082   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6083   ins_encode %{
 6084     assert(UseSSE >= 4, "required");
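          // SSE has no packed min/max for longs; the helper compares and blends, and the
          // SSE4.1 blend reads its mask from xmm0 implicitly (hence the rxmm0 temp).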
 6085 
 6086     int opcode = this->ideal_Opcode();
 6087     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6088     assert(elem_bt == T_LONG, "sanity");
 6089 
 6090     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6091   %}
 6092   ins_pipe( pipe_slow );
 6093 %}
 6094 
 6095 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6096   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6097             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6098   match(Set dst (MinV src1 src2));
 6099   match(Set dst (MaxV src1 src2));
 6100   effect(TEMP dst);
 6101   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6102   ins_encode %{
 6103     int vlen_enc = vector_length_encoding(this);
 6104     int opcode = this->ideal_Opcode();
 6105     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6106     assert(elem_bt == T_LONG, "sanity");
 6107 
 6108     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6109   %}
 6110   ins_pipe( pipe_slow );
 6111 %}
 6112 
 6113 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6114   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6115             Matcher::vector_element_basic_type(n) == T_LONG);
 6116   match(Set dst (MinV src1 src2));
 6117   match(Set dst (MaxV src1 src2));
 6118   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6119   ins_encode %{
 6120     assert(UseAVX > 2, "required");
 6121 
 6122     int vlen_enc = vector_length_encoding(this);
 6123     int opcode = this->ideal_Opcode();
 6124     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6125     assert(elem_bt == T_LONG, "sanity");
 6126 
 6127     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6128   %}
 6129   ins_pipe( pipe_slow );
 6130 %}
 6131 
 6132 // Float/Double vector Min/Max
 6133 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6134   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6135             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6136             UseAVX > 0);
 6137   match(Set dst (MinV a b));
 6138   match(Set dst (MaxV a b));
 6139   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6140   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6141   ins_encode %{
 6142     assert(UseAVX > 0, "required");
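          // Java min/max semantics for NaN and -0.0/+0.0 differ from the raw x86 min/max
          // instructions, so the helper blends through the temporaries to get the Java result.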
 6143 
 6144     int opcode = this->ideal_Opcode();
 6145     int vlen_enc = vector_length_encoding(this);
 6146     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6147 
 6148     __ vminmax_fp(opcode, elem_bt,
 6149                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6150                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6151   %}
 6152   ins_pipe( pipe_slow );
 6153 %}
 6154 
 6155 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6156   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6157             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6158   match(Set dst (MinV a b));
 6159   match(Set dst (MaxV a b));
 6160   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6161   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6162   ins_encode %{
 6163     assert(UseAVX > 2, "required");
 6164 
 6165     int opcode = this->ideal_Opcode();
 6166     int vlen_enc = vector_length_encoding(this);
 6167     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6168 
 6169     __ evminmax_fp(opcode, elem_bt,
 6170                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6171                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6172   %}
 6173   ins_pipe( pipe_slow );
 6174 %}
 6175 
 6176 // --------------------------------- Signum/CopySign ---------------------------
 6177 
 6178 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6179   match(Set dst (SignumF dst (Binary zero one)));
 6180   effect(KILL cr);
 6181   format %{ "signumF $dst, $dst" %}
 6182   ins_encode %{
 6183     int opcode = this->ideal_Opcode();
 6184     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6185   %}
 6186   ins_pipe( pipe_slow );
 6187 %}
 6188 
 6189 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6190   match(Set dst (SignumD dst (Binary zero one)));
 6191   effect(KILL cr);
 6192   format %{ "signumD $dst, $dst" %}
 6193   ins_encode %{
 6194     int opcode = this->ideal_Opcode();
 6195     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6196   %}
 6197   ins_pipe( pipe_slow );
 6198 %}
 6199 
 6200 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6201   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6202   match(Set dst (SignumVF src (Binary zero one)));
 6203   match(Set dst (SignumVD src (Binary zero one)));
 6204   effect(TEMP dst, TEMP xtmp1);
 6205   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6206   ins_encode %{
 6207     int opcode = this->ideal_Opcode();
 6208     int vec_enc = vector_length_encoding(this);
 6209     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6210                          $xtmp1$$XMMRegister, vec_enc);
 6211   %}
 6212   ins_pipe( pipe_slow );
 6213 %}
 6214 
 6215 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6216   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6217   match(Set dst (SignumVF src (Binary zero one)));
 6218   match(Set dst (SignumVD src (Binary zero one)));
 6219   effect(TEMP dst, TEMP ktmp1);
 6220   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6221   ins_encode %{
 6222     int opcode = this->ideal_Opcode();
 6223     int vec_enc = vector_length_encoding(this);
 6224     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6225                           $ktmp1$$KRegister, vec_enc);
 6226   %}
 6227   ins_pipe( pipe_slow );
 6228 %}
 6229 
 6230 // ---------------------------------------
 6231 // For copySign use 0xE4 as the imm8 truth-table selector for vpternlog
 6232 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6233 // C (xmm2) is set to 0x7FFFFFFF
 6234 // Wherever xmm2 is 0 (the sign bit), we want to pick from B (the sign source)
 6235 // Wherever xmm2 is 1 (the magnitude bits), we want to pick from A (the magnitude source)
 6236 //
 6237 // A B C Result
 6238 // 0 0 0 0
 6239 // 0 0 1 0
 6240 // 0 1 0 1
 6241 // 0 1 1 0
 6242 // 1 0 0 0
 6243 // 1 0 1 1
 6244 // 1 1 0 1
 6245 // 1 1 1 1
 6246 //
 6247 // Result, read from the high bit (A=B=C=1) down to the low bit, is 0b11100100 = 0xE4
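      // In the patterns below: A = $dst (the magnitude source), B = $src (the sign
      // source), C = $tmp1 (loaded with the mask).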
 6248 // ---------------------------------------
 6249 
 6250 #ifdef _LP64
 6251 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6252   match(Set dst (CopySignF dst src));
 6253   effect(TEMP tmp1, TEMP tmp2);
 6254   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6255   ins_encode %{
 6256     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6257     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6258     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6259   %}
 6260   ins_pipe( pipe_slow );
 6261 %}
 6262 
 6263 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6264   match(Set dst (CopySignD dst (Binary src zero)));
 6265   ins_cost(100);
 6266   effect(TEMP tmp1, TEMP tmp2);
 6267   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6268   ins_encode %{
 6269     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6270     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6271     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6272   %}
 6273   ins_pipe( pipe_slow );
 6274 %}
 6275 
 6276 #endif // _LP64
 6277 
 6278 //----------------------------- CompressBits/ExpandBits ------------------------
 6279 
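      // These map directly to the BMI2 pext/pdep instructions.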
 6280 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6281   predicate(n->bottom_type()->isa_int());
 6282   match(Set dst (CompressBits src mask));
 6283   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6284   ins_encode %{
 6285     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6286   %}
 6287   ins_pipe( pipe_slow );
 6288 %}
 6289 
 6290 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6291   predicate(n->bottom_type()->isa_int());
 6292   match(Set dst (ExpandBits src mask));
 6293   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6294   ins_encode %{
 6295     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6296   %}
 6297   ins_pipe( pipe_slow );
 6298 %}
 6299 
 6300 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6301   predicate(n->bottom_type()->isa_int());
 6302   match(Set dst (CompressBits src (LoadI mask)));
 6303   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6304   ins_encode %{
 6305     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6306   %}
 6307   ins_pipe( pipe_slow );
 6308 %}
 6309 
 6310 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6311   predicate(n->bottom_type()->isa_int());
 6312   match(Set dst (ExpandBits src (LoadI mask)));
 6313   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6314   ins_encode %{
 6315     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6316   %}
 6317   ins_pipe( pipe_slow );
 6318 %}
 6319 
 6320 // --------------------------------- Sqrt --------------------------------------
 6321 
 6322 instruct vsqrtF_reg(vec dst, vec src) %{
 6323   match(Set dst (SqrtVF src));
 6324   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6325   ins_encode %{
 6326     assert(UseAVX > 0, "required");
 6327     int vlen_enc = vector_length_encoding(this);
 6328     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6329   %}
 6330   ins_pipe( pipe_slow );
 6331 %}
 6332 
 6333 instruct vsqrtF_mem(vec dst, memory mem) %{
 6334   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6335   match(Set dst (SqrtVF (LoadVector mem)));
 6336   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6337   ins_encode %{
 6338     assert(UseAVX > 0, "required");
 6339     int vlen_enc = vector_length_encoding(this);
 6340     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6341   %}
 6342   ins_pipe( pipe_slow );
 6343 %}
 6344 
 6345 // Floating point vector sqrt
 6346 instruct vsqrtD_reg(vec dst, vec src) %{
 6347   match(Set dst (SqrtVD src));
 6348   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6349   ins_encode %{
 6350     assert(UseAVX > 0, "required");
 6351     int vlen_enc = vector_length_encoding(this);
 6352     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6353   %}
 6354   ins_pipe( pipe_slow );
 6355 %}
 6356 
 6357 instruct vsqrtD_mem(vec dst, memory mem) %{
 6358   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6359   match(Set dst (SqrtVD (LoadVector mem)));
 6360   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6361   ins_encode %{
 6362     assert(UseAVX > 0, "required");
 6363     int vlen_enc = vector_length_encoding(this);
 6364     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6365   %}
 6366   ins_pipe( pipe_slow );
 6367 %}
 6368 
 6369 // ------------------------------ Shift ---------------------------------------
 6370 
 6371 // Left and right shift count vectors are the same on x86
 6372 // (only lowest bits of xmm reg are used for count).
 6373 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6374   match(Set dst (LShiftCntV cnt));
 6375   match(Set dst (RShiftCntV cnt));
 6376   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6377   ins_encode %{
 6378     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6379   %}
 6380   ins_pipe( pipe_slow );
 6381 %}
 6382 
 6383 // Byte vector shift
 6384 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6385   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6386   match(Set dst ( LShiftVB src shift));
 6387   match(Set dst ( RShiftVB src shift));
 6388   match(Set dst (URShiftVB src shift));
 6389   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6390   format %{"vector_byte_shift $dst,$src,$shift" %}
 6391   ins_encode %{
 6392     assert(UseSSE > 3, "required");
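          // There is no packed byte shift; widen to words, shift, then mask each result
          // back to a byte and repack.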
 6393     int opcode = this->ideal_Opcode();
 6394     bool sign = (opcode != Op_URShiftVB);
 6395     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6396     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6397     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6398     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6399     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6400   %}
 6401   ins_pipe( pipe_slow );
 6402 %}
 6403 
 6404 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6405   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6406             UseAVX <= 1);
 6407   match(Set dst ( LShiftVB src shift));
 6408   match(Set dst ( RShiftVB src shift));
 6409   match(Set dst (URShiftVB src shift));
 6410   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6411   format %{"vector_byte_shift $dst,$src,$shift" %}
 6412   ins_encode %{
 6413     assert(UseSSE > 3, "required");
 6414     int opcode = this->ideal_Opcode();
 6415     bool sign = (opcode != Op_URShiftVB);
 6416     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6417     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6418     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6419     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6420     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6421     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6422     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6423     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6424     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6425   %}
 6426   ins_pipe( pipe_slow );
 6427 %}
 6428 
 6429 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6430   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6431             UseAVX > 1);
 6432   match(Set dst ( LShiftVB src shift));
 6433   match(Set dst ( RShiftVB src shift));
 6434   match(Set dst (URShiftVB src shift));
 6435   effect(TEMP dst, TEMP tmp);
 6436   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6437   ins_encode %{
 6438     int opcode = this->ideal_Opcode();
 6439     bool sign = (opcode != Op_URShiftVB);
 6440     int vlen_enc = Assembler::AVX_256bit;
 6441     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6442     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6443     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6444     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6445     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6446   %}
 6447   ins_pipe( pipe_slow );
 6448 %}
 6449 
 6450 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6451   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6452   match(Set dst ( LShiftVB src shift));
 6453   match(Set dst ( RShiftVB src shift));
 6454   match(Set dst (URShiftVB src shift));
 6455   effect(TEMP dst, TEMP tmp);
 6456   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6457   ins_encode %{
 6458     assert(UseAVX > 1, "required");
 6459     int opcode = this->ideal_Opcode();
 6460     bool sign = (opcode != Op_URShiftVB);
 6461     int vlen_enc = Assembler::AVX_256bit;
 6462     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6463     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6464     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6465     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6466     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6467     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6468     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6469     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6470     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6471   %}
 6472   ins_pipe( pipe_slow );
 6473 %}
 6474 
 6475 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6476   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6477   match(Set dst ( LShiftVB src shift));
 6478   match(Set dst  (RShiftVB src shift));
 6479   match(Set dst (URShiftVB src shift));
 6480   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6481   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6482   ins_encode %{
 6483     assert(UseAVX > 2, "required");
 6484     int opcode = this->ideal_Opcode();
 6485     bool sign = (opcode != Op_URShiftVB);
 6486     int vlen_enc = Assembler::AVX_512bit;
 6487     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6488     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6489     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6490     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6491     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6492     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6493     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6494     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6495     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6496     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6497     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6498     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6499   %}
 6500   ins_pipe( pipe_slow );
 6501 %}
 6502 
 6503 // A logical right shift of a short vector produces an incorrect Java result
 6504 // for negative data because Java code converts the short value into an int
 6505 // with sign extension before shifting. Char vectors are fine since chars are
 6506 // unsigned values.
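      // Illustrative example (plain Java scalar semantics, given here only for clarity):
      //   short s = (short) 0xFF00;        // -256
      //   short r = (short) (s >>> 4);     // s widens to 0xFFFFFF00, so r == (short) 0xFFF0
      // whereas a plain 16-bit logical shift (psrlw) of 0xFF00 by 4 yields 0x0FF0.
      // Chars are zero-extended when widened, so both forms agree for char data.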
 6507 // Shorts/Chars vector shift
 6508 instruct vshiftS(vec dst, vec src, vec shift) %{
 6509   predicate(!n->as_ShiftV()->is_var_shift());
 6510   match(Set dst ( LShiftVS src shift));
 6511   match(Set dst ( RShiftVS src shift));
 6512   match(Set dst (URShiftVS src shift));
 6513   effect(TEMP dst, USE src, USE shift);
 6514   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6515   ins_encode %{
 6516     int opcode = this->ideal_Opcode();
 6517     if (UseAVX > 0) {
 6518       int vlen_enc = vector_length_encoding(this);
 6519       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6520     } else {
 6521       int vlen = Matcher::vector_length(this);
 6522       if (vlen == 2) {
 6523         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6524         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6525       } else if (vlen == 4) {
 6526         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6527         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6528       } else {
 6529         assert (vlen == 8, "sanity");
 6530         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6531         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6532       }
 6533     }
 6534   %}
 6535   ins_pipe( pipe_slow );
 6536 %}
 6537 
 6538 // Integers vector shift
 6539 instruct vshiftI(vec dst, vec src, vec shift) %{
 6540   predicate(!n->as_ShiftV()->is_var_shift());
 6541   match(Set dst ( LShiftVI src shift));
 6542   match(Set dst ( RShiftVI src shift));
 6543   match(Set dst (URShiftVI src shift));
 6544   effect(TEMP dst, USE src, USE shift);
 6545   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6546   ins_encode %{
 6547     int opcode = this->ideal_Opcode();
 6548     if (UseAVX > 0) {
 6549       int vlen_enc = vector_length_encoding(this);
 6550       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6551     } else {
 6552       int vlen = Matcher::vector_length(this);
 6553       if (vlen == 2) {
 6554         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6555         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6556       } else {
 6557         assert(vlen == 4, "sanity");
 6558         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6559         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6560       }
 6561     }
 6562   %}
 6563   ins_pipe( pipe_slow );
 6564 %}
 6565 
 6566 // Integers vector constant shift
 6567 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6568   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6569   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6570   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6571   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6572   ins_encode %{
 6573     int opcode = this->ideal_Opcode();
 6574     if (UseAVX > 0) {
 6575       int vector_len = vector_length_encoding(this);
 6576       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6577     } else {
 6578       int vlen = Matcher::vector_length(this);
 6579       if (vlen == 2) {
 6580         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6581         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6582       } else {
 6583         assert(vlen == 4, "sanity");
 6584         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6585         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6586       }
 6587     }
 6588   %}
 6589   ins_pipe( pipe_slow );
 6590 %}
 6591 
 6592 // Longs vector shift
 6593 instruct vshiftL(vec dst, vec src, vec shift) %{
 6594   predicate(!n->as_ShiftV()->is_var_shift());
 6595   match(Set dst ( LShiftVL src shift));
 6596   match(Set dst (URShiftVL src shift));
 6597   effect(TEMP dst, USE src, USE shift);
 6598   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6599   ins_encode %{
 6600     int opcode = this->ideal_Opcode();
 6601     if (UseAVX > 0) {
 6602       int vlen_enc = vector_length_encoding(this);
 6603       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6604     } else {
 6605       assert(Matcher::vector_length(this) == 2, "");
 6606       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6607       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6608     }
 6609   %}
 6610   ins_pipe( pipe_slow );
 6611 %}
 6612 
 6613 // Longs vector constant shift
 6614 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6615   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6616   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6617   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6618   ins_encode %{
 6619     int opcode = this->ideal_Opcode();
 6620     if (UseAVX > 0) {
 6621       int vector_len = vector_length_encoding(this);
 6622       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6623     } else {
 6624       assert(Matcher::vector_length(this) == 2, "");
 6625       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6626       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6627     }
 6628   %}
 6629   ins_pipe( pipe_slow );
 6630 %}
 6631 
 6632 // -------------------ArithmeticRightShift -----------------------------------
 6633 // Long vector arithmetic right shift
 6634 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6635   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6636   match(Set dst (RShiftVL src shift));
 6637   effect(TEMP dst, TEMP tmp);
 6638   format %{ "vshiftq $dst,$src,$shift" %}
 6639   ins_encode %{
 6640     uint vlen = Matcher::vector_length(this);
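          // Neither SSE nor AVX2 has an arithmetic right shift for 64-bit lanes, so it
          // is emulated as ((x >>> n) ^ m) - m with m = (0x8000000000000000 >>> n) per
          // lane; the xor/subtract pair re-extends the original sign bit.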
 6641     if (vlen == 2) {
 6642       assert(UseSSE >= 2, "required");
 6643       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6644       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6645       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6646       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6647       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6648       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6649     } else {
 6650       assert(vlen == 4, "sanity");
 6651       assert(UseAVX > 1, "required");
 6652       int vlen_enc = Assembler::AVX_256bit;
 6653       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6654       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6655       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6656       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6657       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6658     }
 6659   %}
 6660   ins_pipe( pipe_slow );
 6661 %}
 6662 
 6663 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6664   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6665   match(Set dst (RShiftVL src shift));
 6666   format %{ "vshiftq $dst,$src,$shift" %}
 6667   ins_encode %{
 6668     int vlen_enc = vector_length_encoding(this);
 6669     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6670   %}
 6671   ins_pipe( pipe_slow );
 6672 %}
 6673 
 6674 // ------------------- Variable Shift -----------------------------
 6675 // Byte variable shift
 6676 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6677   predicate(Matcher::vector_length(n) <= 8 &&
 6678             n->as_ShiftV()->is_var_shift() &&
 6679             !VM_Version::supports_avx512bw());
 6680   match(Set dst ( LShiftVB src shift));
 6681   match(Set dst ( RShiftVB src shift));
 6682   match(Set dst (URShiftVB src shift));
 6683   effect(TEMP dst, TEMP vtmp);
 6684   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6685   ins_encode %{
 6686     assert(UseAVX >= 2, "required");
 6687 
 6688     int opcode = this->ideal_Opcode();
 6689     int vlen_enc = Assembler::AVX_128bit;
 6690     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6691     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6692   %}
 6693   ins_pipe( pipe_slow );
 6694 %}
 6695 
 6696 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6697   predicate(Matcher::vector_length(n) == 16 &&
 6698             n->as_ShiftV()->is_var_shift() &&
 6699             !VM_Version::supports_avx512bw());
 6700   match(Set dst ( LShiftVB src shift));
 6701   match(Set dst ( RShiftVB src shift));
 6702   match(Set dst (URShiftVB src shift));
 6703   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6704   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6705   ins_encode %{
 6706     assert(UseAVX >= 2, "required");
 6707 
 6708     int opcode = this->ideal_Opcode();
 6709     int vlen_enc = Assembler::AVX_128bit;
 6710     // Shift lower half and get word result in dst
 6711     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6712 
 6713     // Shift upper half and get word result in vtmp1
 6714     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6715     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6716     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6717 
 6718     // Merge and down convert the two word results to byte in dst
 6719     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6720   %}
 6721   ins_pipe( pipe_slow );
 6722 %}
 6723 
 6724 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6725   predicate(Matcher::vector_length(n) == 32 &&
 6726             n->as_ShiftV()->is_var_shift() &&
 6727             !VM_Version::supports_avx512bw());
 6728   match(Set dst ( LShiftVB src shift));
 6729   match(Set dst ( RShiftVB src shift));
 6730   match(Set dst (URShiftVB src shift));
 6731   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 6732   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6733   ins_encode %{
 6734     assert(UseAVX >= 2, "required");
 6735 
 6736     int opcode = this->ideal_Opcode();
 6737     int vlen_enc = Assembler::AVX_128bit;
 6738     // Process lower 128 bits and get result in dst
 6739     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6740     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6741     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6742     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6743     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6744 
 6745     // Process higher 128 bits and get result in vtmp3
 6746     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6747     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6748     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6749     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6750     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6751     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6752     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6753 
 6754     // Merge the two results in dst
 6755     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6756   %}
 6757   ins_pipe( pipe_slow );
 6758 %}
 6759 
 6760 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6761   predicate(Matcher::vector_length(n) <= 32 &&
 6762             n->as_ShiftV()->is_var_shift() &&
 6763             VM_Version::supports_avx512bw());
 6764   match(Set dst ( LShiftVB src shift));
 6765   match(Set dst ( RShiftVB src shift));
 6766   match(Set dst (URShiftVB src shift));
 6767   effect(TEMP dst, TEMP vtmp);
 6768   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6769   ins_encode %{
 6770     assert(UseAVX > 2, "required");
 6771 
 6772     int opcode = this->ideal_Opcode();
 6773     int vlen_enc = vector_length_encoding(this);
 6774     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6775   %}
 6776   ins_pipe( pipe_slow );
 6777 %}
 6778 
 6779 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6780   predicate(Matcher::vector_length(n) == 64 &&
 6781             n->as_ShiftV()->is_var_shift() &&
 6782             VM_Version::supports_avx512bw());
 6783   match(Set dst ( LShiftVB src shift));
 6784   match(Set dst ( RShiftVB src shift));
 6785   match(Set dst (URShiftVB src shift));
 6786   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6787   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6788   ins_encode %{
 6789     assert(UseAVX > 2, "required");
 6790 
 6791     int opcode = this->ideal_Opcode();
 6792     int vlen_enc = Assembler::AVX_256bit;
 6793     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6794     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6795     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6796     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6797     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6798   %}
 6799   ins_pipe( pipe_slow );
 6800 %}
 6801 
 6802 // Short variable shift
 6803 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6804   predicate(Matcher::vector_length(n) <= 8 &&
 6805             n->as_ShiftV()->is_var_shift() &&
 6806             !VM_Version::supports_avx512bw());
 6807   match(Set dst ( LShiftVS src shift));
 6808   match(Set dst ( RShiftVS src shift));
 6809   match(Set dst (URShiftVS src shift));
 6810   effect(TEMP dst, TEMP vtmp);
 6811   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6812   ins_encode %{
 6813     assert(UseAVX >= 2, "required");
 6814 
 6815     int opcode = this->ideal_Opcode();
 6816     bool sign = (opcode != Op_URShiftVS);
 6817     int vlen_enc = Assembler::AVX_256bit;
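          // There is no variable 16-bit shift below AVX-512BW: widen the shorts to
          // dwords, apply AVX2's per-element dword shifts, mask back to 16 bits and
          // pack the result down to shorts.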
 6818     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 6819     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 6820     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6821     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6822     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6823     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6824   %}
 6825   ins_pipe( pipe_slow );
 6826 %}
 6827 
 6828 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6829   predicate(Matcher::vector_length(n) == 16 &&
 6830             n->as_ShiftV()->is_var_shift() &&
 6831             !VM_Version::supports_avx512bw());
 6832   match(Set dst ( LShiftVS src shift));
 6833   match(Set dst ( RShiftVS src shift));
 6834   match(Set dst (URShiftVS src shift));
 6835   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6836   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6837   ins_encode %{
 6838     assert(UseAVX >= 2, "required");
 6839 
 6840     int opcode = this->ideal_Opcode();
 6841     bool sign = (opcode != Op_URShiftVS);
 6842     int vlen_enc = Assembler::AVX_256bit;
 6843     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6844     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6845     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6846     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6847     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6848 
 6849     // Shift upper half, with result in dst using vtmp1 as TEMP
 6850     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6851     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6852     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6853     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6854     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6855     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6856 
 6857     // Merge lower and upper half result into dst
 6858     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6859     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6860   %}
 6861   ins_pipe( pipe_slow );
 6862 %}
 6863 
 6864 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6865   predicate(n->as_ShiftV()->is_var_shift() &&
 6866             VM_Version::supports_avx512bw());
 6867   match(Set dst ( LShiftVS src shift));
 6868   match(Set dst ( RShiftVS src shift));
 6869   match(Set dst (URShiftVS src shift));
 6870   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6871   ins_encode %{
 6872     assert(UseAVX > 2, "required");
 6873 
 6874     int opcode = this->ideal_Opcode();
 6875     int vlen_enc = vector_length_encoding(this);
 6876     if (!VM_Version::supports_avx512vl()) {
 6877       vlen_enc = Assembler::AVX_512bit;
 6878     }
 6879     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6880   %}
 6881   ins_pipe( pipe_slow );
 6882 %}
 6883 
 6884 // Integer variable shift
 6885 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6886   predicate(n->as_ShiftV()->is_var_shift());
 6887   match(Set dst ( LShiftVI src shift));
 6888   match(Set dst ( RShiftVI src shift));
 6889   match(Set dst (URShiftVI src shift));
 6890   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6891   ins_encode %{
 6892     assert(UseAVX >= 2, "required");
 6893 
 6894     int opcode = this->ideal_Opcode();
 6895     int vlen_enc = vector_length_encoding(this);
 6896     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6897   %}
 6898   ins_pipe( pipe_slow );
 6899 %}
 6900 
 6901 // Long variable shift
 6902 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6903   predicate(n->as_ShiftV()->is_var_shift());
 6904   match(Set dst ( LShiftVL src shift));
 6905   match(Set dst (URShiftVL src shift));
 6906   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6907   ins_encode %{
 6908     assert(UseAVX >= 2, "required");
 6909 
 6910     int opcode = this->ideal_Opcode();
 6911     int vlen_enc = vector_length_encoding(this);
 6912     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6913   %}
 6914   ins_pipe( pipe_slow );
 6915 %}
 6916 
 6917 // Long variable arithmetic right shift
 6918 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6919   predicate(Matcher::vector_length(n) <= 4 &&
 6920             n->as_ShiftV()->is_var_shift() &&
 6921             UseAVX == 2);
 6922   match(Set dst (RShiftVL src shift));
 6923   effect(TEMP dst, TEMP vtmp);
 6924   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6925   ins_encode %{
 6926     int opcode = this->ideal_Opcode();
 6927     int vlen_enc = vector_length_encoding(this);
 6928     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6929                  $vtmp$$XMMRegister);
 6930   %}
 6931   ins_pipe( pipe_slow );
 6932 %}
 6933 
 6934 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6935   predicate(n->as_ShiftV()->is_var_shift() &&
 6936             UseAVX > 2);
 6937   match(Set dst (RShiftVL src shift));
 6938   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6939   ins_encode %{
 6940     int opcode = this->ideal_Opcode();
 6941     int vlen_enc = vector_length_encoding(this);
 6942     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6943   %}
 6944   ins_pipe( pipe_slow );
 6945 %}
 6946 
 6947 // --------------------------------- AND --------------------------------------
 6948 
 6949 instruct vand(vec dst, vec src) %{
 6950   predicate(UseAVX == 0);
 6951   match(Set dst (AndV dst src));
 6952   format %{ "pand    $dst,$src\t! and vectors" %}
 6953   ins_encode %{
 6954     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6955   %}
 6956   ins_pipe( pipe_slow );
 6957 %}
 6958 
 6959 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6960   predicate(UseAVX > 0);
 6961   match(Set dst (AndV src1 src2));
 6962   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6963   ins_encode %{
 6964     int vlen_enc = vector_length_encoding(this);
 6965     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6966   %}
 6967   ins_pipe( pipe_slow );
 6968 %}
 6969 
 6970 instruct vand_mem(vec dst, vec src, memory mem) %{
 6971   predicate((UseAVX > 0) &&
 6972             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6973   match(Set dst (AndV src (LoadVector mem)));
 6974   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6975   ins_encode %{
 6976     int vlen_enc = vector_length_encoding(this);
 6977     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6978   %}
 6979   ins_pipe( pipe_slow );
 6980 %}
 6981 
 6982 // --------------------------------- OR ---------------------------------------
 6983 
 6984 instruct vor(vec dst, vec src) %{
 6985   predicate(UseAVX == 0);
 6986   match(Set dst (OrV dst src));
 6987   format %{ "por     $dst,$src\t! or vectors" %}
 6988   ins_encode %{
 6989     __ por($dst$$XMMRegister, $src$$XMMRegister);
 6990   %}
 6991   ins_pipe( pipe_slow );
 6992 %}
 6993 
 6994 instruct vor_reg(vec dst, vec src1, vec src2) %{
 6995   predicate(UseAVX > 0);
 6996   match(Set dst (OrV src1 src2));
 6997   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 6998   ins_encode %{
 6999     int vlen_enc = vector_length_encoding(this);
 7000     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7001   %}
 7002   ins_pipe( pipe_slow );
 7003 %}
 7004 
 7005 instruct vor_mem(vec dst, vec src, memory mem) %{
 7006   predicate((UseAVX > 0) &&
 7007             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7008   match(Set dst (OrV src (LoadVector mem)));
 7009   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7010   ins_encode %{
 7011     int vlen_enc = vector_length_encoding(this);
 7012     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7013   %}
 7014   ins_pipe( pipe_slow );
 7015 %}
 7016 
 7017 // --------------------------------- XOR --------------------------------------
 7018 
 7019 instruct vxor(vec dst, vec src) %{
 7020   predicate(UseAVX == 0);
 7021   match(Set dst (XorV dst src));
 7022   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7023   ins_encode %{
 7024     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7025   %}
 7026   ins_pipe( pipe_slow );
 7027 %}
 7028 
 7029 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7030   predicate(UseAVX > 0);
 7031   match(Set dst (XorV src1 src2));
 7032   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7033   ins_encode %{
 7034     int vlen_enc = vector_length_encoding(this);
 7035     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7036   %}
 7037   ins_pipe( pipe_slow );
 7038 %}
 7039 
 7040 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7041   predicate((UseAVX > 0) &&
 7042             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7043   match(Set dst (XorV src (LoadVector mem)));
 7044   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7045   ins_encode %{
 7046     int vlen_enc = vector_length_encoding(this);
 7047     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7048   %}
 7049   ins_pipe( pipe_slow );
 7050 %}
 7051 
 7052 // --------------------------------- VectorCast --------------------------------------
 7053 
 7054 instruct vcastBtoX(vec dst, vec src) %{
 7055   match(Set dst (VectorCastB2X src));
 7056   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7057   ins_encode %{
 7058     assert(UseAVX > 0, "required");
 7059 
 7060     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7061     int vlen_enc = vector_length_encoding(this);
 7062     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7063   %}
 7064   ins_pipe( pipe_slow );
 7065 %}
 7066 
 7067 instruct castStoX(vec dst, vec src) %{
 7068   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7069             Matcher::vector_length(n->in(1)) <= 8 && // src
 7070             Matcher::vector_element_basic_type(n) == T_BYTE);
 7071   match(Set dst (VectorCastS2X src));
 7072   format %{ "vector_cast_s2x $dst,$src" %}
 7073   ins_encode %{
 7074     assert(UseAVX > 0, "required");
 7075 
 7076     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7077     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7078   %}
 7079   ins_pipe( pipe_slow );
 7080 %}
 7081 
 7082 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7083   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7084             Matcher::vector_length(n->in(1)) == 16 && // src
 7085             Matcher::vector_element_basic_type(n) == T_BYTE);
 7086   effect(TEMP dst, TEMP vtmp);
 7087   match(Set dst (VectorCastS2X src));
 7088   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7089   ins_encode %{
 7090     assert(UseAVX > 0, "required");
 7091 
 7092     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7093     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7094     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7095     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7096   %}
 7097   ins_pipe( pipe_slow );
 7098 %}
 7099 
 7100 instruct vcastStoX_evex(vec dst, vec src) %{
 7101   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7102             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7103   match(Set dst (VectorCastS2X src));
 7104   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7105   ins_encode %{
 7106     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7107     int src_vlen_enc = vector_length_encoding(this, $src);
 7108     int vlen_enc = vector_length_encoding(this);
 7109     switch (to_elem_bt) {
 7110       case T_BYTE:
 7111         if (!VM_Version::supports_avx512vl()) {
 7112           vlen_enc = Assembler::AVX_512bit;
 7113         }
 7114         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7115         break;
 7116       case T_INT:
 7117         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7118         break;
 7119       case T_FLOAT:
 7120         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7121         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7122         break;
 7123       case T_LONG:
 7124         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7125         break;
 7126       case T_DOUBLE: {
 7127         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7128         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7129         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7130         break;
 7131       }
 7132       default:
 7133         ShouldNotReachHere();
 7134     }
 7135   %}
 7136   ins_pipe( pipe_slow );
 7137 %}
 7138 
 7139 instruct castItoX(vec dst, vec src) %{
 7140   predicate(UseAVX <= 2 &&
 7141             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7142             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7143   match(Set dst (VectorCastI2X src));
 7144   format %{ "vector_cast_i2x $dst,$src" %}
 7145   ins_encode %{
 7146     assert(UseAVX > 0, "required");
 7147 
 7148     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7149     int vlen_enc = vector_length_encoding(this, $src);
 7150 
 7151     if (to_elem_bt == T_BYTE) {
 7152       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7153       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7154       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7155     } else {
 7156       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7157       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7158       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7159     }
 7160   %}
 7161   ins_pipe( pipe_slow );
 7162 %}
 7163 
 7164 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7165   predicate(UseAVX <= 2 &&
 7166             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7167             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7168   match(Set dst (VectorCastI2X src));
 7169   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7170   effect(TEMP dst, TEMP vtmp);
 7171   ins_encode %{
 7172     assert(UseAVX > 0, "required");
 7173 
 7174     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7175     int vlen_enc = vector_length_encoding(this, $src);
 7176 
 7177     if (to_elem_bt == T_BYTE) {
 7178       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7179       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7180       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7181       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7182     } else {
 7183       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7184       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7185       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7186       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7187     }
 7188   %}
 7189   ins_pipe( pipe_slow );
 7190 %}
 7191 
 7192 instruct vcastItoX_evex(vec dst, vec src) %{
 7193   predicate(UseAVX > 2 ||
 7194             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7195   match(Set dst (VectorCastI2X src));
 7196   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7197   ins_encode %{
 7198     assert(UseAVX > 0, "required");
 7199 
 7200     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7201     int src_vlen_enc = vector_length_encoding(this, $src);
 7202     int dst_vlen_enc = vector_length_encoding(this);
 7203     switch (dst_elem_bt) {
 7204       case T_BYTE:
 7205         if (!VM_Version::supports_avx512vl()) {
 7206           src_vlen_enc = Assembler::AVX_512bit;
 7207         }
 7208         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7209         break;
 7210       case T_SHORT:
 7211         if (!VM_Version::supports_avx512vl()) {
 7212           src_vlen_enc = Assembler::AVX_512bit;
 7213         }
 7214         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7215         break;
 7216       case T_FLOAT:
 7217         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7218         break;
 7219       case T_LONG:
 7220         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7221         break;
 7222       case T_DOUBLE:
 7223         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7224         break;
 7225       default:
 7226         ShouldNotReachHere();
 7227     }
 7228   %}
 7229   ins_pipe( pipe_slow );
 7230 %}
 7231 
 7232 instruct vcastLtoBS(vec dst, vec src) %{
 7233   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7234             UseAVX <= 2);
 7235   match(Set dst (VectorCastL2X src));
 7236   format %{ "vector_cast_l2x  $dst,$src" %}
 7237   ins_encode %{
 7238     assert(UseAVX > 0, "required");
 7239 
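          // Gather the low dword of each long into the low part of the register
          // (shuffle selector 8 = 0b00_00_10_00), then mask and pack those dwords
          // down to the requested byte/short element size.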
 7240     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7241     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7242     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7243                                                       : ExternalAddress(vector_int_to_short_mask());
 7244     if (vlen <= 16) {
 7245       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7246       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7247       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7248     } else {
 7249       assert(vlen <= 32, "required");
 7250       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7251       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7252       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7253       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7254     }
 7255     if (to_elem_bt == T_BYTE) {
 7256       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7257     }
 7258   %}
 7259   ins_pipe( pipe_slow );
 7260 %}
 7261 
 7262 instruct vcastLtoX_evex(vec dst, vec src) %{
 7263   predicate(UseAVX > 2 ||
 7264             (Matcher::vector_element_basic_type(n) == T_INT ||
 7265              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7266              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7267   match(Set dst (VectorCastL2X src));
 7268   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7269   ins_encode %{
 7270     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7271     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7272     int vlen_enc = vector_length_encoding(this, $src);
 7273     switch (to_elem_bt) {
 7274       case T_BYTE:
 7275         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7276           vlen_enc = Assembler::AVX_512bit;
 7277         }
 7278         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7279         break;
 7280       case T_SHORT:
 7281         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7282           vlen_enc = Assembler::AVX_512bit;
 7283         }
 7284         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7285         break;
 7286       case T_INT:
 7287         if (vlen == 8) {
 7288           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7289             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7290           }
 7291         } else if (vlen == 16) {
 7292           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7293         } else if (vlen == 32) {
 7294           if (UseAVX > 2) {
 7295             if (!VM_Version::supports_avx512vl()) {
 7296               vlen_enc = Assembler::AVX_512bit;
 7297             }
 7298             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7299           } else {
 7300             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7301             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7302           }
 7303         } else { // vlen == 64
 7304           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7305         }
 7306         break;
 7307       case T_FLOAT:
 7308         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7309         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7310         break;
 7311       case T_DOUBLE:
 7312         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7313         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7314         break;
 7315 
 7316       default: assert(false, "%s", type2name(to_elem_bt));
 7317     }
 7318   %}
 7319   ins_pipe( pipe_slow );
 7320 %}
 7321 
 7322 instruct vcastFtoD_reg(vec dst, vec src) %{
 7323   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7324   match(Set dst (VectorCastF2X src));
 7325   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7326   ins_encode %{
 7327     int vlen_enc = vector_length_encoding(this);
 7328     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7329   %}
 7330   ins_pipe( pipe_slow );
 7331 %}
 7332 
 7333 
 7334 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7335   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7336             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7337   match(Set dst (VectorCastF2X src));
 7338   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7339   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7340   ins_encode %{
 7341     int vlen_enc = vector_length_encoding(this, $src);
 7342     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7343     // JDK-8292878 removed the need for an explicit scratch register to load addresses
 7344     // wider than 32 bits in register-indirect addressing mode, since stub constants are
 7345     // part of the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7346     // Targets are free to raise this limit, but a code cache larger than 2G is
 7347     // unrealistic in practice; on the other hand, the cap lets us avoid allocating a
 7348     // temporary register, which in the limiting case can prevent spilling in blocks
 7349     // with high register pressure.
 7350     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7351                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7352                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7353   %}
 7354   ins_pipe( pipe_slow );
 7355 %}
 7356 
 7357 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7358   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7359             is_integral_type(Matcher::vector_element_basic_type(n)));
 7360   match(Set dst (VectorCastF2X src));
 7361   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7362   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7363   ins_encode %{
 7364     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7365     if (to_elem_bt == T_LONG) {
 7366       int vlen_enc = vector_length_encoding(this);
 7367       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7368                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7369                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7370     } else {
 7371       int vlen_enc = vector_length_encoding(this, $src);
 7372       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7373                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7374                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7375     }
 7376   %}
 7377   ins_pipe( pipe_slow );
 7378 %}
 7379 
 7380 instruct vcastDtoF_reg(vec dst, vec src) %{
 7381   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7382   match(Set dst (VectorCastD2X src));
 7383   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7384   ins_encode %{
 7385     int vlen_enc = vector_length_encoding(this, $src);
 7386     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7387   %}
 7388   ins_pipe( pipe_slow );
 7389 %}
 7390 
 7391 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7392   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7393             is_integral_type(Matcher::vector_element_basic_type(n)));
 7394   match(Set dst (VectorCastD2X src));
 7395   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7396   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7397   ins_encode %{
 7398     int vlen_enc = vector_length_encoding(this, $src);
 7399     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7400     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7401                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7402                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7403   %}
 7404   ins_pipe( pipe_slow );
 7405 %}
 7406 
 7407 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7408   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7409             is_integral_type(Matcher::vector_element_basic_type(n)));
 7410   match(Set dst (VectorCastD2X src));
 7411   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7412   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7413   ins_encode %{
 7414     int vlen_enc = vector_length_encoding(this, $src);
 7415     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7416     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7417                               ExternalAddress(vector_float_signflip());
 7418     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7419                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7420   %}
 7421   ins_pipe( pipe_slow );
 7422 %}
 7423 
 7424 instruct vucast(vec dst, vec src) %{
 7425   match(Set dst (VectorUCastB2X src));
 7426   match(Set dst (VectorUCastS2X src));
 7427   match(Set dst (VectorUCastI2X src));
 7428   format %{ "vector_ucast $dst,$src\t!" %}
 7429   ins_encode %{
 7430     assert(UseAVX > 0, "required");
 7431 
 7432     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7433     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7434     int vlen_enc = vector_length_encoding(this);
 7435     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7436   %}
 7437   ins_pipe( pipe_slow );
 7438 %}
 7439 
 7440 #ifdef _LP64
 7441 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7442   predicate(!VM_Version::supports_avx512vl() &&
 7443             Matcher::vector_length_in_bytes(n) < 64 &&
 7444             Matcher::vector_element_basic_type(n) == T_INT);
 7445   match(Set dst (RoundVF src));
 7446   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7447   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7448   ins_encode %{
 7449     int vlen_enc = vector_length_encoding(this);
 7450     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7451     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7452                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7453                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7454   %}
 7455   ins_pipe( pipe_slow );
 7456 %}
 7457 
 7458 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7459   predicate((VM_Version::supports_avx512vl() ||
 7460              Matcher::vector_length_in_bytes(n) == 64) &&
 7461              Matcher::vector_element_basic_type(n) == T_INT);
 7462   match(Set dst (RoundVF src));
 7463   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7464   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7465   ins_encode %{
 7466     int vlen_enc = vector_length_encoding(this);
 7467     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7468     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7469                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7470                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7471   %}
 7472   ins_pipe( pipe_slow );
 7473 %}
 7474 
 7475 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7476   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7477   match(Set dst (RoundVD src));
 7478   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7479   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7480   ins_encode %{
 7481     int vlen_enc = vector_length_encoding(this);
 7482     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7483     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7484                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7485                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7486   %}
 7487   ins_pipe( pipe_slow );
 7488 %}
 7489 
 7490 #endif // _LP64
 7491 
 7492 // --------------------------------- VectorMaskCmp --------------------------------------
 7493 
 7494 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7495   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7496             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7497             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7498             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7499   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7500   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7501   ins_encode %{
 7502     int vlen_enc = vector_length_encoding(this, $src1);
 7503     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7504     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7505       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7506     } else {
 7507       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7508     }
 7509   %}
 7510   ins_pipe( pipe_slow );
 7511 %}
 7512 
 7513 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7514   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7515             n->bottom_type()->isa_vectmask() == nullptr &&
 7516             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7517   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7518   effect(TEMP ktmp);
 7519   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7520   ins_encode %{
 7521     int vlen_enc = Assembler::AVX_512bit;
 7522     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7523     KRegister mask = k0; // The comparison itself is not being masked.
 7524     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7525       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7526       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7527     } else {
 7528       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7529       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7530     }
 7531   %}
 7532   ins_pipe( pipe_slow );
 7533 %}
 7534 
 7535 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7536   predicate(n->bottom_type()->isa_vectmask() &&
 7537             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7538   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7539   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7540   ins_encode %{
 7541     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7542     int vlen_enc = vector_length_encoding(this, $src1);
 7543     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7544     KRegister mask = k0; // The comparison itself is not being masked.
 7545     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7546       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7547     } else {
 7548       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7549     }
 7550   %}
 7551   ins_pipe( pipe_slow );
 7552 %}
 7553 
 7554 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7555   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7556             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7557             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7558             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7559             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7560             (n->in(2)->get_int() == BoolTest::eq ||
 7561              n->in(2)->get_int() == BoolTest::lt ||
 7562              n->in(2)->get_int() == BoolTest::gt)); // cond
 7563   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7564   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7565   ins_encode %{
 7566     int vlen_enc = vector_length_encoding(this, $src1);
 7567     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7568     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7569     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7570   %}
 7571   ins_pipe( pipe_slow );
 7572 %}
 7573 
 7574 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7575   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7576             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7577             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7578             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7579             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7580             (n->in(2)->get_int() == BoolTest::ne ||
 7581              n->in(2)->get_int() == BoolTest::le ||
 7582              n->in(2)->get_int() == BoolTest::ge)); // cond
 7583   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7584   effect(TEMP dst, TEMP xtmp);
 7585   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7586   ins_encode %{
 7587     int vlen_enc = vector_length_encoding(this, $src1);
 7588     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7589     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7590     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7591   %}
 7592   ins_pipe( pipe_slow );
 7593 %}
 7594 
 7595 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7596   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7597             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7598             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7599             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7600             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7601   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7602   effect(TEMP dst, TEMP xtmp);
 7603   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7604   ins_encode %{
 7605     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7606     int vlen_enc = vector_length_encoding(this, $src1);
 7607     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7608     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7609 
 7610     if (vlen_enc == Assembler::AVX_128bit) {
 7611       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7612     } else {
 7613       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7614     }
 7615     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7616     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7617     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7618   %}
 7619   ins_pipe( pipe_slow );
 7620 %}
 7621 
 7622 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7623   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7624              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7625              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7626   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7627   effect(TEMP ktmp);
 7628   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7629   ins_encode %{
 7630     assert(UseAVX > 2, "required");
 7631 
 7632     int vlen_enc = vector_length_encoding(this, $src1);
 7633     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7634     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7635     KRegister mask = k0; // The comparison itself is not being masked.
 7636     bool merge = false;
 7637     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7638 
 7639     switch (src1_elem_bt) {
 7640       case T_INT: {
 7641         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7642         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7643         break;
 7644       }
 7645       case T_LONG: {
 7646         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7647         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7648         break;
 7649       }
 7650       default: assert(false, "%s", type2name(src1_elem_bt));
 7651     }
 7652   %}
 7653   ins_pipe( pipe_slow );
 7654 %}
 7655 
 7656 
 7657 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7658   predicate(n->bottom_type()->isa_vectmask() &&
 7659             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7660   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7661   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7662   ins_encode %{
 7663     assert(UseAVX > 2, "required");
 7664     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7665 
 7666     int vlen_enc = vector_length_encoding(this, $src1);
 7667     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7668     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7669     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7670 
 7671     // Compare lane-wise, dispatching on the element type of src1.
 7672     switch (src1_elem_bt) {
 7673       case T_BYTE: {
 7674         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7675         break;
 7676       }
 7677       case T_SHORT: {
 7678         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7679         break;
 7680       }
 7681       case T_INT: {
 7682         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7683         break;
 7684       }
 7685       case T_LONG: {
 7686         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7687         break;
 7688       }
 7689       default: assert(false, "%s", type2name(src1_elem_bt));
 7690     }
 7691   %}
 7692   ins_pipe( pipe_slow );
 7693 %}
 7694 
 7695 // Extract
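      // Extract a single lane of a vector into a scalar or XMM register. The
      // lane index $idx is a compile-time constant; for vectors wider than
      // 128 bits the containing 128-bit lane is first isolated into a temporary.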
 7696 
 7697 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7698   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7699   match(Set dst (ExtractI src idx));
 7700   match(Set dst (ExtractS src idx));
 7701 #ifdef _LP64
 7702   match(Set dst (ExtractB src idx));
 7703 #endif
 7704   format %{ "extractI $dst,$src,$idx\t!" %}
 7705   ins_encode %{
 7706     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7707 
 7708     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7709     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7710   %}
 7711   ins_pipe( pipe_slow );
 7712 %}
 7713 
 7714 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7715   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7716             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7717   match(Set dst (ExtractI src idx));
 7718   match(Set dst (ExtractS src idx));
 7719 #ifdef _LP64
 7720   match(Set dst (ExtractB src idx));
 7721 #endif
 7722   effect(TEMP vtmp);
 7723   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7724   ins_encode %{
 7725     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7726 
 7727     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7728     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7729     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7730   %}
 7731   ins_pipe( pipe_slow );
 7732 %}
 7733 
 7734 #ifdef _LP64
 7735 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7736   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7737   match(Set dst (ExtractL src idx));
 7738   format %{ "extractL $dst,$src,$idx\t!" %}
 7739   ins_encode %{
 7740     assert(UseSSE >= 4, "required");
 7741     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7742 
 7743     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7744   %}
 7745   ins_pipe( pipe_slow );
 7746 %}
 7747 
 7748 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7749   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7750             Matcher::vector_length(n->in(1)) == 8);  // src
 7751   match(Set dst (ExtractL src idx));
 7752   effect(TEMP vtmp);
 7753   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7754   ins_encode %{
 7755     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7756 
 7757     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7758     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7759   %}
 7760   ins_pipe( pipe_slow );
 7761 %}
 7762 #endif
 7763 
 7764 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7765   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7766   match(Set dst (ExtractF src idx));
 7767   effect(TEMP dst, TEMP vtmp);
 7768   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7769   ins_encode %{
 7770     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7771 
 7772     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7773   %}
 7774   ins_pipe( pipe_slow );
 7775 %}
 7776 
 7777 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7778   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7779             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7780   match(Set dst (ExtractF src idx));
 7781   effect(TEMP vtmp);
 7782   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7783   ins_encode %{
 7784     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7785 
 7786     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7787     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7788   %}
 7789   ins_pipe( pipe_slow );
 7790 %}
 7791 
 7792 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7793   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7794   match(Set dst (ExtractD src idx));
 7795   format %{ "extractD $dst,$src,$idx\t!" %}
 7796   ins_encode %{
 7797     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7798 
 7799     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7800   %}
 7801   ins_pipe( pipe_slow );
 7802 %}
 7803 
 7804 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7805   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7806             Matcher::vector_length(n->in(1)) == 8);  // src
 7807   match(Set dst (ExtractD src idx));
 7808   effect(TEMP vtmp);
 7809   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7810   ins_encode %{
 7811     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7812 
 7813     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7814     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7815   %}
 7816   ins_pipe( pipe_slow );
 7817 %}
 7818 
 7819 // --------------------------------- Vector Blend --------------------------------------
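      // Per-lane select: lanes whose mask element is set take their value from
      // src2, all other lanes take their value from src1.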
 7820 
 7821 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7822   predicate(UseAVX == 0);
 7823   match(Set dst (VectorBlend (Binary dst src) mask));
 7824   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7825   effect(TEMP tmp);
 7826   ins_encode %{
 7827     assert(UseSSE >= 4, "required");
 7828 
 7829     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7830       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7831     }
 7832     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7833   %}
 7834   ins_pipe( pipe_slow );
 7835 %}
 7836 
 7837 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7838   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7839             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7840             Matcher::vector_length_in_bytes(n) <= 32 &&
 7841             is_integral_type(Matcher::vector_element_basic_type(n)));
 7842   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7843   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7844   ins_encode %{
 7845     int vlen_enc = vector_length_encoding(this);
 7846     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7847   %}
 7848   ins_pipe( pipe_slow );
 7849 %}
 7850 
 7851 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7852   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7853             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7854             Matcher::vector_length_in_bytes(n) <= 32 &&
 7855             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7856   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7857   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7858   ins_encode %{
 7859     int vlen_enc = vector_length_encoding(this);
 7860     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7861   %}
 7862   ins_pipe( pipe_slow );
 7863 %}
 7864 
 7865 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 7866   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 7867             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7868             Matcher::vector_length_in_bytes(n) <= 32);
 7869   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7870   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 7871   effect(TEMP vtmp, TEMP dst);
 7872   ins_encode %{
 7873     int vlen_enc = vector_length_encoding(this);
 7874     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7875     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7876     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 7877   %}
 7878   ins_pipe( pipe_slow );
 7879 %}
 7880 
 7881 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7882   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7883             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 7884   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7885   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 7886   effect(TEMP ktmp);
 7887   ins_encode %{
 7888     int vlen_enc = Assembler::AVX_512bit;
 7889     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7890     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7891     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7892   %}
 7893   ins_pipe( pipe_slow );
 7894 %}
 7895 
 7896 
 7897 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7898   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7899             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7900              VM_Version::supports_avx512bw()));
 7901   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7902   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7903   ins_encode %{
 7904     int vlen_enc = vector_length_encoding(this);
 7905     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7906     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7907   %}
 7908   ins_pipe( pipe_slow );
 7909 %}
 7910 
 7911 // --------------------------------- ABS --------------------------------------
 7912 // a = |a|
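      // Byte, short and int lanes use (v)pabsb/w/d directly; long lanes need the
      // EVEX-only evpabsq encoding and therefore require AVX-512.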
 7913 instruct vabsB_reg(vec dst, vec src) %{
 7914   match(Set dst (AbsVB  src));
 7915   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7916   ins_encode %{
 7917     uint vlen = Matcher::vector_length(this);
 7918     if (vlen <= 16) {
 7919       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7920     } else {
 7921       int vlen_enc = vector_length_encoding(this);
 7922       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7923     }
 7924   %}
 7925   ins_pipe( pipe_slow );
 7926 %}
 7927 
 7928 instruct vabsS_reg(vec dst, vec src) %{
 7929   match(Set dst (AbsVS  src));
 7930   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7931   ins_encode %{
 7932     uint vlen = Matcher::vector_length(this);
 7933     if (vlen <= 8) {
 7934       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7935     } else {
 7936       int vlen_enc = vector_length_encoding(this);
 7937       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7938     }
 7939   %}
 7940   ins_pipe( pipe_slow );
 7941 %}
 7942 
 7943 instruct vabsI_reg(vec dst, vec src) %{
 7944   match(Set dst (AbsVI  src));
 7945   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7946   ins_encode %{
 7947     uint vlen = Matcher::vector_length(this);
 7948     if (vlen <= 4) {
 7949       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7950     } else {
 7951       int vlen_enc = vector_length_encoding(this);
 7952       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7953     }
 7954   %}
 7955   ins_pipe( pipe_slow );
 7956 %}
 7957 
 7958 instruct vabsL_reg(vec dst, vec src) %{
 7959   match(Set dst (AbsVL  src));
 7960   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7961   ins_encode %{
 7962     assert(UseAVX > 2, "required");
 7963     int vlen_enc = vector_length_encoding(this);
 7964     if (!VM_Version::supports_avx512vl()) {
 7965       vlen_enc = Assembler::AVX_512bit;
 7966     }
 7967     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7968   %}
 7969   ins_pipe( pipe_slow );
 7970 %}
 7971 
 7972 // --------------------------------- ABSNEG --------------------------------------
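      // AbsVF/NegVF and AbsVD/NegVD share one implementation: the lane sign bits
      // are cleared (abs) or flipped (neg) with a constant mask, selected by the
      // ideal opcode.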
 7973 
 7974 instruct vabsnegF(vec dst, vec src) %{
 7975   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7976   match(Set dst (AbsVF src));
 7977   match(Set dst (NegVF src));
 7978   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7979   ins_cost(150);
 7980   ins_encode %{
 7981     int opcode = this->ideal_Opcode();
 7982     int vlen = Matcher::vector_length(this);
 7983     if (vlen == 2) {
 7984       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7985     } else {
 7986       assert(vlen == 8 || vlen == 16, "required");
 7987       int vlen_enc = vector_length_encoding(this);
 7988       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7989     }
 7990   %}
 7991   ins_pipe( pipe_slow );
 7992 %}
 7993 
 7994 instruct vabsneg4F(vec dst) %{
 7995   predicate(Matcher::vector_length(n) == 4);
 7996   match(Set dst (AbsVF dst));
 7997   match(Set dst (NegVF dst));
 7998   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7999   ins_cost(150);
 8000   ins_encode %{
 8001     int opcode = this->ideal_Opcode();
 8002     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8003   %}
 8004   ins_pipe( pipe_slow );
 8005 %}
 8006 
 8007 instruct vabsnegD(vec dst, vec src) %{
 8008   match(Set dst (AbsVD  src));
 8009   match(Set dst (NegVD  src));
 8010   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8011   ins_encode %{
 8012     int opcode = this->ideal_Opcode();
 8013     uint vlen = Matcher::vector_length(this);
 8014     if (vlen == 2) {
 8015       assert(UseSSE >= 2, "required");
 8016       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8017     } else {
 8018       int vlen_enc = vector_length_encoding(this);
 8019       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8020     }
 8021   %}
 8022   ins_pipe( pipe_slow );
 8023 %}
 8024 
 8025 //------------------------------------- VectorTest --------------------------------------------
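      // VectorTest sets the condition flags from a vector of per-lane results so
      // that the following branch can test "all lanes set" (BoolTest::overflow)
      // or "any lane set" (BoolTest::ne), using vptest-style tests for xmm inputs
      // and kmov/kortest for opmask inputs.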
 8026 
 8027 #ifdef _LP64
 8028 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8029   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8030   match(Set cr (VectorTest src1 src2));
 8031   effect(TEMP vtmp);
 8032   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8033   ins_encode %{
 8034     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8035     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8036     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8037   %}
 8038   ins_pipe( pipe_slow );
 8039 %}
 8040 
 8041 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8042   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8043   match(Set cr (VectorTest src1 src2));
 8044   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8045   ins_encode %{
 8046     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8047     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8048     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8049   %}
 8050   ins_pipe( pipe_slow );
 8051 %}
 8052 
 8053 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8054   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8055              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8056             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8057   match(Set cr (VectorTest src1 src2));
 8058   effect(TEMP tmp);
 8059   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8060   ins_encode %{
 8061     uint masklen = Matcher::vector_length(this, $src1);
 8062     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8063     __ andl($tmp$$Register, (1 << masklen) - 1);
 8064     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8065   %}
 8066   ins_pipe( pipe_slow );
 8067 %}
 8068 
 8069 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8070   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8071              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8072             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8073   match(Set cr (VectorTest src1 src2));
 8074   effect(TEMP tmp);
 8075   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8076   ins_encode %{
 8077     uint masklen = Matcher::vector_length(this, $src1);
 8078     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8079     __ andl($tmp$$Register, (1 << masklen) - 1);
 8080   %}
 8081   ins_pipe( pipe_slow );
 8082 %}
 8083 
 8084 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8085   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8086             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8087   match(Set cr (VectorTest src1 src2));
 8088   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8089   ins_encode %{
 8090     uint masklen = Matcher::vector_length(this, $src1);
 8091     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8092   %}
 8093   ins_pipe( pipe_slow );
 8094 %}
 8095 #endif
 8096 
 8097 //------------------------------------- LoadMask --------------------------------------------
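      // VectorLoadMask widens a boolean vector (one byte per lane, 0 or 1) into a
      // lane-sized mask: either an all-ones/all-zeros xmm vector or an AVX-512
      // opmask register, depending on the mask representation in use.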
 8098 
 8099 instruct loadMask(legVec dst, legVec src) %{
 8100   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8101   match(Set dst (VectorLoadMask src));
 8102   effect(TEMP dst);
 8103   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8104   ins_encode %{
 8105     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8106     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8107     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8108   %}
 8109   ins_pipe( pipe_slow );
 8110 %}
 8111 
 8112 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8113   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8114   match(Set dst (VectorLoadMask src));
 8115   effect(TEMP xtmp);
 8116   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8117   ins_encode %{
 8118     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8119                         true, Assembler::AVX_512bit);
 8120   %}
 8121   ins_pipe( pipe_slow );
 8122 %}
 8123 
 8124 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8125   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8126   match(Set dst (VectorLoadMask src));
 8127   effect(TEMP xtmp);
 8128   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8129   ins_encode %{
 8130     int vlen_enc = vector_length_encoding(in(1));
 8131     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8132                         false, vlen_enc);
 8133   %}
 8134   ins_pipe( pipe_slow );
 8135 %}
 8136 
 8137 //------------------------------------- StoreMask --------------------------------------------
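      // VectorStoreMask is the inverse of VectorLoadMask: it narrows a lane-sized
      // mask (xmm vector or opmask register) back to one byte per lane holding
      // 0 or 1; $size is the element size in bytes of the incoming mask.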
 8138 
 8139 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8140   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8141   match(Set dst (VectorStoreMask src size));
 8142   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8143   ins_encode %{
 8144     int vlen = Matcher::vector_length(this);
 8145     if (vlen <= 16 && UseAVX <= 2) {
 8146       assert(UseSSE >= 3, "required");
 8147       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8148     } else {
 8149       assert(UseAVX > 0, "required");
 8150       int src_vlen_enc = vector_length_encoding(this, $src);
 8151       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8152     }
 8153   %}
 8154   ins_pipe( pipe_slow );
 8155 %}
 8156 
 8157 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8158   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8159   match(Set dst (VectorStoreMask src size));
 8160   effect(TEMP_DEF dst, TEMP xtmp);
 8161   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8162   ins_encode %{
 8163     int vlen_enc = Assembler::AVX_128bit;
 8164     int vlen = Matcher::vector_length(this);
 8165     if (vlen <= 8) {
 8166       assert(UseSSE >= 3, "required");
 8167       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8168       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8169       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8170     } else {
 8171       assert(UseAVX > 0, "required");
 8172       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8173       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8174       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8175     }
 8176   %}
 8177   ins_pipe( pipe_slow );
 8178 %}
 8179 
 8180 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8181   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8182   match(Set dst (VectorStoreMask src size));
 8183   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8184   effect(TEMP_DEF dst, TEMP xtmp);
 8185   ins_encode %{
 8186     int vlen_enc = Assembler::AVX_128bit;
 8187     int vlen = Matcher::vector_length(this);
 8188     if (vlen <= 4) {
 8189       assert(UseSSE >= 3, "required");
 8190       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8191       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8192       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8193       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8194     } else {
 8195       assert(UseAVX > 0, "required");
 8196       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8197       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8198       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8199       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8200       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8201     }
 8202   %}
 8203   ins_pipe( pipe_slow );
 8204 %}
 8205 
 8206 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8207   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8208   match(Set dst (VectorStoreMask src size));
 8209   effect(TEMP_DEF dst, TEMP xtmp);
 8210   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8211   ins_encode %{
 8212     assert(UseSSE >= 3, "required");
 8213     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8214     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8215     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8216     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8217     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8218   %}
 8219   ins_pipe( pipe_slow );
 8220 %}
 8221 
 8222 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8223   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8224   match(Set dst (VectorStoreMask src size));
 8225   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8226   effect(TEMP_DEF dst, TEMP vtmp);
 8227   ins_encode %{
 8228     int vlen_enc = Assembler::AVX_128bit;
 8229     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8230     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8231     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8232     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8233     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8234     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8235     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8236   %}
 8237   ins_pipe( pipe_slow );
 8238 %}
 8239 
 8240 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8241   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8242   match(Set dst (VectorStoreMask src size));
 8243   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8244   ins_encode %{
 8245     int src_vlen_enc = vector_length_encoding(this, $src);
 8246     int dst_vlen_enc = vector_length_encoding(this);
 8247     if (!VM_Version::supports_avx512vl()) {
 8248       src_vlen_enc = Assembler::AVX_512bit;
 8249     }
 8250     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8251     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8252   %}
 8253   ins_pipe( pipe_slow );
 8254 %}
 8255 
 8256 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8257   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8258   match(Set dst (VectorStoreMask src size));
 8259   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8260   ins_encode %{
 8261     int src_vlen_enc = vector_length_encoding(this, $src);
 8262     int dst_vlen_enc = vector_length_encoding(this);
 8263     if (!VM_Version::supports_avx512vl()) {
 8264       src_vlen_enc = Assembler::AVX_512bit;
 8265     }
 8266     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8267     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8268   %}
 8269   ins_pipe( pipe_slow );
 8270 %}
 8271 
 8272 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8273   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8274   match(Set dst (VectorStoreMask mask size));
 8275   effect(TEMP_DEF dst);
 8276   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8277   ins_encode %{
 8278     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8279     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8280                  false, Assembler::AVX_512bit, noreg);
 8281     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8282   %}
 8283   ins_pipe( pipe_slow );
 8284 %}
 8285 
 8286 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8287   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8288   match(Set dst (VectorStoreMask mask size));
 8289   effect(TEMP_DEF dst);
 8290   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8291   ins_encode %{
 8292     int dst_vlen_enc = vector_length_encoding(this);
 8293     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8294     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8295   %}
 8296   ins_pipe( pipe_slow );
 8297 %}
 8298 
 8299 instruct vmaskcast_evex(kReg dst) %{
 8300   match(Set dst (VectorMaskCast dst));
 8301   ins_cost(0);
 8302   format %{ "vector_mask_cast $dst" %}
 8303   ins_encode %{
 8304     // empty
 8305   %}
 8306   ins_pipe(empty);
 8307 %}
 8308 
 8309 instruct vmaskcast(vec dst) %{
 8310   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8311   match(Set dst (VectorMaskCast dst));
 8312   ins_cost(0);
 8313   format %{ "vector_mask_cast $dst" %}
 8314   ins_encode %{
 8315     // empty
 8316   %}
 8317   ins_pipe(empty);
 8318 %}
 8319 
 8320 instruct vmaskcast_avx(vec dst, vec src) %{
 8321   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8322   match(Set dst (VectorMaskCast src));
 8323   format %{ "vector_mask_cast $dst, $src" %}
 8324   ins_encode %{
 8325     int vlen = Matcher::vector_length(this);
 8326     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8327     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8328     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8329   %}
 8330   ins_pipe(pipe_slow);
 8331 %}
 8332 
 8333 //-------------------------------- Load Iota Indices ----------------------------------
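      // Materializes the constant index vector {0, 1, 2, ...} of the node's
      // element type from a constant table in memory.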
 8334 
 8335 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8336   match(Set dst (VectorLoadConst src));
 8337   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8338   ins_encode %{
 8339      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8340      BasicType bt = Matcher::vector_element_basic_type(this);
 8341      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8342   %}
 8343   ins_pipe( pipe_slow );
 8344 %}
 8345 
 8346 #ifdef _LP64
 8347 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8348   match(Set dst (PopulateIndex src1 src2));
 8349   effect(TEMP dst, TEMP vtmp);
 8350   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8351   ins_encode %{
 8352      assert($src2$$constant == 1, "required");
 8353      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8354      int vlen_enc = vector_length_encoding(this);
 8355      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8356      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8357      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8358      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8359   %}
 8360   ins_pipe( pipe_slow );
 8361 %}
 8362 
 8363 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8364   match(Set dst (PopulateIndex src1 src2));
 8365   effect(TEMP dst, TEMP vtmp);
 8366   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8367   ins_encode %{
 8368      assert($src2$$constant == 1, "required");
 8369      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8370      int vlen_enc = vector_length_encoding(this);
 8371      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8372      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8373      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8374      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8375   %}
 8376   ins_pipe( pipe_slow );
 8377 %}
 8378 #endif
 8379 //-------------------------------- Rearrange ----------------------------------
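      // VectorRearrange permutes lanes through an index vector:
      // dst[i] = src[shuffle[i]]. VectorLoadShuffle converts the byte-sized
      // shuffle indices into whatever index format the rearrange implementation
      // for the given element type expects.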
 8380 
 8381 // LoadShuffle/Rearrange for Byte
 8382 
 8383 instruct loadShuffleB(vec dst) %{
 8384   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8385   match(Set dst (VectorLoadShuffle dst));
 8386   format %{ "vector_load_shuffle $dst, $dst" %}
 8387   ins_encode %{
 8388     // empty
 8389   %}
 8390   ins_pipe( pipe_slow );
 8391 %}
 8392 
 8393 instruct rearrangeB(vec dst, vec shuffle) %{
 8394   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8395             Matcher::vector_length(n) < 32);
 8396   match(Set dst (VectorRearrange dst shuffle));
 8397   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8398   ins_encode %{
 8399     assert(UseSSE >= 4, "required");
 8400     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8401   %}
 8402   ins_pipe( pipe_slow );
 8403 %}
 8404 
 8405 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8406   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8407             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8408   match(Set dst (VectorRearrange src shuffle));
 8409   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8410   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8411   ins_encode %{
 8412     assert(UseAVX >= 2, "required");
 8413     // Swap src into vtmp1
 8414     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8415     // Shuffle swapped src to get entries from other 128 bit lane
 8416     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8417     // Shuffle original src to get entries from self 128 bit lane
 8418     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8419     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8420     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8421     // Perform the blend
 8422     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8423   %}
 8424   ins_pipe( pipe_slow );
 8425 %}
 8426 
 8427 
 8428 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8429   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8430             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8431   match(Set dst (VectorRearrange src shuffle));
 8432   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8433   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8434   ins_encode %{
 8435     int vlen_enc = vector_length_encoding(this);
 8436     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8437                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8438                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8439   %}
 8440   ins_pipe( pipe_slow );
 8441 %}
 8442 
 8443 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8444   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8445             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8446   match(Set dst (VectorRearrange src shuffle));
 8447   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8448   ins_encode %{
 8449     int vlen_enc = vector_length_encoding(this);
 8450     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8451   %}
 8452   ins_pipe( pipe_slow );
 8453 %}
 8454 
 8455 // LoadShuffle/Rearrange for Short
 8456 
 8457 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8458   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8459             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8460   match(Set dst (VectorLoadShuffle src));
 8461   effect(TEMP dst, TEMP vtmp);
 8462   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8463   ins_encode %{
 8464     // Create a byte shuffle mask from the short shuffle mask, since
 8465     // only a byte shuffle instruction is available on these platforms.
 8466     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8467     if (UseAVX == 0) {
 8468       assert(vlen_in_bytes <= 16, "required");
 8469       // Multiply each shuffle by two to get byte index
 8470       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8471       __ psllw($vtmp$$XMMRegister, 1);
 8472 
 8473       // Duplicate to create 2 copies of byte index
 8474       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8475       __ psllw($dst$$XMMRegister, 8);
 8476       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8477 
 8478       // Add one to get alternate byte index
 8479       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8480       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8481     } else {
 8482       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8483       int vlen_enc = vector_length_encoding(this);
 8484       // Multiply each shuffle by two to get byte index
 8485       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8486       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8487 
 8488       // Duplicate to create 2 copies of byte index
 8489       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8490       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8491 
 8492       // Add one to get alternate byte index
 8493       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8494     }
 8495   %}
 8496   ins_pipe( pipe_slow );
 8497 %}
 8498 
 8499 instruct rearrangeS(vec dst, vec shuffle) %{
 8500   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8501             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8502   match(Set dst (VectorRearrange dst shuffle));
 8503   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8504   ins_encode %{
 8505     assert(UseSSE >= 4, "required");
 8506     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8507   %}
 8508   ins_pipe( pipe_slow );
 8509 %}
 8510 
 8511 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8512   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8513             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8514   match(Set dst (VectorRearrange src shuffle));
 8515   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8516   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8517   ins_encode %{
 8518     assert(UseAVX >= 2, "required");
 8519     // Swap src into vtmp1
 8520     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8521     // Shuffle swapped src to get entries from other 128 bit lane
 8522     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8523     // Shuffle original src to get entries from self 128 bit lane
 8524     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8525     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8526     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8527     // Perform the blend
 8528     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8529   %}
 8530   ins_pipe( pipe_slow );
 8531 %}
 8532 
 8533 instruct loadShuffleS_evex(vec dst, vec src) %{
 8534   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8535             VM_Version::supports_avx512bw());
 8536   match(Set dst (VectorLoadShuffle src));
 8537   format %{ "vector_load_shuffle $dst, $src" %}
 8538   ins_encode %{
 8539     int vlen_enc = vector_length_encoding(this);
 8540     if (!VM_Version::supports_avx512vl()) {
 8541       vlen_enc = Assembler::AVX_512bit;
 8542     }
 8543     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8544   %}
 8545   ins_pipe( pipe_slow );
 8546 %}
 8547 
 8548 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8549   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8550             VM_Version::supports_avx512bw());
 8551   match(Set dst (VectorRearrange src shuffle));
 8552   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8553   ins_encode %{
 8554     int vlen_enc = vector_length_encoding(this);
 8555     if (!VM_Version::supports_avx512vl()) {
 8556       vlen_enc = Assembler::AVX_512bit;
 8557     }
 8558     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8559   %}
 8560   ins_pipe( pipe_slow );
 8561 %}
 8562 
 8563 // LoadShuffle/Rearrange for Integer and Float
 8564 
 8565 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8566   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8567             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8568   match(Set dst (VectorLoadShuffle src));
 8569   effect(TEMP dst, TEMP vtmp);
 8570   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8571   ins_encode %{
 8572     assert(UseSSE >= 4, "required");
 8573 
 8574     // Create a byte shuffle mask from the int shuffle mask, since
 8575     // only a byte shuffle instruction is available on these platforms.
 8576 
 8577     // Duplicate and multiply each shuffle by 4
 8578     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8579     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8580     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8581     __ psllw($vtmp$$XMMRegister, 2);
 8582 
 8583     // Duplicate again to create 4 copies of byte index
 8584     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8585     __ psllw($dst$$XMMRegister, 8);
 8586     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8587 
 8588     // Add 3,2,1,0 to get alternate byte index
 8589     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8590     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8591   %}
 8592   ins_pipe( pipe_slow );
 8593 %}
 8594 
 8595 instruct rearrangeI(vec dst, vec shuffle) %{
 8596   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8597             UseAVX == 0);
 8598   match(Set dst (VectorRearrange dst shuffle));
 8599   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8600   ins_encode %{
 8601     assert(UseSSE >= 4, "required");
 8602     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8603   %}
 8604   ins_pipe( pipe_slow );
 8605 %}
 8606 
 8607 instruct loadShuffleI_avx(vec dst, vec src) %{
 8608   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8609             UseAVX > 0);
 8610   match(Set dst (VectorLoadShuffle src));
 8611   format %{ "vector_load_shuffle $dst, $src" %}
 8612   ins_encode %{
 8613     int vlen_enc = vector_length_encoding(this);
 8614     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8615   %}
 8616   ins_pipe( pipe_slow );
 8617 %}
 8618 
 8619 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8620   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8621             UseAVX > 0);
 8622   match(Set dst (VectorRearrange src shuffle));
 8623   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8624   ins_encode %{
 8625     int vlen_enc = vector_length_encoding(this);
 8626     BasicType bt = Matcher::vector_element_basic_type(this);
 8627     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8628   %}
 8629   ins_pipe( pipe_slow );
 8630 %}
 8631 
 8632 // LoadShuffle/Rearrange for Long and Double
 8633 
 8634 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8635   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8636             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8637   match(Set dst (VectorLoadShuffle src));
 8638   effect(TEMP dst, TEMP vtmp);
 8639   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8640   ins_encode %{
 8641     assert(UseAVX >= 2, "required");
 8642 
 8643     int vlen_enc = vector_length_encoding(this);
 8644     // Create a double-word shuffle mask from the long shuffle mask, since
 8645     // only a double-word shuffle instruction is available on these platforms.
 8646 
 8647     // Multiply each shuffle by two to get double word index
 8648     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8649     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8650 
 8651     // Duplicate each double word shuffle
 8652     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8653     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8654 
 8655     // Add one to get alternate double word index
 8656     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8657   %}
 8658   ins_pipe( pipe_slow );
 8659 %}
 8660 
 8661 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8662   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8663             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8664   match(Set dst (VectorRearrange src shuffle));
 8665   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8666   ins_encode %{
 8667     assert(UseAVX >= 2, "required");
 8668 
 8669     int vlen_enc = vector_length_encoding(this);
 8670     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8671   %}
 8672   ins_pipe( pipe_slow );
 8673 %}
 8674 
 8675 instruct loadShuffleL_evex(vec dst, vec src) %{
 8676   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8677             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8678   match(Set dst (VectorLoadShuffle src));
 8679   format %{ "vector_load_shuffle $dst, $src" %}
 8680   ins_encode %{
 8681     assert(UseAVX > 2, "required");
 8682 
 8683     int vlen_enc = vector_length_encoding(this);
 8684     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8685   %}
 8686   ins_pipe( pipe_slow );
 8687 %}
 8688 
 8689 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8690   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8691             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8692   match(Set dst (VectorRearrange src shuffle));
 8693   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8694   ins_encode %{
 8695     assert(UseAVX > 2, "required");
 8696 
 8697     int vlen_enc = vector_length_encoding(this);
 8698     if (vlen_enc == Assembler::AVX_128bit) {
 8699       vlen_enc = Assembler::AVX_256bit;
 8700     }
 8701     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8702   %}
 8703   ins_pipe( pipe_slow );
 8704 %}
 8705 
 8706 // --------------------------------- FMA --------------------------------------
 8707 // a * b + c
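      // Each lane computes a * b + c as a fused multiply-add with a single
      // rounding; these rules are only used when UseFMA is enabled.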
 8708 
 8709 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8710   match(Set c (FmaVF  c (Binary a b)));
 8711   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8712   ins_cost(150);
 8713   ins_encode %{
 8714     assert(UseFMA, "not enabled");
 8715     int vlen_enc = vector_length_encoding(this);
 8716     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8717   %}
 8718   ins_pipe( pipe_slow );
 8719 %}
 8720 
 8721 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8722   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8723   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8724   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8725   ins_cost(150);
 8726   ins_encode %{
 8727     assert(UseFMA, "not enabled");
 8728     int vlen_enc = vector_length_encoding(this);
 8729     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8730   %}
 8731   ins_pipe( pipe_slow );
 8732 %}
 8733 
 8734 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8735   match(Set c (FmaVD  c (Binary a b)));
 8736   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8737   ins_cost(150);
 8738   ins_encode %{
 8739     assert(UseFMA, "not enabled");
 8740     int vlen_enc = vector_length_encoding(this);
 8741     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8742   %}
 8743   ins_pipe( pipe_slow );
 8744 %}
 8745 
 8746 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8747   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8748   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8749   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8750   ins_cost(150);
 8751   ins_encode %{
 8752     assert(UseFMA, "not enabled");
 8753     int vlen_enc = vector_length_encoding(this);
 8754     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8755   %}
 8756   ins_pipe( pipe_slow );
 8757 %}
 8758 
 8759 // --------------------------------- Vector Multiply Add --------------------------------------
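      // MulAddVS2VI multiplies adjacent short lanes and adds each pair into an
      // int lane, i.e. dst[i] = src1[2i]*src2[2i] + src1[2i+1]*src2[2i+1], which
      // is exactly the (v)pmaddwd operation.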
 8760 
 8761 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8762   predicate(UseAVX == 0);
 8763   match(Set dst (MulAddVS2VI dst src1));
 8764   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8765   ins_encode %{
 8766     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8767   %}
 8768   ins_pipe( pipe_slow );
 8769 %}
 8770 
 8771 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8772   predicate(UseAVX > 0);
 8773   match(Set dst (MulAddVS2VI src1 src2));
 8774   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8775   ins_encode %{
 8776     int vlen_enc = vector_length_encoding(this);
 8777     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8778   %}
 8779   ins_pipe( pipe_slow );
 8780 %}
 8781 
 8782 // --------------------------------- Vector Multiply Add Add ----------------------------------
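      // With AVX-512 VNNI the multiply-add and the following vector add are fused
      // into a single evpdpwssd that accumulates directly into $dst.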
 8783 
 8784 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8785   predicate(VM_Version::supports_avx512_vnni());
 8786   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8787   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8788   ins_encode %{
 8789     assert(UseAVX > 2, "required");
 8790     int vlen_enc = vector_length_encoding(this);
 8791     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794   ins_cost(10);
 8795 %}
 8796 
 8797 // --------------------------------- PopCount --------------------------------------
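      // When is_vector_popcount_predicate() holds for the element type, the EVEX rules
      // below count bits directly (optionally under an opmask); otherwise the AVX rule
      // falls back to a macro-assembler sequence using vector and scalar temporaries.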
 8798 
 8799 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8800   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8801   match(Set dst (PopCountVI src));
 8802   match(Set dst (PopCountVL src));
 8803   format %{ "vector_popcount_integral $dst, $src" %}
 8804   ins_encode %{
 8805     int opcode = this->ideal_Opcode();
 8806     int vlen_enc = vector_length_encoding(this, $src);
 8807     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8808     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8809   %}
 8810   ins_pipe( pipe_slow );
 8811 %}
 8812 
 8813 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8814   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8815   match(Set dst (PopCountVI src mask));
 8816   match(Set dst (PopCountVL src mask));
 8817   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8818   ins_encode %{
 8819     int vlen_enc = vector_length_encoding(this, $src);
 8820     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8821     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8822     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8823   %}
 8824   ins_pipe( pipe_slow );
 8825 %}
 8826 
 8827 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8828   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8829   match(Set dst (PopCountVI src));
 8830   match(Set dst (PopCountVL src));
 8831   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8832   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8833   ins_encode %{
 8834     int opcode = this->ideal_Opcode();
 8835     int vlen_enc = vector_length_encoding(this, $src);
 8836     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8837     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8838                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8839   %}
 8840   ins_pipe( pipe_slow );
 8841 %}
 8842 
 8843 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
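      // CountTrailingZerosV is lowered via the macro assembler. Separate rules cover
      // int/long elements on AVX512CD-capable targets, short and byte elements (which
      // need extra temporaries), and an AVX-only fallback for vectors below 64 bytes.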
 8844 
 8845 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8846   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8847                                               Matcher::vector_length_in_bytes(n->in(1))));
 8848   match(Set dst (CountTrailingZerosV src));
 8849   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8850   ins_cost(400);
 8851   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8852   ins_encode %{
 8853     int vlen_enc = vector_length_encoding(this, $src);
 8854     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8855     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8856                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8857   %}
 8858   ins_pipe( pipe_slow );
 8859 %}
 8860 
 8861 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8862   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8863             VM_Version::supports_avx512cd() &&
 8864             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8865   match(Set dst (CountTrailingZerosV src));
 8866   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8867   ins_cost(400);
 8868   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8869   ins_encode %{
 8870     int vlen_enc = vector_length_encoding(this, $src);
 8871     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8872     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8873                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8874   %}
 8875   ins_pipe( pipe_slow );
 8876 %}
 8877 
 8878 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8879   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8880   match(Set dst (CountTrailingZerosV src));
 8881   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8882   ins_cost(400);
 8883   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8884   ins_encode %{
 8885     int vlen_enc = vector_length_encoding(this, $src);
 8886     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8887     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8888                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8889                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8890   %}
 8891   ins_pipe( pipe_slow );
 8892 %}
 8893 
 8894 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8895   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8896   match(Set dst (CountTrailingZerosV src));
 8897   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8898   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8899   ins_encode %{
 8900     int vlen_enc = vector_length_encoding(this, $src);
 8901     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8902     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8903                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8904   %}
 8905   ins_pipe( pipe_slow );
 8906 %}
 8907 
 8908 
 8909 // --------------------------------- Bitwise Ternary Logic ----------------------------------
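      // vpternlogd evaluates an arbitrary three-input boolean function of dst, src2 and
      // src3: the 8-bit immediate $func is the truth table, indexed by the corresponding
      // bits of the three inputs (e.g. 0x96 is three-way XOR, 0xE8 is majority).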
 8910 
 8911 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8912   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8913   effect(TEMP dst);
 8914   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8915   ins_encode %{
 8916     int vector_len = vector_length_encoding(this);
 8917     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8918   %}
 8919   ins_pipe( pipe_slow );
 8920 %}
 8921 
 8922 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8923   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8924   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8925   effect(TEMP dst);
 8926   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8927   ins_encode %{
 8928     int vector_len = vector_length_encoding(this);
 8929     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8930   %}
 8931   ins_pipe( pipe_slow );
 8932 %}
 8933 
 8934 // --------------------------------- Rotation Operations ----------------------------------
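      // RotateLeftV and RotateRightV share these rules; the ideal opcode passed to the
      // macro assembler selects the rotate direction for both the immediate and the
      // variable-shift forms.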
 8935 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8936   match(Set dst (RotateLeftV src shift));
 8937   match(Set dst (RotateRightV src shift));
 8938   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8939   ins_encode %{
 8940     int opcode      = this->ideal_Opcode();
 8941     int vector_len  = vector_length_encoding(this);
 8942     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8943     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8944   %}
 8945   ins_pipe( pipe_slow );
 8946 %}
 8947 
 8948 instruct vprorate(vec dst, vec src, vec shift) %{
 8949   match(Set dst (RotateLeftV src shift));
 8950   match(Set dst (RotateRightV src shift));
 8951   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8952   ins_encode %{
 8953     int opcode      = this->ideal_Opcode();
 8954     int vector_len  = vector_length_encoding(this);
 8955     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8956     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8957   %}
 8958   ins_pipe( pipe_slow );
 8959 %}
 8960 
 8961 // ---------------------------------- Masked Operations ------------------------------------
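      // Masked loads and stores come in two flavours: on AVX targets the mask lives in a
      // vector register and is applied with vmovmask, while on AVX512 targets it is an
      // opmask (kReg) applied directly by the EVEX move (evmovdqu).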
 8962 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8963   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8964   match(Set dst (LoadVectorMasked mem mask));
 8965   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8966   ins_encode %{
 8967     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8968     int vlen_enc = vector_length_encoding(this);
 8969     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8970   %}
 8971   ins_pipe( pipe_slow );
 8972 %}
 8973 
 8974 
 8975 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8976   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8977   match(Set dst (LoadVectorMasked mem mask));
 8978   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8979   ins_encode %{
 8980     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 8981     int vector_len = vector_length_encoding(this);
 8982     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 8983   %}
 8984   ins_pipe( pipe_slow );
 8985 %}
 8986 
 8987 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 8988   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8989   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8990   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8991   ins_encode %{
 8992     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8993     int vlen_enc = vector_length_encoding(src_node);
 8994     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 8995     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8996   %}
 8997   ins_pipe( pipe_slow );
 8998 %}
 8999 
 9000 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9001   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9002   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9003   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9004   ins_encode %{
 9005     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9006     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9007     int vlen_enc = vector_length_encoding(src_node);
 9008     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9009   %}
 9010   ins_pipe( pipe_slow );
 9011 %}
 9012 
 9013 #ifdef _LP64
 9014 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9015   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9016   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9017   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9018   ins_encode %{
 9019     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9020     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9021 
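          // ktmp2 holds the complement of $mask so that inactive lanes always count as
          // "equal". If every active lane compares equal the result stays -1; otherwise
          // the first clear bit of the comparison mask gives the mismatch index.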
 9022     Label DONE;
 9023     int vlen_enc = vector_length_encoding(this, $src1);
 9024     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9025 
 9026     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9027     __ mov64($dst$$Register, -1L);
 9028     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9029     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9030     __ jccb(Assembler::carrySet, DONE);
 9031     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9032     __ notq($dst$$Register);
 9033     __ tzcntq($dst$$Register, $dst$$Register);
 9034     __ bind(DONE);
 9035   %}
 9036   ins_pipe( pipe_slow );
 9037 %}
 9038 
 9039 
 9040 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9041   match(Set dst (VectorMaskGen len));
 9042   effect(TEMP temp, KILL cr);
 9043   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9044   ins_encode %{
 9045     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9046   %}
 9047   ins_pipe( pipe_slow );
 9048 %}
 9049 
 9050 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9051   match(Set dst (VectorMaskGen len));
 9052   format %{ "vector_mask_gen $dst, $len \t! vector mask generator" %}
 9053   effect(TEMP temp);
 9054   ins_encode %{
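          // Materialize a mask with the low $len bits set: shift an all-ones value right
          // by (64 - len) and move it into the opmask register.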
 9055     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9056     __ kmovql($dst$$KRegister, $temp$$Register);
 9057   %}
 9058   ins_pipe( pipe_slow );
 9059 %}
 9060 
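      // VectorMaskToLong, TrueCount, FirstTrue and LastTrue each come in three variants:
      // one operating directly on an EVEX opmask, one taking a boolean vector, and an AVX
      // form that folds away an intervening VectorStoreMask.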
 9061 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9062   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9063   match(Set dst (VectorMaskToLong mask));
 9064   effect(TEMP dst, KILL cr);
 9065   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9066   ins_encode %{
 9067     int opcode = this->ideal_Opcode();
 9068     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9069     int mask_len = Matcher::vector_length(this, $mask);
 9070     int mask_size = mask_len * type2aelembytes(mbt);
 9071     int vlen_enc = vector_length_encoding(this, $mask);
 9072     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9073                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9074   %}
 9075   ins_pipe( pipe_slow );
 9076 %}
 9077 
 9078 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9079   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9080   match(Set dst (VectorMaskToLong mask));
 9081   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9082   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9083   ins_encode %{
 9084     int opcode = this->ideal_Opcode();
 9085     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9086     int mask_len = Matcher::vector_length(this, $mask);
 9087     int vlen_enc = vector_length_encoding(this, $mask);
 9088     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9089                              $dst$$Register, mask_len, mbt, vlen_enc);
 9090   %}
 9091   ins_pipe( pipe_slow );
 9092 %}
 9093 
 9094 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9095   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9096   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9097   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9098   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9099   ins_encode %{
 9100     int opcode = this->ideal_Opcode();
 9101     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9102     int mask_len = Matcher::vector_length(this, $mask);
 9103     int vlen_enc = vector_length_encoding(this, $mask);
 9104     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9105                              $dst$$Register, mask_len, mbt, vlen_enc);
 9106   %}
 9107   ins_pipe( pipe_slow );
 9108 %}
 9109 
 9110 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9111   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9112   match(Set dst (VectorMaskTrueCount mask));
 9113   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9114   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9115   ins_encode %{
 9116     int opcode = this->ideal_Opcode();
 9117     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9118     int mask_len = Matcher::vector_length(this, $mask);
 9119     int mask_size = mask_len * type2aelembytes(mbt);
 9120     int vlen_enc = vector_length_encoding(this, $mask);
 9121     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9122                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9123   %}
 9124   ins_pipe( pipe_slow );
 9125 %}
 9126 
 9127 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9128   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9129   match(Set dst (VectorMaskTrueCount mask));
 9130   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9131   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9132   ins_encode %{
 9133     int opcode = this->ideal_Opcode();
 9134     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9135     int mask_len = Matcher::vector_length(this, $mask);
 9136     int vlen_enc = vector_length_encoding(this, $mask);
 9137     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9138                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9139   %}
 9140   ins_pipe( pipe_slow );
 9141 %}
 9142 
 9143 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9144   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9145   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9146   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9147   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9148   ins_encode %{
 9149     int opcode = this->ideal_Opcode();
 9150     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9151     int mask_len = Matcher::vector_length(this, $mask);
 9152     int vlen_enc = vector_length_encoding(this, $mask);
 9153     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9154                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9155   %}
 9156   ins_pipe( pipe_slow );
 9157 %}
 9158 
 9159 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9160   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9161   match(Set dst (VectorMaskFirstTrue mask));
 9162   match(Set dst (VectorMaskLastTrue mask));
 9163   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9164   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9165   ins_encode %{
 9166     int opcode = this->ideal_Opcode();
 9167     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9168     int mask_len = Matcher::vector_length(this, $mask);
 9169     int mask_size = mask_len * type2aelembytes(mbt);
 9170     int vlen_enc = vector_length_encoding(this, $mask);
 9171     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9172                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9173   %}
 9174   ins_pipe( pipe_slow );
 9175 %}
 9176 
 9177 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9178   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9179   match(Set dst (VectorMaskFirstTrue mask));
 9180   match(Set dst (VectorMaskLastTrue mask));
 9181   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9182   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9183   ins_encode %{
 9184     int opcode = this->ideal_Opcode();
 9185     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9186     int mask_len = Matcher::vector_length(this, $mask);
 9187     int vlen_enc = vector_length_encoding(this, $mask);
 9188     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9189                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9190   %}
 9191   ins_pipe( pipe_slow );
 9192 %}
 9193 
 9194 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9195   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9196   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9197   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9198   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9199   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9200   ins_encode %{
 9201     int opcode = this->ideal_Opcode();
 9202     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9203     int mask_len = Matcher::vector_length(this, $mask);
 9204     int vlen_enc = vector_length_encoding(this, $mask);
 9205     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9206                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9207   %}
 9208   ins_pipe( pipe_slow );
 9209 %}
 9210 
 9211 // --------------------------------- Compress/Expand Operations ---------------------------
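      // CompressV and ExpandV share one rule; the ideal opcode selects the direction in
      // the macro assembler. CompressM compresses the opmask itself, using two scalar
      // temporaries to manipulate the mask bits in general-purpose registers.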
 9212 
 9213 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9214   match(Set dst (CompressV src mask));
 9215   match(Set dst (ExpandV src mask));
 9216   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9217   ins_encode %{
 9218     int opcode = this->ideal_Opcode();
 9219     int vector_len = vector_length_encoding(this);
 9220     BasicType bt  = Matcher::vector_element_basic_type(this);
 9221     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9222   %}
 9223   ins_pipe( pipe_slow );
 9224 %}
 9225 
 9226 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9227   match(Set dst (CompressM mask));
 9228   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9229   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9230   ins_encode %{
 9231     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9232     int mask_len = Matcher::vector_length(this);
 9233     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9234   %}
 9235   ins_pipe( pipe_slow );
 9236 %}
 9237 
 9238 #endif // _LP64
 9239 
 9240 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
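      // ReverseV reverses the bit order within each element. With GFNI, per-byte bit
      // reversal is a single Galois-field affine transform against the 0x8040201008040201
      // matrix loaded from the constant table; without GFNI a longer sequence using
      // vector and scalar temporaries is emitted. ReverseBytesV only reorders bytes.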
 9241 
 9242 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9243   predicate(!VM_Version::supports_gfni());
 9244   match(Set dst (ReverseV src));
 9245   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9246   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9247   ins_encode %{
 9248     int vec_enc = vector_length_encoding(this);
 9249     BasicType bt = Matcher::vector_element_basic_type(this);
 9250     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9251                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9252   %}
 9253   ins_pipe( pipe_slow );
 9254 %}
 9255 
 9256 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9257   predicate(VM_Version::supports_gfni());
 9258   match(Set dst (ReverseV src));
 9259   effect(TEMP dst, TEMP xtmp);
 9260   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9261   ins_encode %{
 9262     int vec_enc = vector_length_encoding(this);
 9263     BasicType bt  = Matcher::vector_element_basic_type(this);
 9264     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9265     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9266                                $xtmp$$XMMRegister);
 9267   %}
 9268   ins_pipe( pipe_slow );
 9269 %}
 9270 
 9271 instruct vreverse_byte_reg(vec dst, vec src) %{
 9272   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9273   match(Set dst (ReverseBytesV src));
 9274   effect(TEMP dst);
 9275   format %{ "vector_reverse_byte $dst, $src" %}
 9276   ins_encode %{
 9277     int vec_enc = vector_length_encoding(this);
 9278     BasicType bt = Matcher::vector_element_basic_type(this);
 9279     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9280   %}
 9281   ins_pipe( pipe_slow );
 9282 %}
 9283 
 9284 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9285   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9286   match(Set dst (ReverseBytesV src));
 9287   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9288   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9289   ins_encode %{
 9290     int vec_enc = vector_length_encoding(this);
 9291     BasicType bt = Matcher::vector_element_basic_type(this);
 9292     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9293                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9294   %}
 9295   ins_pipe( pipe_slow );
 9296 %}
 9297 
 9298 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
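      // CountLeadingZerosV prefers the AVX512CD leading-zero-count instructions for
      // int/long elements (optionally under a mask); short and byte elements need extra
      // temporaries, and pre-AVX512VL targets below 64 bytes use the AVX helper.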
 9299 
 9300 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9301   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9302                                               Matcher::vector_length_in_bytes(n->in(1))));
 9303   match(Set dst (CountLeadingZerosV src));
 9304   format %{ "vector_count_leading_zeros $dst, $src" %}
 9305   ins_encode %{
 9306      int vlen_enc = vector_length_encoding(this, $src);
 9307      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9308      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9309                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9310   %}
 9311   ins_pipe( pipe_slow );
 9312 %}
 9313 
 9314 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9315   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9316                                               Matcher::vector_length_in_bytes(n->in(1))));
 9317   match(Set dst (CountLeadingZerosV src mask));
 9318   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9319   ins_encode %{
 9320     int vlen_enc = vector_length_encoding(this, $src);
 9321     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9322     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9323     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9324                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9325   %}
 9326   ins_pipe( pipe_slow );
 9327 %}
 9328 
 9329 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9330   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9331             VM_Version::supports_avx512cd() &&
 9332             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9333   match(Set dst (CountLeadingZerosV src));
 9334   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9335   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9336   ins_encode %{
 9337     int vlen_enc = vector_length_encoding(this, $src);
 9338     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9339     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9340                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9341   %}
 9342   ins_pipe( pipe_slow );
 9343 %}
 9344 
 9345 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9346   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9347   match(Set dst (CountLeadingZerosV src));
 9348   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9349   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9350   ins_encode %{
 9351     int vlen_enc = vector_length_encoding(this, $src);
 9352     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9353     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9354                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9355                                        $rtmp$$Register, true, vlen_enc);
 9356   %}
 9357   ins_pipe( pipe_slow );
 9358 %}
 9359 
 9360 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9361   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9362             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9363   match(Set dst (CountLeadingZerosV src));
 9364   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9365   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9366   ins_encode %{
 9367     int vlen_enc = vector_length_encoding(this, $src);
 9368     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9369     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9370                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9371   %}
 9372   ins_pipe( pipe_slow );
 9373 %}
 9374 
 9375 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9376   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9377             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9378   match(Set dst (CountLeadingZerosV src));
 9379   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9380   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9381   ins_encode %{
 9382     int vlen_enc = vector_length_encoding(this, $src);
 9383     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9384     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9385                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9386   %}
 9387   ins_pipe( pipe_slow );
 9388 %}
 9389 
 9390 // ---------------------------------- Vector Masked Operations ------------------------------------
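      // The rules below match nodes whose last input is an opmask. evmasked_op dispatches
      // on the ideal opcode and element type and emits the EVEX form under $mask; the
      // boolean argument selects merge masking, so when it is true destination lanes with
      // a clear mask bit keep their previous contents.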
 9391 
 9392 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9393   match(Set dst (AddVB (Binary dst src2) mask));
 9394   match(Set dst (AddVS (Binary dst src2) mask));
 9395   match(Set dst (AddVI (Binary dst src2) mask));
 9396   match(Set dst (AddVL (Binary dst src2) mask));
 9397   match(Set dst (AddVF (Binary dst src2) mask));
 9398   match(Set dst (AddVD (Binary dst src2) mask));
 9399   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9400   ins_encode %{
 9401     int vlen_enc = vector_length_encoding(this);
 9402     BasicType bt = Matcher::vector_element_basic_type(this);
 9403     int opc = this->ideal_Opcode();
 9404     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9405                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9406   %}
 9407   ins_pipe( pipe_slow );
 9408 %}
 9409 
 9410 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9411   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9412   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9413   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9414   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9415   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9416   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9417   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9418   ins_encode %{
 9419     int vlen_enc = vector_length_encoding(this);
 9420     BasicType bt = Matcher::vector_element_basic_type(this);
 9421     int opc = this->ideal_Opcode();
 9422     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9423                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9424   %}
 9425   ins_pipe( pipe_slow );
 9426 %}
 9427 
 9428 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9429   match(Set dst (XorV (Binary dst src2) mask));
 9430   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9431   ins_encode %{
 9432     int vlen_enc = vector_length_encoding(this);
 9433     BasicType bt = Matcher::vector_element_basic_type(this);
 9434     int opc = this->ideal_Opcode();
 9435     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9436                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9437   %}
 9438   ins_pipe( pipe_slow );
 9439 %}
 9440 
 9441 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9442   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9443   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9444   ins_encode %{
 9445     int vlen_enc = vector_length_encoding(this);
 9446     BasicType bt = Matcher::vector_element_basic_type(this);
 9447     int opc = this->ideal_Opcode();
 9448     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9449                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9450   %}
 9451   ins_pipe( pipe_slow );
 9452 %}
 9453 
 9454 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9455   match(Set dst (OrV (Binary dst src2) mask));
 9456   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9457   ins_encode %{
 9458     int vlen_enc = vector_length_encoding(this);
 9459     BasicType bt = Matcher::vector_element_basic_type(this);
 9460     int opc = this->ideal_Opcode();
 9461     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9462                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9463   %}
 9464   ins_pipe( pipe_slow );
 9465 %}
 9466 
 9467 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9468   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9469   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9470   ins_encode %{
 9471     int vlen_enc = vector_length_encoding(this);
 9472     BasicType bt = Matcher::vector_element_basic_type(this);
 9473     int opc = this->ideal_Opcode();
 9474     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9475                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9476   %}
 9477   ins_pipe( pipe_slow );
 9478 %}
 9479 
 9480 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9481   match(Set dst (AndV (Binary dst src2) mask));
 9482   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9483   ins_encode %{
 9484     int vlen_enc = vector_length_encoding(this);
 9485     BasicType bt = Matcher::vector_element_basic_type(this);
 9486     int opc = this->ideal_Opcode();
 9487     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9488                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9489   %}
 9490   ins_pipe( pipe_slow );
 9491 %}
 9492 
 9493 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9494   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9495   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9496   ins_encode %{
 9497     int vlen_enc = vector_length_encoding(this);
 9498     BasicType bt = Matcher::vector_element_basic_type(this);
 9499     int opc = this->ideal_Opcode();
 9500     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9501                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9502   %}
 9503   ins_pipe( pipe_slow );
 9504 %}
 9505 
 9506 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9507   match(Set dst (SubVB (Binary dst src2) mask));
 9508   match(Set dst (SubVS (Binary dst src2) mask));
 9509   match(Set dst (SubVI (Binary dst src2) mask));
 9510   match(Set dst (SubVL (Binary dst src2) mask));
 9511   match(Set dst (SubVF (Binary dst src2) mask));
 9512   match(Set dst (SubVD (Binary dst src2) mask));
 9513   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9514   ins_encode %{
 9515     int vlen_enc = vector_length_encoding(this);
 9516     BasicType bt = Matcher::vector_element_basic_type(this);
 9517     int opc = this->ideal_Opcode();
 9518     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9519                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9520   %}
 9521   ins_pipe( pipe_slow );
 9522 %}
 9523 
 9524 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9525   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9526   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9527   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9528   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9529   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9530   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9531   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9532   ins_encode %{
 9533     int vlen_enc = vector_length_encoding(this);
 9534     BasicType bt = Matcher::vector_element_basic_type(this);
 9535     int opc = this->ideal_Opcode();
 9536     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9537                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9538   %}
 9539   ins_pipe( pipe_slow );
 9540 %}
 9541 
 9542 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9543   match(Set dst (MulVS (Binary dst src2) mask));
 9544   match(Set dst (MulVI (Binary dst src2) mask));
 9545   match(Set dst (MulVL (Binary dst src2) mask));
 9546   match(Set dst (MulVF (Binary dst src2) mask));
 9547   match(Set dst (MulVD (Binary dst src2) mask));
 9548   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9549   ins_encode %{
 9550     int vlen_enc = vector_length_encoding(this);
 9551     BasicType bt = Matcher::vector_element_basic_type(this);
 9552     int opc = this->ideal_Opcode();
 9553     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9554                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9555   %}
 9556   ins_pipe( pipe_slow );
 9557 %}
 9558 
 9559 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9560   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9561   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9562   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9563   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9564   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9565   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9566   ins_encode %{
 9567     int vlen_enc = vector_length_encoding(this);
 9568     BasicType bt = Matcher::vector_element_basic_type(this);
 9569     int opc = this->ideal_Opcode();
 9570     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9571                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9572   %}
 9573   ins_pipe( pipe_slow );
 9574 %}
 9575 
 9576 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9577   match(Set dst (SqrtVF dst mask));
 9578   match(Set dst (SqrtVD dst mask));
 9579   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9580   ins_encode %{
 9581     int vlen_enc = vector_length_encoding(this);
 9582     BasicType bt = Matcher::vector_element_basic_type(this);
 9583     int opc = this->ideal_Opcode();
 9584     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9585                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9586   %}
 9587   ins_pipe( pipe_slow );
 9588 %}
 9589 
 9590 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9591   match(Set dst (DivVF (Binary dst src2) mask));
 9592   match(Set dst (DivVD (Binary dst src2) mask));
 9593   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9594   ins_encode %{
 9595     int vlen_enc = vector_length_encoding(this);
 9596     BasicType bt = Matcher::vector_element_basic_type(this);
 9597     int opc = this->ideal_Opcode();
 9598     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9599                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9600   %}
 9601   ins_pipe( pipe_slow );
 9602 %}
 9603 
 9604 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9605   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9606   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9607   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9608   ins_encode %{
 9609     int vlen_enc = vector_length_encoding(this);
 9610     BasicType bt = Matcher::vector_element_basic_type(this);
 9611     int opc = this->ideal_Opcode();
 9612     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9613                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9614   %}
 9615   ins_pipe( pipe_slow );
 9616 %}
 9617 
 9618 
 9619 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9620   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9621   match(Set dst (RotateRightV (Binary dst shift) mask));
 9622   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9623   ins_encode %{
 9624     int vlen_enc = vector_length_encoding(this);
 9625     BasicType bt = Matcher::vector_element_basic_type(this);
 9626     int opc = this->ideal_Opcode();
 9627     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9628                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9629   %}
 9630   ins_pipe( pipe_slow );
 9631 %}
 9632 
 9633 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9634   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9635   match(Set dst (RotateRightV (Binary dst src2) mask));
 9636   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9637   ins_encode %{
 9638     int vlen_enc = vector_length_encoding(this);
 9639     BasicType bt = Matcher::vector_element_basic_type(this);
 9640     int opc = this->ideal_Opcode();
 9641     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9642                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9643   %}
 9644   ins_pipe( pipe_slow );
 9645 %}
 9646 
 9647 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9648   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9649   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9650   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9651   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9652   ins_encode %{
 9653     int vlen_enc = vector_length_encoding(this);
 9654     BasicType bt = Matcher::vector_element_basic_type(this);
 9655     int opc = this->ideal_Opcode();
 9656     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9657                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9658   %}
 9659   ins_pipe( pipe_slow );
 9660 %}
 9661 
 9662 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9663   predicate(!n->as_ShiftV()->is_var_shift());
 9664   match(Set dst (LShiftVS (Binary dst src2) mask));
 9665   match(Set dst (LShiftVI (Binary dst src2) mask));
 9666   match(Set dst (LShiftVL (Binary dst src2) mask));
 9667   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9668   ins_encode %{
 9669     int vlen_enc = vector_length_encoding(this);
 9670     BasicType bt = Matcher::vector_element_basic_type(this);
 9671     int opc = this->ideal_Opcode();
 9672     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9673                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9674   %}
 9675   ins_pipe( pipe_slow );
 9676 %}
 9677 
 9678 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9679   predicate(n->as_ShiftV()->is_var_shift());
 9680   match(Set dst (LShiftVS (Binary dst src2) mask));
 9681   match(Set dst (LShiftVI (Binary dst src2) mask));
 9682   match(Set dst (LShiftVL (Binary dst src2) mask));
 9683   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9684   ins_encode %{
 9685     int vlen_enc = vector_length_encoding(this);
 9686     BasicType bt = Matcher::vector_element_basic_type(this);
 9687     int opc = this->ideal_Opcode();
 9688     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9689                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9690   %}
 9691   ins_pipe( pipe_slow );
 9692 %}
 9693 
 9694 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9695   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9696   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9697   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9698   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9699   ins_encode %{
 9700     int vlen_enc = vector_length_encoding(this);
 9701     BasicType bt = Matcher::vector_element_basic_type(this);
 9702     int opc = this->ideal_Opcode();
 9703     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9704                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9705   %}
 9706   ins_pipe( pipe_slow );
 9707 %}
 9708 
 9709 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9710   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9711   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9712   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9713   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9714   ins_encode %{
 9715     int vlen_enc = vector_length_encoding(this);
 9716     BasicType bt = Matcher::vector_element_basic_type(this);
 9717     int opc = this->ideal_Opcode();
 9718     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9719                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9720   %}
 9721   ins_pipe( pipe_slow );
 9722 %}
 9723 
 9724 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9725   predicate(!n->as_ShiftV()->is_var_shift());
 9726   match(Set dst (RShiftVS (Binary dst src2) mask));
 9727   match(Set dst (RShiftVI (Binary dst src2) mask));
 9728   match(Set dst (RShiftVL (Binary dst src2) mask));
 9729   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9730   ins_encode %{
 9731     int vlen_enc = vector_length_encoding(this);
 9732     BasicType bt = Matcher::vector_element_basic_type(this);
 9733     int opc = this->ideal_Opcode();
 9734     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9735                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9736   %}
 9737   ins_pipe( pipe_slow );
 9738 %}
 9739 
 9740 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9741   predicate(n->as_ShiftV()->is_var_shift());
 9742   match(Set dst (RShiftVS (Binary dst src2) mask));
 9743   match(Set dst (RShiftVI (Binary dst src2) mask));
 9744   match(Set dst (RShiftVL (Binary dst src2) mask));
 9745   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9746   ins_encode %{
 9747     int vlen_enc = vector_length_encoding(this);
 9748     BasicType bt = Matcher::vector_element_basic_type(this);
 9749     int opc = this->ideal_Opcode();
 9750     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9751                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9752   %}
 9753   ins_pipe( pipe_slow );
 9754 %}
 9755 
 9756 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9757   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9758   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9759   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9760   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9761   ins_encode %{
 9762     int vlen_enc = vector_length_encoding(this);
 9763     BasicType bt = Matcher::vector_element_basic_type(this);
 9764     int opc = this->ideal_Opcode();
 9765     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9766                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9767   %}
 9768   ins_pipe( pipe_slow );
 9769 %}
 9770 
 9771 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9772   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9773   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9774   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9775   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9776   ins_encode %{
 9777     int vlen_enc = vector_length_encoding(this);
 9778     BasicType bt = Matcher::vector_element_basic_type(this);
 9779     int opc = this->ideal_Opcode();
 9780     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9781                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9782   %}
 9783   ins_pipe( pipe_slow );
 9784 %}
 9785 
 9786 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9787   predicate(!n->as_ShiftV()->is_var_shift());
 9788   match(Set dst (URShiftVS (Binary dst src2) mask));
 9789   match(Set dst (URShiftVI (Binary dst src2) mask));
 9790   match(Set dst (URShiftVL (Binary dst src2) mask));
 9791   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9792   ins_encode %{
 9793     int vlen_enc = vector_length_encoding(this);
 9794     BasicType bt = Matcher::vector_element_basic_type(this);
 9795     int opc = this->ideal_Opcode();
 9796     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9797                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9798   %}
 9799   ins_pipe( pipe_slow );
 9800 %}
 9801 
 9802 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9803   predicate(n->as_ShiftV()->is_var_shift());
 9804   match(Set dst (URShiftVS (Binary dst src2) mask));
 9805   match(Set dst (URShiftVI (Binary dst src2) mask));
 9806   match(Set dst (URShiftVL (Binary dst src2) mask));
 9807   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9808   ins_encode %{
 9809     int vlen_enc = vector_length_encoding(this);
 9810     BasicType bt = Matcher::vector_element_basic_type(this);
 9811     int opc = this->ideal_Opcode();
 9812     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9813                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9814   %}
 9815   ins_pipe( pipe_slow );
 9816 %}
 9817 
 9818 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9819   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9820   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9821   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9822   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9823   ins_encode %{
 9824     int vlen_enc = vector_length_encoding(this);
 9825     BasicType bt = Matcher::vector_element_basic_type(this);
 9826     int opc = this->ideal_Opcode();
 9827     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9828                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9829   %}
 9830   ins_pipe( pipe_slow );
 9831 %}
 9832 
 9833 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9834   match(Set dst (MaxV (Binary dst src2) mask));
 9835   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9836   ins_encode %{
 9837     int vlen_enc = vector_length_encoding(this);
 9838     BasicType bt = Matcher::vector_element_basic_type(this);
 9839     int opc = this->ideal_Opcode();
 9840     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9841                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9842   %}
 9843   ins_pipe( pipe_slow );
 9844 %}
 9845 
 9846 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9847   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9848   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9849   ins_encode %{
 9850     int vlen_enc = vector_length_encoding(this);
 9851     BasicType bt = Matcher::vector_element_basic_type(this);
 9852     int opc = this->ideal_Opcode();
 9853     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9854                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9855   %}
 9856   ins_pipe( pipe_slow );
 9857 %}
 9858 
 9859 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9860   match(Set dst (MinV (Binary dst src2) mask));
 9861   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9862   ins_encode %{
 9863     int vlen_enc = vector_length_encoding(this);
 9864     BasicType bt = Matcher::vector_element_basic_type(this);
 9865     int opc = this->ideal_Opcode();
 9866     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9867                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9868   %}
 9869   ins_pipe( pipe_slow );
 9870 %}
 9871 
 9872 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9873   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9874   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9875   ins_encode %{
 9876     int vlen_enc = vector_length_encoding(this);
 9877     BasicType bt = Matcher::vector_element_basic_type(this);
 9878     int opc = this->ideal_Opcode();
 9879     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9880                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9881   %}
 9882   ins_pipe( pipe_slow );
 9883 %}
 9884 
 9885 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9886   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9887   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9888   ins_encode %{
 9889     int vlen_enc = vector_length_encoding(this);
 9890     BasicType bt = Matcher::vector_element_basic_type(this);
 9891     int opc = this->ideal_Opcode();
 9892     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9893                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9894   %}
 9895   ins_pipe( pipe_slow );
 9896 %}
 9897 
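// Masked vector absolute value for byte/short/int/long elements. $dst is both the
// source and the destination of the masked operation.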
 9898 instruct vabs_masked(vec dst, kReg mask) %{
 9899   match(Set dst (AbsVB dst mask));
 9900   match(Set dst (AbsVS dst mask));
 9901   match(Set dst (AbsVI dst mask));
 9902   match(Set dst (AbsVL dst mask));
 9903   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
 9904   ins_encode %{
 9905     int vlen_enc = vector_length_encoding(this);
 9906     BasicType bt = Matcher::vector_element_basic_type(this);
 9907     int opc = this->ideal_Opcode();
 9908     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9909                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9910   %}
 9911   ins_pipe( pipe_slow );
 9912 %}
 9913 
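// Masked fused multiply-add for float/double vectors (requires FMA support, see the
// assert on UseFMA). The second variant takes the third input from memory.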
 9914 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
 9915   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
 9916   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
 9917   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9918   ins_encode %{
 9919     assert(UseFMA, "Needs FMA instructions support.");
 9920     int vlen_enc = vector_length_encoding(this);
 9921     BasicType bt = Matcher::vector_element_basic_type(this);
 9922     int opc = this->ideal_Opcode();
 9923     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9924                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
 9925   %}
 9926   ins_pipe( pipe_slow );
 9927 %}
 9928 
 9929 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
 9930   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
 9931   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
 9932   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9933   ins_encode %{
 9934     assert(UseFMA, "Needs FMA instructions support.");
 9935     int vlen_enc = vector_length_encoding(this);
 9936     BasicType bt = Matcher::vector_element_basic_type(this);
 9937     int opc = this->ideal_Opcode();
 9938     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9939                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
 9940   %}
 9941   ins_pipe( pipe_slow );
 9942 %}
 9943 
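// Masked vector compare producing an opmask result. Integral element types select a
// signed or unsigned evpcmp* variant from $cond; float/double use evcmpps/evcmppd.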
 9944 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
 9945   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
 9946   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
 9947   ins_encode %{
 9948     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 9949     int vlen_enc = vector_length_encoding(this, $src1);
 9950     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9951 
    // Select the comparison instruction based on the element type of the source vectors.
 9953     switch (src1_elem_bt) {
 9954       case T_BYTE: {
 9955         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9956         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9957         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9958         break;
 9959       }
 9960       case T_SHORT: {
 9961         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9962         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9963         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9964         break;
 9965       }
 9966       case T_INT: {
 9967         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9968         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9969         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9970         break;
 9971       }
 9972       case T_LONG: {
 9973         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9974         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9975         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9976         break;
 9977       }
 9978       case T_FLOAT: {
 9979         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9980         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9981         break;
 9982       }
 9983       case T_DOUBLE: {
 9984         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9985         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9986         break;
 9987       }
 9988       default: assert(false, "%s", type2name(src1_elem_bt)); break;
 9989     }
 9990   %}
 9991   ins_pipe( pipe_slow );
 9992 %}
 9993 
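// MaskAll: replicate the scalar condition in $src across all bits of the destination
// opmask register (vector lengths of up to 32 lanes).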
 9994 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
 9995   predicate(Matcher::vector_length(n) <= 32);
 9996   match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
 9998   ins_encode %{
 9999     int mask_len = Matcher::vector_length(this);
10000     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10001   %}
10002   ins_pipe( pipe_slow );
10003 %}
10004 
10005 #ifdef _LP64
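// Mask negation, matched as XorVMask with an all-ones MaskAll. Masks shorter than
// 8 bits need an opmask and a GPR temporary; the wider forms map to a single knot.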
10006 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10007   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10008   match(Set dst (XorVMask src (MaskAll cnt)));
10009   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10011   ins_encode %{
10012     uint masklen = Matcher::vector_length(this);
10013     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10014   %}
10015   ins_pipe( pipe_slow );
10016 %}
10017 
10018 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10019   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10020             (Matcher::vector_length(n) == 16) ||
10021             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10022   match(Set dst (XorVMask src (MaskAll cnt)));
10023   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10024   ins_encode %{
10025     uint masklen = Matcher::vector_length(this);
10026     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10027   %}
10028   ins_pipe( pipe_slow );
10029 %}
10030 
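// VectorLongToMask: materialize a long bit mask as a vector mask. The AVX forms
// (no kReg mask type) expand the bits into a vector register; the LE8 form does not
// use an XMM temporary (xnoreg is passed). The EVEX form simply moves the long into
// an opmask register.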
10031 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10032   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10033   match(Set dst (VectorLongToMask src));
10034   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10035   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10036   ins_encode %{
10037     int mask_len = Matcher::vector_length(this);
10038     int vec_enc  = vector_length_encoding(mask_len);
10039     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10040                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10041   %}
10042   ins_pipe( pipe_slow );
10043 %}
10044 
10046 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10047   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10048   match(Set dst (VectorLongToMask src));
10049   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10051   ins_encode %{
10052     int mask_len = Matcher::vector_length(this);
10053     assert(mask_len <= 32, "invalid mask length");
10054     int vec_enc  = vector_length_encoding(mask_len);
10055     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10056                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10057   %}
10058   ins_pipe( pipe_slow );
10059 %}
10060 
10061 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10062   predicate(n->bottom_type()->isa_vectmask());
10063   match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t! long to mask operation" %}
10065   ins_encode %{
10066     __ kmov($dst$$KRegister, $src$$Register);
10067   %}
10068   ins_pipe( pipe_slow );
10069 %}
10070 #endif
10071 
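// Bitwise AND/OR/XOR of two opmask registers. Without AVX512DQ the operation on
// masks shorter than 16 bits is widened to 16 bits.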
10072 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10073   match(Set dst (AndVMask src1 src2));
10074   match(Set dst (OrVMask src1 src2));
10075   match(Set dst (XorVMask src1 src2));
10076   effect(TEMP kscratch);
10077   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10078   ins_encode %{
10079     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10080     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10081     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
10082     uint masklen = Matcher::vector_length(this);
10083     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10084     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10085   %}
10086   ins_pipe( pipe_slow );
10087 %}
10088 
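// Masked ternary logic (vpternlog): $func is the 8-bit truth table applied to $dst,
// $src2 and $src3; lanes not selected by $mask keep their destination value.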
10089 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10090   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10091   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10092   ins_encode %{
10093     int vlen_enc = vector_length_encoding(this);
10094     BasicType bt = Matcher::vector_element_basic_type(this);
10095     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10096                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10097   %}
10098   ins_pipe( pipe_slow );
10099 %}
10100 
instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10102   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10103   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10104   ins_encode %{
10105     int vlen_enc = vector_length_encoding(this);
10106     BasicType bt = Matcher::vector_element_basic_type(this);
10107     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10108                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10109   %}
10110   ins_pipe( pipe_slow );
10111 %}
10112 
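// CastVV only re-types a vector or mask value for the register allocator; it emits
// no code (size 0, empty encoding).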
10113 instruct castMM(kReg dst)
10114 %{
10115   match(Set dst (CastVV dst));
10116 
10117   size(0);
10118   format %{ "# castVV of $dst" %}
10119   ins_encode(/* empty encoding */);
10120   ins_cost(0);
10121   ins_pipe(empty);
10122 %}
10123 
10124 instruct castVV(vec dst)
10125 %{
10126   match(Set dst (CastVV dst));
10127 
10128   size(0);
10129   format %{ "# castVV of $dst" %}
10130   ins_encode(/* empty encoding */);
10131   ins_cost(0);
10132   ins_pipe(empty);
10133 %}
10134 
10135 instruct castVVLeg(legVec dst)
10136 %{
10137   match(Set dst (CastVV dst));
10138 
10139   size(0);
10140   format %{ "# castVV of $dst" %}
10141   ins_encode(/* empty encoding */);
10142   ins_cost(0);
10143   ins_pipe(empty);
10144 %}
10145 
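// IsInfinite checks using vfpclassss/vfpclasssd: imm8 0x18 selects the +Inf (0x08)
// and -Inf (0x10) classes; the resulting opmask bit is copied into the integer result.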
10146 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10147 %{
10148   match(Set dst (IsInfiniteF src));
10149   effect(TEMP ktmp, KILL cr);
10150   format %{ "float_class_check $dst, $src" %}
10151   ins_encode %{
10152     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10153     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10154   %}
10155   ins_pipe(pipe_slow);
10156 %}
10157 
10158 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10159 %{
10160   match(Set dst (IsInfiniteD src));
10161   effect(TEMP ktmp, KILL cr);
10162   format %{ "double_class_check $dst, $src" %}
10163   ins_encode %{
10164     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10165     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10166   %}
10167   ins_pipe(pipe_slow);
10168 %}