//
// Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
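//
// As a worked illustration of the format above, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares a Save-On-Call slot (for both the normal and C calling
// conventions) of ideal type float, with opcode encoding 0, mapped to the
// first 32-bit word of xmm0; XMM0b..XMM0p name the remaining words of the
// same register.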

// XMM registers.  512-bit registers of 16 words each, labeled a-p.
// Word a in each register holds a Float, words a-b hold a Double.
// Whole registers are used by SSE4.2 intrinsics, array copy stubs and
// superword operations (see the UseSSE42Intrinsics, UseXMMForArrayCopy
// and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No XMM register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile);
//              XMM0-XMM3 might hold parameters
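//
// Note that every XMM register below is declared (SOC, SOC): the register
// allocator assumes all XMM state is killed across call sites.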

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
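//
// K0 is intentionally absent: in EVEX encodings the k0 opmask means
// "no masking", so it is reserved rather than made allocatable here.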


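// alloc_class groups registers into chunks for the register allocator;
// note that the flags chunk (chunk3, below) must come last.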
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);
// The flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

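// reg_class_dynamic selects between two register classes at runtime: the
// first class is used when the trailing predicate holds, the second otherwise.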
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
// definitions necessary in the rest of the architecture description.
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // The deopt handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
  switch (bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   C2_MacroAssembler _masm(&cbuf);
 1314   address base = __ start_a_stub(size_exception_handler());
 1315   if (base == nullptr) {
 1316     ciEnv::current()->record_failure("CodeCache is full");
 1317     return 0;  // CodeBuffer::expand failed
 1318   }
 1319   int offset = __ offset();
 1320   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1321   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1322   __ end_a_stub();
 1323   return offset;
 1324 }
 1325 
 1326 // Emit deopt handler code.
 1327 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1328 
 1329   // Note that the code buffer's insts_mark is always relative to insts.
 1330   // That's why we must use the macroassembler to generate a handler.
 1331   C2_MacroAssembler _masm(&cbuf);
 1332   address base = __ start_a_stub(size_deopt_handler());
 1333   if (base == nullptr) {
 1334     ciEnv::current()->record_failure("CodeCache is full");
 1335     return 0;  // CodeBuffer::expand failed
 1336   }
 1337   int offset = __ offset();
 1338 
 1339 #ifdef _LP64
 1340   address the_pc = (address) __ pc();
 1341   Label next;
 1342   // push a "the_pc" on the stack without destroying any registers
 1343   // as they all may be live.
 1344 
 1345   // push address of "next"
 1346   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1347   __ bind(next);
 1348   // adjust it so it matches "the_pc"
 1349   __ subptr(Address(rsp, 0), __ offset() - offset);
 1350 #else
 1351   InternalAddress here(__ pc());
 1352   __ pushptr(here.addr(), noreg);
 1353 #endif
 1354 
 1355   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1356   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1357   __ end_a_stub();
 1358   return offset;
 1359 }
 1360 
 1361 Assembler::Width widthForType(BasicType bt) {
 1362   if (bt == T_BYTE) {
 1363     return Assembler::B;
 1364   } else if (bt == T_SHORT) {
 1365     return Assembler::W;
 1366   } else if (bt == T_INT) {
 1367     return Assembler::D;
 1368   } else {
 1369     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1370     return Assembler::Q;
 1371   }
 1372 }
 1373 
 1374 //=============================================================================
 1375 
 1376   // Float masks come from different places depending on platform.
 1377 #ifdef _LP64
 1378   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1379   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1380   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1381   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1382 #else
 1383   static address float_signmask()  { return (address)float_signmask_pool; }
 1384   static address float_signflip()  { return (address)float_signflip_pool; }
 1385   static address double_signmask() { return (address)double_signmask_pool; }
 1386   static address double_signflip() { return (address)double_signflip_pool; }
 1387 #endif
 1388   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1389   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1390   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1391   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1392   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1393   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1394   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1395   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1396   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1397   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1398   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1399   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1400   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1401   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1402   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1403 
 1404 //=============================================================================
 1405 bool Matcher::match_rule_supported(int opcode) {
 1406   if (!has_match_rule(opcode)) {
 1407     return false; // no match rule present
 1408   }
 1409   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1410   switch (opcode) {
 1411     case Op_AbsVL:
 1412     case Op_StoreVectorScatter:
 1413       if (UseAVX < 3) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountI:
 1418     case Op_PopCountL:
 1419       if (!UsePopCountInstruction) {
 1420         return false;
 1421       }
 1422       break;
    case Op_PopCountVI:
    case Op_PopCountVL:
      if (UseAVX < 2) {
        return false;
      }
      break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
      if (!VM_Version::supports_avx512dq()) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       if (!VM_Version::supports_cx8()) {
 1514         return false;
 1515       }
 1516       break;
    case Op_StrIndexOf:
    case Op_StrIndexOfChar:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
 1527     case Op_OnSpinWait:
      if (!VM_Version::supports_on_spin_wait()) {
 1529         return false;
 1530       }
 1531       break;
 1532     case Op_MulVB:
 1533     case Op_LShiftVB:
 1534     case Op_RShiftVB:
 1535     case Op_URShiftVB:
 1536     case Op_VectorInsert:
 1537     case Op_VectorLoadMask:
 1538     case Op_VectorStoreMask:
 1539     case Op_VectorBlend:
 1540       if (UseSSE < 4) {
 1541         return false;
 1542       }
 1543       break;
 1544 #ifdef _LP64
 1545     case Op_MaxD:
 1546     case Op_MaxF:
 1547     case Op_MinD:
 1548     case Op_MinF:
 1549       if (UseAVX < 1) { // enabled for AVX only
 1550         return false;
 1551       }
 1552       break;
 1553 #endif
 1554     case Op_CacheWB:
 1555     case Op_CacheWBPreSync:
 1556     case Op_CacheWBPostSync:
 1557       if (!VM_Version::supports_data_cache_line_flush()) {
 1558         return false;
 1559       }
 1560       break;
 1561     case Op_ExtractB:
 1562     case Op_ExtractL:
 1563     case Op_ExtractI:
 1564     case Op_RoundDoubleMode:
 1565       if (UseSSE < 4) {
 1566         return false;
 1567       }
 1568       break;
 1569     case Op_RoundDoubleModeV:
      if (!VM_Version::supports_avx()) {
 1571         return false; // 128bit vroundpd is not available
 1572       }
 1573       break;
 1574     case Op_LoadVectorGather:
 1575       if (UseAVX < 2) {
 1576         return false;
 1577       }
 1578       break;
 1579     case Op_FmaF:
 1580     case Op_FmaD:
 1581     case Op_FmaVD:
 1582     case Op_FmaVF:
 1583       if (!UseFMA) {
 1584         return false;
 1585       }
 1586       break;
 1587     case Op_MacroLogicV:
 1588       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1589         return false;
 1590       }
 1591       break;
 1592 
 1593     case Op_VectorCmpMasked:
 1594     case Op_VectorMaskGen:
      if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1596         return false;
 1597       }
 1598       break;
 1599     case Op_VectorMaskFirstTrue:
 1600     case Op_VectorMaskLastTrue:
 1601     case Op_VectorMaskTrueCount:
 1602     case Op_VectorMaskToLong:
 1603       if (!is_LP64 || UseAVX < 1) {
        return false;
 1605       }
 1606       break;
 1607     case Op_RoundF:
 1608     case Op_RoundD:
 1609       if (!is_LP64) {
 1610         return false;
 1611       }
 1612       break;
    case Op_CopySignD:
    case Op_CopySignF:
      if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_avx512vl()) {
        return false;
      }
      break;
 1622 #ifndef _LP64
 1623     case Op_AddReductionVF:
 1624     case Op_AddReductionVD:
 1625     case Op_MulReductionVF:
 1626     case Op_MulReductionVD:
 1627       if (UseSSE < 1) { // requires at least SSE
 1628         return false;
 1629       }
 1630       break;
 1631     case Op_MulAddVS2VI:
 1632     case Op_RShiftVL:
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if (UseSSE < 2) {
 1636         return false;
 1637       }
 1638       break;
 1639 #endif // !LP64
 1640     case Op_CompressBits:
 1641       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1642         return false;
 1643       }
 1644       break;
 1645     case Op_ExpandBits:
 1646       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_SignumF:
 1651       if (UseSSE < 1) {
 1652         return false;
 1653       }
 1654       break;
 1655     case Op_SignumD:
 1656       if (UseSSE < 2) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_CompressM:
 1661       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1662         return false;
 1663       }
 1664       break;
 1665     case Op_CompressV:
 1666     case Op_ExpandV:
 1667       if (!VM_Version::supports_avx512vl()) {
 1668         return false;
 1669       }
 1670       break;
 1671     case Op_SqrtF:
 1672       if (UseSSE < 1) {
 1673         return false;
 1674       }
 1675       break;
 1676     case Op_SqrtD:
 1677 #ifdef _LP64
 1678       if (UseSSE < 2) {
 1679         return false;
 1680       }
 1681 #else
 1682       // x86_32.ad has a special match rule for SqrtD.
 1683       // Together with common x86 rules, this handles all UseSSE cases.
 1684 #endif
 1685       break;
 1686     case Op_ConvF2HF:
 1687     case Op_ConvHF2F:
 1688       if (!VM_Version::supports_float16()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_VectorCastF2HF:
 1693     case Op_VectorCastHF2F:
 1694       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1695         return false;
 1696       }
 1697       break;
 1698   }
 1699   return true;  // Match rules are supported by default.
 1700 }
 1701 
 1702 //------------------------------------------------------------------------
 1703 
 1704 static inline bool is_pop_count_instr_target(BasicType bt) {
 1705   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1706          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1707 }
 1708 
 1709 bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1710   return match_rule_supported_vector(opcode, vlen, bt);
 1711 }
 1712 
 1713 // Identify extra cases that we might want to provide match rules for vector nodes and
 1714 // other intrinsics guarded with vector length (vlen) and element type (bt).
 1715 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1716   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1717   if (!match_rule_supported(opcode)) {
 1718     return false;
 1719   }
 1720   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1721   //   * SSE2 supports 128bit vectors for all types;
 1722   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1723   //   * AVX2 supports 256bit vectors for all types;
 1724   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1725   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1726   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1727   // And MaxVectorSize is taken into account as well.
 1728   if (!vector_size_supported(bt, vlen)) {
 1729     return false;
 1730   }
 1731   // Special cases which require vector length follow:
 1732   //   * implementation limitations
 1733   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1734   //   * 128bit vroundpd instruction is present only in AVX1
 1735   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
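  // Worked example: vlen == 16 elements of T_FLOAT gives
  // 16 * 4 bytes * 8 = 512 bits, i.e. a full ZMM register.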
 1736   switch (opcode) {
 1737     case Op_AbsVF:
 1738     case Op_NegVF:
      if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
 1740         return false; // 512bit vandps and vxorps are not available
 1741       }
 1742       break;
 1743     case Op_AbsVD:
 1744     case Op_NegVD:
      if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
 1746         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1747       }
 1748       break;
 1749     case Op_RotateRightV:
 1750     case Op_RotateLeftV:
 1751       if (bt != T_INT && bt != T_LONG) {
 1752         return false;
 1753       } // fallthrough
 1754     case Op_MacroLogicV:
 1755       if (!VM_Version::supports_evex() ||
 1756           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1757         return false;
 1758       }
 1759       break;
 1760     case Op_ClearArray:
 1761     case Op_VectorMaskGen:
 1762     case Op_VectorCmpMasked:
 1763       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1764         return false;
 1765       }
 1766       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1767         return false;
 1768       }
 1769       break;
 1770     case Op_LoadVectorMasked:
 1771     case Op_StoreVectorMasked:
 1772       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1773         return false;
 1774       }
 1775       break;
 1776     case Op_MaxV:
 1777     case Op_MinV:
 1778       if (UseSSE < 4 && is_integral_type(bt)) {
 1779         return false;
 1780       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
 1790       break;
 1791     case Op_CallLeafVector:
 1792       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1793         return false;
 1794       }
 1795       break;
 1796     case Op_AddReductionVI:
 1797       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1798         return false;
 1799       }
 1800       // fallthrough
 1801     case Op_AndReductionV:
 1802     case Op_OrReductionV:
 1803     case Op_XorReductionV:
 1804       if (is_subword_type(bt) && (UseSSE < 4)) {
 1805         return false;
 1806       }
 1807 #ifndef _LP64
 1808       if (bt == T_BYTE || bt == T_LONG) {
 1809         return false;
 1810       }
 1811 #endif
 1812       break;
 1813 #ifndef _LP64
 1814     case Op_VectorInsert:
 1815       if (bt == T_LONG || bt == T_DOUBLE) {
 1816         return false;
 1817       }
 1818       break;
 1819 #endif
 1820     case Op_MinReductionV:
 1821     case Op_MaxReductionV:
 1822       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1823         return false;
 1824       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1825         return false;
 1826       }
 1827       // Float/Double intrinsics enabled for AVX family.
 1828       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1829         return false;
 1830       }
 1831       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1832         return false;
 1833       }
 1834 #ifndef _LP64
 1835       if (bt == T_BYTE || bt == T_LONG) {
 1836         return false;
 1837       }
 1838 #endif
 1839       break;
 1840     case Op_VectorTest:
 1841       if (UseSSE < 4) {
 1842         return false; // Implementation limitation
 1843       } else if (size_in_bits < 32) {
 1844         return false; // Implementation limitation
 1845       }
 1846       break;
 1847     case Op_VectorLoadShuffle:
 1848     case Op_VectorRearrange:
      if (vlen == 2) {
 1850         return false; // Implementation limitation due to how shuffle is loaded
 1851       } else if (size_in_bits == 256 && UseAVX < 2) {
 1852         return false; // Implementation limitation
 1853       }
 1854       break;
 1855     case Op_VectorLoadMask:
 1856     case Op_VectorMaskCast:
 1857       if (size_in_bits == 256 && UseAVX < 2) {
 1858         return false; // Implementation limitation
 1859       }
 1860       // fallthrough
 1861     case Op_VectorStoreMask:
 1862       if (vlen == 2) {
 1863         return false; // Implementation limitation
 1864       }
 1865       break;
 1866     case Op_PopulateIndex:
 1867       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_VectorCastB2X:
 1872     case Op_VectorCastS2X:
 1873     case Op_VectorCastI2X:
 1874       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1875         return false;
 1876       }
 1877       break;
 1878     case Op_VectorCastL2X:
 1879       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1880         return false;
 1881       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1882         return false;
 1883       }
 1884       break;
 1885     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256bit vectors.
 1889         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1890         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1891           return false;
 1892         }
 1893       }
 1894       // fallthrough
 1895     case Op_VectorCastD2X:
 1896       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1897         return false;
 1898       }
 1899       break;
 1900     case Op_VectorCastF2HF:
 1901     case Op_VectorCastHF2F:
 1902       if (!VM_Version::supports_f16c() &&
 1903          ((!VM_Version::supports_evex() ||
 1904          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1905         return false;
 1906       }
 1907       break;
 1908     case Op_RoundVD:
 1909       if (!VM_Version::supports_avx512dq()) {
 1910         return false;
 1911       }
 1912       break;
 1913     case Op_MulReductionVI:
 1914       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1915         return false;
 1916       }
 1917       break;
 1918     case Op_LoadVectorGatherMasked:
 1919     case Op_StoreVectorScatterMasked:
 1920     case Op_StoreVectorScatter:
 1921       if (is_subword_type(bt)) {
 1922         return false;
 1923       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1924         return false;
 1925       }
 1926       // fallthrough
 1927     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1929         return false;
 1930       }
 1931       break;
 1932     case Op_MaskAll:
 1933       if (!VM_Version::supports_evex()) {
 1934         return false;
 1935       }
 1936       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1937         return false;
 1938       }
 1939       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1940         return false;
 1941       }
 1942       break;
 1943     case Op_VectorMaskCmp:
 1944       if (vlen < 2 || size_in_bits < 32) {
 1945         return false;
 1946       }
 1947       break;
 1948     case Op_CompressM:
 1949       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1950         return false;
 1951       }
 1952       break;
 1953     case Op_CompressV:
 1954     case Op_ExpandV:
 1955       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1956         return false;
 1957       }
      if (size_in_bits < 128) {
 1959         return false;
 1960       }
 1961       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1962         return false;
 1963       }
 1964       break;
 1965     case Op_VectorLongToMask:
 1966       if (UseAVX < 1 || !is_LP64) {
 1967         return false;
 1968       }
 1969       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_SignumVD:
 1974     case Op_SignumVF:
 1975       if (UseAVX < 1) {
 1976         return false;
 1977       }
 1978       break;
 1979     case Op_PopCountVI:
 1980     case Op_PopCountVL: {
 1981         if (!is_pop_count_instr_target(bt) &&
 1982             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1983           return false;
 1984         }
 1985       }
 1986       break;
 1987     case Op_ReverseV:
 1988     case Op_ReverseBytesV:
 1989       if (UseAVX < 2) {
 1990         return false;
 1991       }
 1992       break;
 1993     case Op_CountTrailingZerosV:
 1994     case Op_CountLeadingZerosV:
 1995       if (UseAVX < 2) {
 1996         return false;
 1997       }
 1998       break;
 1999   }
 2000   return true;  // Per default match rules are supported.
 2001 }
 2002 
 2003 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a
  // pattern based on the IR opcode. Most of the unary/binary/ternary masked
  // operations share the IR nodes of their non-masked counterparts, with the
  // mask edge being the differentiator. This routine therefore does a strict
  // check on the existence of masked operation patterns by returning false for
  // every opcode apart from the ones whose masked instruction patterns are
  // defined in this file.
 2010   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2011     return false;
 2012   }
 2013 
 2014   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2015   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2016   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2017     return false;
 2018   }
  switch (opcode) {
    // Unary masked operations
    case Op_AbsVB:
    case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
 2026     case Op_AbsVI:
 2027     case Op_AbsVL:
 2028       return true;
 2029 
 2030     // Ternary masked operations
 2031     case Op_FmaVF:
 2032     case Op_FmaVD:
 2033       return true;
 2034 
 2035     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2037         return false;
 2038       }
 2039       return true;
 2040 
 2041     // Binary masked operations
 2042     case Op_AddVB:
 2043     case Op_AddVS:
 2044     case Op_SubVB:
 2045     case Op_SubVS:
 2046     case Op_MulVS:
 2047     case Op_LShiftVS:
 2048     case Op_RShiftVS:
 2049     case Op_URShiftVS:
 2050       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2051       if (!VM_Version::supports_avx512bw()) {
 2052         return false;  // Implementation limitation
 2053       }
 2054       return true;
 2055 
 2056     case Op_MulVL:
 2057       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2058       if (!VM_Version::supports_avx512dq()) {
 2059         return false;  // Implementation limitation
 2060       }
 2061       return true;
 2062 
 2063     case Op_AndV:
 2064     case Op_OrV:
 2065     case Op_XorV:
 2066     case Op_RotateRightV:
 2067     case Op_RotateLeftV:
 2068       if (bt != T_INT && bt != T_LONG) {
 2069         return false; // Implementation limitation
 2070       }
 2071       return true;
 2072 
 2073     case Op_VectorLoadMask:
 2074       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2075       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2076         return false;
 2077       }
 2078       return true;
 2079 
 2080     case Op_AddVI:
 2081     case Op_AddVL:
 2082     case Op_AddVF:
 2083     case Op_AddVD:
 2084     case Op_SubVI:
 2085     case Op_SubVL:
 2086     case Op_SubVF:
 2087     case Op_SubVD:
 2088     case Op_MulVI:
 2089     case Op_MulVF:
 2090     case Op_MulVD:
 2091     case Op_DivVF:
 2092     case Op_DivVD:
 2093     case Op_SqrtVF:
 2094     case Op_SqrtVD:
 2095     case Op_LShiftVI:
 2096     case Op_LShiftVL:
 2097     case Op_RShiftVI:
 2098     case Op_RShiftVL:
 2099     case Op_URShiftVI:
 2100     case Op_URShiftVL:
 2101     case Op_LoadVectorMasked:
 2102     case Op_StoreVectorMasked:
 2103     case Op_LoadVectorGatherMasked:
 2104     case Op_StoreVectorScatterMasked:
 2105       return true;
 2106 
 2107     case Op_MaxV:
 2108     case Op_MinV:
 2109       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2110         return false; // Implementation limitation
 2111       }
 2112       if (is_floating_point_type(bt)) {
 2113         return false; // Implementation limitation
 2114       }
 2115       return true;
 2116 
 2117     case Op_VectorMaskCmp:
 2118       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2119         return false; // Implementation limitation
 2120       }
 2121       return true;
 2122 
 2123     case Op_VectorRearrange:
 2124       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2125         return false; // Implementation limitation
 2126       }
 2127       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2128         return false; // Implementation limitation
 2129       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2130         return false; // Implementation limitation
 2131       }
 2132       return true;
 2133 
 2134     // Binary Logical operations
 2135     case Op_AndVMask:
 2136     case Op_OrVMask:
 2137     case Op_XorVMask:
 2138       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2139         return false; // Implementation limitation
 2140       }
 2141       return true;
 2142 
 2143     case Op_PopCountVI:
 2144     case Op_PopCountVL:
 2145       if (!is_pop_count_instr_target(bt)) {
 2146         return false;
 2147       }
 2148       return true;
 2149 
 2150     case Op_MaskAll:
 2151       return true;
 2152 
    case Op_CountLeadingZerosV:
      if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
        return true;
      }
      // fallthrough
 2157     default:
 2158       return false;
 2159   }
 2160 }
 2161 
 2162 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2163   return false;
 2164 }
 2165 
 2166 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2167   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2168   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2169   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2170       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2171     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2172     return new legVecZOper();
 2173   }
 2174   if (legacy) {
 2175     switch (ideal_reg) {
 2176       case Op_VecS: return new legVecSOper();
 2177       case Op_VecD: return new legVecDOper();
 2178       case Op_VecX: return new legVecXOper();
 2179       case Op_VecY: return new legVecYOper();
 2180       case Op_VecZ: return new legVecZOper();
 2181     }
 2182   } else {
 2183     switch (ideal_reg) {
 2184       case Op_VecS: return new vecSOper();
 2185       case Op_VecD: return new vecDOper();
 2186       case Op_VecX: return new vecXOper();
 2187       case Op_VecY: return new vecYOper();
 2188       case Op_VecZ: return new vecZOper();
 2189     }
 2190   }
 2191   ShouldNotReachHere();
 2192   return nullptr;
 2193 }
 2194 
 2195 bool Matcher::is_reg2reg_move(MachNode* m) {
 2196   switch (m->rule()) {
 2197     case MoveVec2Leg_rule:
 2198     case MoveLeg2Vec_rule:
 2199     case MoveF2VL_rule:
 2200     case MoveF2LEG_rule:
 2201     case MoveVL2F_rule:
 2202     case MoveLEG2F_rule:
 2203     case MoveD2VL_rule:
 2204     case MoveD2LEG_rule:
 2205     case MoveVL2D_rule:
 2206     case MoveLEG2D_rule:
 2207       return true;
 2208     default:
 2209       return false;
 2210   }
 2211 }
 2212 
 2213 bool Matcher::is_generic_vector(MachOper* opnd) {
 2214   switch (opnd->opcode()) {
 2215     case VEC:
 2216     case LEGVEC:
 2217       return true;
 2218     default:
 2219       return false;
 2220   }
 2221 }
 2222 
 2223 //------------------------------------------------------------------------
 2224 
 2225 const RegMask* Matcher::predicate_reg_mask(void) {
 2226   return &_VECTMASK_REG_mask;
 2227 }
 2228 
 2229 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2230   return new TypeVectMask(elemTy, length);
 2231 }
 2232 
 2233 // Max vector size in bytes. 0 if not supported.
 2234 int Matcher::vector_width_in_bytes(BasicType bt) {
 2235   assert(is_java_primitive(bt), "only primitive type vectors");
 2236   if (UseSSE < 2) return 0;
 2237   // SSE2 supports 128bit vectors for all types.
 2238   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
 2240   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2241   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2242   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2243     size = (UseAVX > 2) ? 64 : 32;
 2244   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2245     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2246   // Use flag to limit vector size.
  size = MIN2(size, (int)MaxVectorSize);
 2248   // Minimum 2 values in vector (or 4 for bytes).
  switch (bt) {
  case T_DOUBLE:
  case T_LONG:
    if (size < 16) return 0;
    break;
  case T_FLOAT:
  case T_INT:
    if (size < 8) return 0;
    break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
  default:
    ShouldNotReachHere();
  }
 2273   return size;
 2274 }
 2275 
 2276 // Limits on vector size (number of elements) loaded into vector.
 2277 int Matcher::max_vector_size(const BasicType bt) {
 2278   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2279 }
 2280 int Matcher::min_vector_size(const BasicType bt) {
 2281   int max_size = max_vector_size(bt);
  // The smallest vector that can be loaded is 4 bytes: 4 elements for
  // byte-sized types, 2 elements for everything else.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors to support SVML double64 calls.
  if (bt == T_DOUBLE) {
    size = 1;
  }
  return MIN2(size, max_size);
 2289 }
 2290 
 2291 int Matcher::superword_max_vector_size(const BasicType bt) {
  // By default, limit the max vector size for auto-vectorization on
  // Cascade Lake to 256 bits (32 bytes).
 2294   if (VM_Version::is_default_intel_cascade_lake()) {
 2295     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2296   }
 2297   return Matcher::max_vector_size(bt);
 2298 }
 2299 
 2300 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2301   return -1;
 2302 }
 2303 
 2304 // Vector ideal reg corresponding to specified size in bytes
 2305 uint Matcher::vector_ideal_reg(int size) {
 2306   assert(MaxVectorSize >= size, "");
 2307   switch(size) {
 2308     case  4: return Op_VecS;
 2309     case  8: return Op_VecD;
 2310     case 16: return Op_VecX;
 2311     case 32: return Op_VecY;
 2312     case 64: return Op_VecZ;
 2313   }
 2314   ShouldNotReachHere();
 2315   return 0;
 2316 }
 2317 
 2318 // Check for shift by small constant as well
 2319 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2320   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2321       shift->in(2)->get_int() <= 3 &&
 2322       // Are there other uses besides address expressions?
 2323       !matcher->is_visited(shift)) {
 2324     address_visited.set(shift->_idx); // Flag as address_visited
 2325     mstack.push(shift->in(2), Matcher::Visit);
 2326     Node *conv = shift->in(1);
 2327 #ifdef _LP64
    // Allow the Matcher to match rules that bypass the ConvI2L operation
    // for an array index on LP64 if the index value is known to be
    // non-negative.
 2331     if (conv->Opcode() == Op_ConvI2L &&
 2332         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2333         // Are there other uses besides address expressions?
 2334         !matcher->is_visited(conv)) {
 2335       address_visited.set(conv->_idx); // Flag as address_visited
 2336       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2337     } else
 2338 #endif
 2339       mstack.push(conv, Matcher::Pre_Visit);
 2340     return true;
 2341   }
 2342   return false;
 2343 }
 2344 
// This function identifies sub-graphs in which a 'load' node is an input
// to two different nodes and can be matched with BMI instructions such as
// blsi and blsr.
// For example, b = -a[i] & a[i] can be matched to blsi r32, m32.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where both LoadL* edges
// refer to the same node.
 2351 //
 2352 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2353 // This is a temporary solution until we make DAGs expressible in ADL.
 2354 template<typename ConType>
 2355 class FusedPatternMatcher {
 2356   Node* _op1_node;
 2357   Node* _mop_node;
 2358   int _con_op;
 2359 
 2360   static int match_next(Node* n, int next_op, int next_op_idx) {
 2361     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2362       return -1;
 2363     }
 2364 
    if (next_op_idx == -1) { // n is commutative, try both input orders
 2366       if (n->in(1)->Opcode() == next_op) {
 2367         return 1;
 2368       } else if (n->in(2)->Opcode() == next_op) {
 2369         return 2;
 2370       }
 2371     } else {
 2372       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2373       if (n->in(next_op_idx)->Opcode() == next_op) {
 2374         return next_op_idx;
 2375       }
 2376     }
 2377     return -1;
 2378   }
 2379 
 2380  public:
 2381   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2382     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2383 
 2384   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2385              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2386              typename ConType::NativeType con_value) {
 2387     if (_op1_node->Opcode() != op1) {
 2388       return false;
 2389     }
 2390     if (_mop_node->outcnt() > 2) {
 2391       return false;
 2392     }
 2393     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2394     if (op1_op2_idx == -1) {
 2395       return false;
 2396     }
 2397     // Memory operation must be the other edge
 2398     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2399 
 2400     // Check that the mop node is really what we want
 2401     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2402       Node* op2_node = _op1_node->in(op1_op2_idx);
 2403       if (op2_node->outcnt() > 1) {
 2404         return false;
 2405       }
 2406       assert(op2_node->Opcode() == op2, "Should be");
 2407       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2408       if (op2_con_idx == -1) {
 2409         return false;
 2410       }
 2411       // Memory operation must be the other edge
 2412       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2413       // Check that the memory operation is the same node
 2414       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2415         // Now check the constant
 2416         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2417         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2418           return true;
 2419         }
 2420       }
 2421     }
 2422     return false;
 2423   }
 2424 };
 2425 
 2426 static bool is_bmi_pattern(Node* n, Node* m) {
 2427   assert(UseBMI1Instructions, "sanity");
 2428   if (n != nullptr && m != nullptr) {
 2429     if (m->Opcode() == Op_LoadI) {
 2430       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2431       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2432              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2433              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2434     } else if (m->Opcode() == Op_LoadL) {
 2435       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2436       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2437              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2438              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2439     }
 2440   }
 2441   return false;
 2442 }
 2443 
 2444 // Should the matcher clone input 'm' of node 'n'?
 2445 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2446   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2447   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2448     mstack.push(m, Visit);
 2449     return true;
 2450   }
 2451   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2452     mstack.push(m, Visit);           // m = ShiftCntV
 2453     return true;
 2454   }
 2455   return false;
 2456 }
 2457 
 2458 // Should the Matcher clone shifts on addressing modes, expecting them
 2459 // to be subsumed into complex addressing expressions or compute them
 2460 // into registers?
 2461 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2462   Node *off = m->in(AddPNode::Offset);
 2463   if (off->is_Con()) {
 2464     address_visited.test_set(m->_idx); // Flag as address_visited
 2465     Node *adr = m->in(AddPNode::Address);
 2466 
    // Intel can handle 2 adds in an addressing mode.
    // AtomicAdd is not an addressing expression; it is cheap to detect
    // because its base input is TOP.
 2470     if (adr->is_AddP() &&
 2471         !adr->in(AddPNode::Base)->is_top() &&
 2472         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2473         // Are there other uses besides address expressions?
 2474         !is_visited(adr)) {
 2475       address_visited.set(adr->_idx); // Flag as address_visited
 2476       Node *shift = adr->in(AddPNode::Offset);
 2477       if (!clone_shift(shift, this, mstack, address_visited)) {
 2478         mstack.push(shift, Pre_Visit);
 2479       }
 2480       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2481       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2482     } else {
 2483       mstack.push(adr, Pre_Visit);
 2484     }
 2485 
 2486     // Clone X+offset as it also folds into most addressing expressions
 2487     mstack.push(off, Visit);
 2488     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2489     return true;
 2490   } else if (clone_shift(off, this, mstack, address_visited)) {
 2491     address_visited.test_set(m->_idx); // Flag as address_visited
 2492     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2493     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2494     return true;
 2495   }
 2496   return false;
 2497 }
 2498 
 2499 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2500   switch (bt) {
 2501     case BoolTest::eq:
 2502       return Assembler::eq;
 2503     case BoolTest::ne:
 2504       return Assembler::neq;
 2505     case BoolTest::le:
 2506     case BoolTest::ule:
 2507       return Assembler::le;
 2508     case BoolTest::ge:
 2509     case BoolTest::uge:
 2510       return Assembler::nlt;
 2511     case BoolTest::lt:
 2512     case BoolTest::ult:
 2513       return Assembler::lt;
 2514     case BoolTest::gt:
 2515     case BoolTest::ugt:
 2516       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2518   }
 2519 }
 2520 
 2521 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2522   switch (bt) {
 2523   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2524   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2525   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2526   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2527   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2528   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2529   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2530   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2531   }
 2532 }
 2533 
 2534 // Helper methods for MachSpillCopyNode::implementation().
static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  assert(ireg == Op_VecS || // 32bit vector
         ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
         "no non-adjacent vector moves");
 2541   if (cbuf) {
 2542     C2_MacroAssembler _masm(cbuf);
 2543     switch (ireg) {
 2544     case Op_VecS: // copy whole register
 2545     case Op_VecD:
 2546     case Op_VecX:
 2547 #ifndef _LP64
 2548       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2549 #else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2555 #endif
 2556       break;
 2557     case Op_VecY:
 2558 #ifndef _LP64
 2559       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2560 #else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2566 #endif
 2567       break;
 2568     case Op_VecZ:
 2569       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2570       break;
 2571     default:
 2572       ShouldNotReachHere();
 2573     }
 2574 #ifndef PRODUCT
 2575   } else {
 2576     switch (ireg) {
 2577     case Op_VecS:
 2578     case Op_VecD:
 2579     case Op_VecX:
 2580       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2581       break;
 2582     case Op_VecY:
 2583     case Op_VecZ:
 2584       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2585       break;
 2586     default:
 2587       ShouldNotReachHere();
 2588     }
 2589 #endif
 2590   }
 2591 }
 2592 
void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2595   if (cbuf) {
 2596     C2_MacroAssembler _masm(cbuf);
 2597     if (is_load) {
 2598       switch (ireg) {
 2599       case Op_VecS:
 2600         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2601         break;
 2602       case Op_VecD:
 2603         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2604         break;
 2605       case Op_VecX:
 2606 #ifndef _LP64
 2607         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2608 #else
 2609         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2610           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2611         } else {
 2612           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2614         }
 2615 #endif
 2616         break;
 2617       case Op_VecY:
 2618 #ifndef _LP64
 2619         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2620 #else
 2621         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2622           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2623         } else {
 2624           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2626         }
 2627 #endif
 2628         break;
 2629       case Op_VecZ:
 2630         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2631         break;
 2632       default:
 2633         ShouldNotReachHere();
 2634       }
 2635     } else { // store
 2636       switch (ireg) {
 2637       case Op_VecS:
 2638         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2639         break;
 2640       case Op_VecD:
 2641         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2642         break;
 2643       case Op_VecX:
 2644 #ifndef _LP64
 2645         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2646 #else
 2647         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2648           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2649         }
 2650         else {
 2651           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2652         }
 2653 #endif
 2654         break;
 2655       case Op_VecY:
 2656 #ifndef _LP64
 2657         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2658 #else
 2659         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2660           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2661         }
 2662         else {
 2663           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2664         }
 2665 #endif
 2666         break;
 2667       case Op_VecZ:
 2668         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2669         break;
 2670       default:
 2671         ShouldNotReachHere();
 2672       }
 2673     }
 2674 #ifndef PRODUCT
 2675   } else {
 2676     if (is_load) {
 2677       switch (ireg) {
 2678       case Op_VecS:
 2679         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2680         break;
 2681       case Op_VecD:
 2682         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2683         break;
      case Op_VecX:
 2685         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2686         break;
 2687       case Op_VecY:
 2688       case Op_VecZ:
 2689         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2690         break;
 2691       default:
 2692         ShouldNotReachHere();
 2693       }
 2694     } else { // store
 2695       switch (ireg) {
 2696       case Op_VecS:
 2697         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2698         break;
 2699       case Op_VecD:
 2700         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2701         break;
      case Op_VecX:
 2703         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2704         break;
 2705       case Op_VecY:
 2706       case Op_VecZ:
 2707         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2708         break;
 2709       default:
 2710         ShouldNotReachHere();
 2711       }
 2712     }
 2713 #endif
 2714   }
 2715 }
 2716 
 2717 template <class T>
 2718 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2719   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2720   jvalue ele;
 2721   switch (bt) {
 2722     case T_BYTE:   ele.b = con; break;
 2723     case T_SHORT:  ele.s = con; break;
 2724     case T_INT:    ele.i = con; break;
 2725     case T_LONG:   ele.j = con; break;
 2726     case T_FLOAT:  ele.f = con; break;
 2727     case T_DOUBLE: ele.d = con; break;
 2728     default: ShouldNotReachHere();
 2729   }
 2730   for (int i = 0; i < len; i++) {
 2731     val->append(ele);
 2732   }
 2733   return val;
 2734 }
 2735 
 2736 static inline jlong high_bit_set(BasicType bt) {
 2737   switch (bt) {
 2738     case T_BYTE:  return 0x8080808080808080;
 2739     case T_SHORT: return 0x8000800080008000;
 2740     case T_INT:   return 0x8000000080000000;
 2741     case T_LONG:  return 0x8000000000000000;
 2742     default:
 2743       ShouldNotReachHere();
 2744       return 0;
 2745   }
 2746 }
 2747 
 2748 #ifndef PRODUCT
 2749   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2750     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2751   }
 2752 #endif
 2753 
 2754   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2755     C2_MacroAssembler _masm(&cbuf);
 2756     __ nop(_count);
 2757   }
 2758 
 2759   uint MachNopNode::size(PhaseRegAlloc*) const {
 2760     return _count;
 2761   }
 2762 
 2763 #ifndef PRODUCT
 2764   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2765     st->print("# breakpoint");
 2766   }
 2767 #endif
 2768 
 2769   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2770     C2_MacroAssembler _masm(&cbuf);
 2771     __ int3();
 2772   }
 2773 
 2774   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2775     return MachNode::size(ra_);
 2776   }
 2777 
 2778 %}
 2779 
 2780 encode %{
 2781 
 2782   enc_class call_epilog %{
 2783     C2_MacroAssembler _masm(&cbuf);
 2784     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find magic cookie on stack
 2786       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2787       Label L;
 2788       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2789       __ jccb(Assembler::equal, L);
 2790       // Die if stack mismatch
 2791       __ int3();
 2792       __ bind(L);
 2793     }
    if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
      if (!_method->signature()->returns_null_free_inline_type()) {
        // The last return value is not set by the callee but is used to pass
        // IsInit information to compiled code. Search for the corresponding
        // projection, get the register, and emit code that initializes it.
 2799         uint con = (tf()->range_cc()->cnt() - 1);
 2800         for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2801           ProjNode* proj = fast_out(i)->as_Proj();
 2802           if (proj->_con == con) {
 2803             // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2804             OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2805             VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2806             Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2807             __ testq(rax, rax);
 2808             __ setb(Assembler::notZero, toReg);
 2809             __ movzbl(toReg, toReg);
 2810             if (reg->is_stack()) {
 2811               int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2812               __ movq(Address(rsp, st_off), toReg);
 2813             }
 2814             break;
 2815           }
 2816         }
 2817       }
 2818       if (return_value_is_used()) {
        // An inline type is returned as fields in multiple registers.
        // rax either contains an oop (if the inline type is buffered) or a
        // pointer to the corresponding InlineKlass with the lowest bit set.
        // Zero rax if the lowest bit is set, so that C2 can use the oop after
        // null checking.
        // rax &= (rax & 1) - 1
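        // Worked example: if rax holds an InlineKlass pointer (lowest bit set),
        // (rax & 1) - 1 == 0, so rax & 0 == 0. If rax holds an oop (lowest bit
        // clear), (rax & 1) - 1 == -1 (all ones), leaving rax unchanged.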
 2824         __ movptr(rscratch1, rax);
 2825         __ andptr(rscratch1, 0x1);
 2826         __ subptr(rscratch1, 0x1);
 2827         __ andptr(rax, rscratch1);
 2828       }
 2829     }
 2830   %}
 2831 
 2832 %}
 2833 
// Operands for bound floating-point register arguments
 2835 operand rxmm0() %{
 2836   constraint(ALLOC_IN_RC(xmm0_reg));
 2837   match(VecX);
  format %{ %}
 2839   interface(REG_INTER);
 2840 %}
 2841 
 2842 //----------OPERANDS-----------------------------------------------------------
// Operand definitions must precede instruction definitions for correct
// parsing in the ADLC, because operands constitute user-defined types
// that are used in instruction definitions.
 2846 
 2847 // Vectors
 2848 
 2849 // Dummy generic vector class. Should be used for all vector operands.
 2850 // Replaced with vec[SDXYZ] during post-selection pass.
 2851 operand vec() %{
 2852   constraint(ALLOC_IN_RC(dynamic));
 2853   match(VecX);
 2854   match(VecY);
 2855   match(VecZ);
 2856   match(VecS);
 2857   match(VecD);
 2858 
 2859   format %{ %}
 2860   interface(REG_INTER);
 2861 %}
 2862 
 2863 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2864 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2865 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2866 // runtime code generation via reg_class_dynamic.
 2867 operand legVec() %{
 2868   constraint(ALLOC_IN_RC(dynamic));
 2869   match(VecX);
 2870   match(VecY);
 2871   match(VecZ);
 2872   match(VecS);
 2873   match(VecD);
 2874 
 2875   format %{ %}
 2876   interface(REG_INTER);
 2877 %}
 2878 
 2879 // Replaces vec during post-selection cleanup. See above.
 2880 operand vecS() %{
 2881   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2882   match(VecS);
 2883 
 2884   format %{ %}
 2885   interface(REG_INTER);
 2886 %}
 2887 
 2888 // Replaces legVec during post-selection cleanup. See above.
 2889 operand legVecS() %{
 2890   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2891   match(VecS);
 2892 
 2893   format %{ %}
 2894   interface(REG_INTER);
 2895 %}
 2896 
 2897 // Replaces vec during post-selection cleanup. See above.
 2898 operand vecD() %{
 2899   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2900   match(VecD);
 2901 
 2902   format %{ %}
 2903   interface(REG_INTER);
 2904 %}
 2905 
 2906 // Replaces legVec during post-selection cleanup. See above.
 2907 operand legVecD() %{
 2908   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2909   match(VecD);
 2910 
 2911   format %{ %}
 2912   interface(REG_INTER);
 2913 %}
 2914 
 2915 // Replaces vec during post-selection cleanup. See above.
 2916 operand vecX() %{
 2917   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2918   match(VecX);
 2919 
 2920   format %{ %}
 2921   interface(REG_INTER);
 2922 %}
 2923 
 2924 // Replaces legVec during post-selection cleanup. See above.
 2925 operand legVecX() %{
 2926   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2927   match(VecX);
 2928 
 2929   format %{ %}
 2930   interface(REG_INTER);
 2931 %}
 2932 
 2933 // Replaces vec during post-selection cleanup. See above.
 2934 operand vecY() %{
 2935   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2936   match(VecY);
 2937 
 2938   format %{ %}
 2939   interface(REG_INTER);
 2940 %}
 2941 
 2942 // Replaces legVec during post-selection cleanup. See above.
 2943 operand legVecY() %{
 2944   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2945   match(VecY);
 2946 
 2947   format %{ %}
 2948   interface(REG_INTER);
 2949 %}
 2950 
 2951 // Replaces vec during post-selection cleanup. See above.
 2952 operand vecZ() %{
 2953   constraint(ALLOC_IN_RC(vectorz_reg));
 2954   match(VecZ);
 2955 
 2956   format %{ %}
 2957   interface(REG_INTER);
 2958 %}
 2959 
 2960 // Replaces legVec during post-selection cleanup. See above.
 2961 operand legVecZ() %{
 2962   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2963   match(VecZ);
 2964 
 2965   format %{ %}
 2966   interface(REG_INTER);
 2967 %}
 2968 
 2969 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2970 
 2971 // ============================================================================
 2972 
 2973 instruct ShouldNotReachHere() %{
 2974   match(Halt);
 2975   format %{ "stop\t# ShouldNotReachHere" %}
 2976   ins_encode %{
 2977     if (is_reachable()) {
 2978       __ stop(_halt_reason);
 2979     }
 2980   %}
 2981   ins_pipe(pipe_slow);
 2982 %}
 2983 
 2984 // ============================================================================
 2985 
 2986 instruct addF_reg(regF dst, regF src) %{
 2987   predicate((UseSSE>=1) && (UseAVX == 0));
 2988   match(Set dst (AddF dst src));
 2989 
 2990   format %{ "addss   $dst, $src" %}
 2991   ins_cost(150);
 2992   ins_encode %{
 2993     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2994   %}
 2995   ins_pipe(pipe_slow);
 2996 %}
 2997 
 2998 instruct addF_mem(regF dst, memory src) %{
 2999   predicate((UseSSE>=1) && (UseAVX == 0));
 3000   match(Set dst (AddF dst (LoadF src)));
 3001 
 3002   format %{ "addss   $dst, $src" %}
 3003   ins_cost(150);
 3004   ins_encode %{
 3005     __ addss($dst$$XMMRegister, $src$$Address);
 3006   %}
 3007   ins_pipe(pipe_slow);
 3008 %}
 3009 
 3010 instruct addF_imm(regF dst, immF con) %{
 3011   predicate((UseSSE>=1) && (UseAVX == 0));
 3012   match(Set dst (AddF dst con));
 3013   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3014   ins_cost(150);
 3015   ins_encode %{
 3016     __ addss($dst$$XMMRegister, $constantaddress($con));
 3017   %}
 3018   ins_pipe(pipe_slow);
 3019 %}
 3020 
 3021 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3022   predicate(UseAVX > 0);
 3023   match(Set dst (AddF src1 src2));
 3024 
 3025   format %{ "vaddss  $dst, $src1, $src2" %}
 3026   ins_cost(150);
 3027   ins_encode %{
 3028     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3029   %}
 3030   ins_pipe(pipe_slow);
 3031 %}
 3032 
 3033 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3034   predicate(UseAVX > 0);
 3035   match(Set dst (AddF src1 (LoadF src2)));
 3036 
 3037   format %{ "vaddss  $dst, $src1, $src2" %}
 3038   ins_cost(150);
 3039   ins_encode %{
 3040     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3041   %}
 3042   ins_pipe(pipe_slow);
 3043 %}
 3044 
 3045 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3046   predicate(UseAVX > 0);
 3047   match(Set dst (AddF src con));
 3048 
 3049   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3050   ins_cost(150);
 3051   ins_encode %{
 3052     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3053   %}
 3054   ins_pipe(pipe_slow);
 3055 %}
 3056 
 3057 instruct addD_reg(regD dst, regD src) %{
 3058   predicate((UseSSE>=2) && (UseAVX == 0));
 3059   match(Set dst (AddD dst src));
 3060 
 3061   format %{ "addsd   $dst, $src" %}
 3062   ins_cost(150);
 3063   ins_encode %{
 3064     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3065   %}
 3066   ins_pipe(pipe_slow);
 3067 %}
 3068 
 3069 instruct addD_mem(regD dst, memory src) %{
 3070   predicate((UseSSE>=2) && (UseAVX == 0));
 3071   match(Set dst (AddD dst (LoadD src)));
 3072 
 3073   format %{ "addsd   $dst, $src" %}
 3074   ins_cost(150);
 3075   ins_encode %{
 3076     __ addsd($dst$$XMMRegister, $src$$Address);
 3077   %}
 3078   ins_pipe(pipe_slow);
 3079 %}
 3080 
 3081 instruct addD_imm(regD dst, immD con) %{
 3082   predicate((UseSSE>=2) && (UseAVX == 0));
 3083   match(Set dst (AddD dst con));
 3084   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3085   ins_cost(150);
 3086   ins_encode %{
 3087     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3088   %}
 3089   ins_pipe(pipe_slow);
 3090 %}
 3091 
 3092 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3093   predicate(UseAVX > 0);
 3094   match(Set dst (AddD src1 src2));
 3095 
 3096   format %{ "vaddsd  $dst, $src1, $src2" %}
 3097   ins_cost(150);
 3098   ins_encode %{
 3099     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3100   %}
 3101   ins_pipe(pipe_slow);
 3102 %}
 3103 
 3104 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3105   predicate(UseAVX > 0);
 3106   match(Set dst (AddD src1 (LoadD src2)));
 3107 
 3108   format %{ "vaddsd  $dst, $src1, $src2" %}
 3109   ins_cost(150);
 3110   ins_encode %{
 3111     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3112   %}
 3113   ins_pipe(pipe_slow);
 3114 %}
 3115 
 3116 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3117   predicate(UseAVX > 0);
 3118   match(Set dst (AddD src con));
 3119 
 3120   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3121   ins_cost(150);
 3122   ins_encode %{
 3123     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3124   %}
 3125   ins_pipe(pipe_slow);
 3126 %}
 3127 
 3128 instruct subF_reg(regF dst, regF src) %{
 3129   predicate((UseSSE>=1) && (UseAVX == 0));
 3130   match(Set dst (SubF dst src));
 3131 
 3132   format %{ "subss   $dst, $src" %}
 3133   ins_cost(150);
 3134   ins_encode %{
 3135     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3136   %}
 3137   ins_pipe(pipe_slow);
 3138 %}
 3139 
 3140 instruct subF_mem(regF dst, memory src) %{
 3141   predicate((UseSSE>=1) && (UseAVX == 0));
 3142   match(Set dst (SubF dst (LoadF src)));
 3143 
 3144   format %{ "subss   $dst, $src" %}
 3145   ins_cost(150);
 3146   ins_encode %{
 3147     __ subss($dst$$XMMRegister, $src$$Address);
 3148   %}
 3149   ins_pipe(pipe_slow);
 3150 %}
 3151 
 3152 instruct subF_imm(regF dst, immF con) %{
 3153   predicate((UseSSE>=1) && (UseAVX == 0));
 3154   match(Set dst (SubF dst con));
 3155   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3156   ins_cost(150);
 3157   ins_encode %{
 3158     __ subss($dst$$XMMRegister, $constantaddress($con));
 3159   %}
 3160   ins_pipe(pipe_slow);
 3161 %}
 3162 
 3163 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3164   predicate(UseAVX > 0);
 3165   match(Set dst (SubF src1 src2));
 3166 
 3167   format %{ "vsubss  $dst, $src1, $src2" %}
 3168   ins_cost(150);
 3169   ins_encode %{
 3170     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3171   %}
 3172   ins_pipe(pipe_slow);
 3173 %}
 3174 
 3175 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3176   predicate(UseAVX > 0);
 3177   match(Set dst (SubF src1 (LoadF src2)));
 3178 
 3179   format %{ "vsubss  $dst, $src1, $src2" %}
 3180   ins_cost(150);
 3181   ins_encode %{
 3182     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3183   %}
 3184   ins_pipe(pipe_slow);
 3185 %}
 3186 
 3187 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3188   predicate(UseAVX > 0);
 3189   match(Set dst (SubF src con));
 3190 
 3191   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3192   ins_cost(150);
 3193   ins_encode %{
 3194     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3195   %}
 3196   ins_pipe(pipe_slow);
 3197 %}
 3198 
 3199 instruct subD_reg(regD dst, regD src) %{
 3200   predicate((UseSSE>=2) && (UseAVX == 0));
 3201   match(Set dst (SubD dst src));
 3202 
 3203   format %{ "subsd   $dst, $src" %}
 3204   ins_cost(150);
 3205   ins_encode %{
 3206     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3207   %}
 3208   ins_pipe(pipe_slow);
 3209 %}
 3210 
 3211 instruct subD_mem(regD dst, memory src) %{
 3212   predicate((UseSSE>=2) && (UseAVX == 0));
 3213   match(Set dst (SubD dst (LoadD src)));
 3214 
 3215   format %{ "subsd   $dst, $src" %}
 3216   ins_cost(150);
 3217   ins_encode %{
 3218     __ subsd($dst$$XMMRegister, $src$$Address);
 3219   %}
 3220   ins_pipe(pipe_slow);
 3221 %}
 3222 
 3223 instruct subD_imm(regD dst, immD con) %{
 3224   predicate((UseSSE>=2) && (UseAVX == 0));
 3225   match(Set dst (SubD dst con));
 3226   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3227   ins_cost(150);
 3228   ins_encode %{
 3229     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3230   %}
 3231   ins_pipe(pipe_slow);
 3232 %}
 3233 
 3234 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3235   predicate(UseAVX > 0);
 3236   match(Set dst (SubD src1 src2));
 3237 
 3238   format %{ "vsubsd  $dst, $src1, $src2" %}
 3239   ins_cost(150);
 3240   ins_encode %{
 3241     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3242   %}
 3243   ins_pipe(pipe_slow);
 3244 %}
 3245 
 3246 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3247   predicate(UseAVX > 0);
 3248   match(Set dst (SubD src1 (LoadD src2)));
 3249 
 3250   format %{ "vsubsd  $dst, $src1, $src2" %}
 3251   ins_cost(150);
 3252   ins_encode %{
 3253     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3254   %}
 3255   ins_pipe(pipe_slow);
 3256 %}
 3257 
 3258 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3259   predicate(UseAVX > 0);
 3260   match(Set dst (SubD src con));
 3261 
 3262   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3263   ins_cost(150);
 3264   ins_encode %{
 3265     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3266   %}
 3267   ins_pipe(pipe_slow);
 3268 %}
 3269 
 3270 instruct mulF_reg(regF dst, regF src) %{
 3271   predicate((UseSSE>=1) && (UseAVX == 0));
 3272   match(Set dst (MulF dst src));
 3273 
 3274   format %{ "mulss   $dst, $src" %}
 3275   ins_cost(150);
 3276   ins_encode %{
 3277     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3278   %}
 3279   ins_pipe(pipe_slow);
 3280 %}
 3281 
 3282 instruct mulF_mem(regF dst, memory src) %{
 3283   predicate((UseSSE>=1) && (UseAVX == 0));
 3284   match(Set dst (MulF dst (LoadF src)));
 3285 
 3286   format %{ "mulss   $dst, $src" %}
 3287   ins_cost(150);
 3288   ins_encode %{
 3289     __ mulss($dst$$XMMRegister, $src$$Address);
 3290   %}
 3291   ins_pipe(pipe_slow);
 3292 %}
 3293 
 3294 instruct mulF_imm(regF dst, immF con) %{
 3295   predicate((UseSSE>=1) && (UseAVX == 0));
 3296   match(Set dst (MulF dst con));
 3297   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3298   ins_cost(150);
 3299   ins_encode %{
 3300     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3301   %}
 3302   ins_pipe(pipe_slow);
 3303 %}
 3304 
 3305 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3306   predicate(UseAVX > 0);
 3307   match(Set dst (MulF src1 src2));
 3308 
 3309   format %{ "vmulss  $dst, $src1, $src2" %}
 3310   ins_cost(150);
 3311   ins_encode %{
 3312     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3313   %}
 3314   ins_pipe(pipe_slow);
 3315 %}
 3316 
 3317 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3318   predicate(UseAVX > 0);
 3319   match(Set dst (MulF src1 (LoadF src2)));
 3320 
 3321   format %{ "vmulss  $dst, $src1, $src2" %}
 3322   ins_cost(150);
 3323   ins_encode %{
 3324     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3325   %}
 3326   ins_pipe(pipe_slow);
 3327 %}
 3328 
 3329 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3330   predicate(UseAVX > 0);
 3331   match(Set dst (MulF src con));
 3332 
 3333   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3334   ins_cost(150);
 3335   ins_encode %{
 3336     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3337   %}
 3338   ins_pipe(pipe_slow);
 3339 %}
 3340 
 3341 instruct mulD_reg(regD dst, regD src) %{
 3342   predicate((UseSSE>=2) && (UseAVX == 0));
 3343   match(Set dst (MulD dst src));
 3344 
 3345   format %{ "mulsd   $dst, $src" %}
 3346   ins_cost(150);
 3347   ins_encode %{
 3348     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3349   %}
 3350   ins_pipe(pipe_slow);
 3351 %}
 3352 
 3353 instruct mulD_mem(regD dst, memory src) %{
 3354   predicate((UseSSE>=2) && (UseAVX == 0));
 3355   match(Set dst (MulD dst (LoadD src)));
 3356 
 3357   format %{ "mulsd   $dst, $src" %}
 3358   ins_cost(150);
 3359   ins_encode %{
 3360     __ mulsd($dst$$XMMRegister, $src$$Address);
 3361   %}
 3362   ins_pipe(pipe_slow);
 3363 %}
 3364 
 3365 instruct mulD_imm(regD dst, immD con) %{
 3366   predicate((UseSSE>=2) && (UseAVX == 0));
 3367   match(Set dst (MulD dst con));
 3368   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3369   ins_cost(150);
 3370   ins_encode %{
 3371     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3372   %}
 3373   ins_pipe(pipe_slow);
 3374 %}
 3375 
 3376 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3377   predicate(UseAVX > 0);
 3378   match(Set dst (MulD src1 src2));
 3379 
 3380   format %{ "vmulsd  $dst, $src1, $src2" %}
 3381   ins_cost(150);
 3382   ins_encode %{
 3383     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3384   %}
 3385   ins_pipe(pipe_slow);
 3386 %}
 3387 
 3388 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3389   predicate(UseAVX > 0);
 3390   match(Set dst (MulD src1 (LoadD src2)));
 3391 
 3392   format %{ "vmulsd  $dst, $src1, $src2" %}
 3393   ins_cost(150);
 3394   ins_encode %{
 3395     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3396   %}
 3397   ins_pipe(pipe_slow);
 3398 %}
 3399 
 3400 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3401   predicate(UseAVX > 0);
 3402   match(Set dst (MulD src con));
 3403 
 3404   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3405   ins_cost(150);
 3406   ins_encode %{
 3407     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3408   %}
 3409   ins_pipe(pipe_slow);
 3410 %}
 3411 
 3412 instruct divF_reg(regF dst, regF src) %{
 3413   predicate((UseSSE>=1) && (UseAVX == 0));
 3414   match(Set dst (DivF dst src));
 3415 
 3416   format %{ "divss   $dst, $src" %}
 3417   ins_cost(150);
 3418   ins_encode %{
 3419     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3420   %}
 3421   ins_pipe(pipe_slow);
 3422 %}
 3423 
 3424 instruct divF_mem(regF dst, memory src) %{
 3425   predicate((UseSSE>=1) && (UseAVX == 0));
 3426   match(Set dst (DivF dst (LoadF src)));
 3427 
 3428   format %{ "divss   $dst, $src" %}
 3429   ins_cost(150);
 3430   ins_encode %{
 3431     __ divss($dst$$XMMRegister, $src$$Address);
 3432   %}
 3433   ins_pipe(pipe_slow);
 3434 %}
 3435 
 3436 instruct divF_imm(regF dst, immF con) %{
 3437   predicate((UseSSE>=1) && (UseAVX == 0));
 3438   match(Set dst (DivF dst con));
 3439   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3440   ins_cost(150);
 3441   ins_encode %{
 3442     __ divss($dst$$XMMRegister, $constantaddress($con));
 3443   %}
 3444   ins_pipe(pipe_slow);
 3445 %}
 3446 
 3447 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3448   predicate(UseAVX > 0);
 3449   match(Set dst (DivF src1 src2));
 3450 
 3451   format %{ "vdivss  $dst, $src1, $src2" %}
 3452   ins_cost(150);
 3453   ins_encode %{
 3454     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3455   %}
 3456   ins_pipe(pipe_slow);
 3457 %}
 3458 
 3459 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3460   predicate(UseAVX > 0);
 3461   match(Set dst (DivF src1 (LoadF src2)));
 3462 
 3463   format %{ "vdivss  $dst, $src1, $src2" %}
 3464   ins_cost(150);
 3465   ins_encode %{
 3466     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3467   %}
 3468   ins_pipe(pipe_slow);
 3469 %}
 3470 
 3471 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3472   predicate(UseAVX > 0);
 3473   match(Set dst (DivF src con));
 3474 
 3475   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3476   ins_cost(150);
 3477   ins_encode %{
 3478     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3479   %}
 3480   ins_pipe(pipe_slow);
 3481 %}
 3482 
 3483 instruct divD_reg(regD dst, regD src) %{
 3484   predicate((UseSSE>=2) && (UseAVX == 0));
 3485   match(Set dst (DivD dst src));
 3486 
 3487   format %{ "divsd   $dst, $src" %}
 3488   ins_cost(150);
 3489   ins_encode %{
 3490     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3491   %}
 3492   ins_pipe(pipe_slow);
 3493 %}
 3494 
 3495 instruct divD_mem(regD dst, memory src) %{
 3496   predicate((UseSSE>=2) && (UseAVX == 0));
 3497   match(Set dst (DivD dst (LoadD src)));
 3498 
 3499   format %{ "divsd   $dst, $src" %}
 3500   ins_cost(150);
 3501   ins_encode %{
 3502     __ divsd($dst$$XMMRegister, $src$$Address);
 3503   %}
 3504   ins_pipe(pipe_slow);
 3505 %}
 3506 
 3507 instruct divD_imm(regD dst, immD con) %{
 3508   predicate((UseSSE>=2) && (UseAVX == 0));
 3509   match(Set dst (DivD dst con));
 3510   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3511   ins_cost(150);
 3512   ins_encode %{
 3513     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3514   %}
 3515   ins_pipe(pipe_slow);
 3516 %}
 3517 
 3518 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3519   predicate(UseAVX > 0);
 3520   match(Set dst (DivD src1 src2));
 3521 
 3522   format %{ "vdivsd  $dst, $src1, $src2" %}
 3523   ins_cost(150);
 3524   ins_encode %{
 3525     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3526   %}
 3527   ins_pipe(pipe_slow);
 3528 %}
 3529 
 3530 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3531   predicate(UseAVX > 0);
 3532   match(Set dst (DivD src1 (LoadD src2)));
 3533 
 3534   format %{ "vdivsd  $dst, $src1, $src2" %}
 3535   ins_cost(150);
 3536   ins_encode %{
 3537     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3538   %}
 3539   ins_pipe(pipe_slow);
 3540 %}
 3541 
 3542 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3543   predicate(UseAVX > 0);
 3544   match(Set dst (DivD src con));
 3545 
 3546   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3547   ins_cost(150);
 3548   ins_encode %{
 3549     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3550   %}
 3551   ins_pipe(pipe_slow);
 3552 %}
 3553 
 3554 instruct absF_reg(regF dst) %{
 3555   predicate((UseSSE>=1) && (UseAVX == 0));
 3556   match(Set dst (AbsF dst));
 3557   ins_cost(150);
 3558   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3559   ins_encode %{
 3560     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3561   %}
 3562   ins_pipe(pipe_slow);
 3563 %}
 3564 
 3565 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3566   predicate(UseAVX > 0);
 3567   match(Set dst (AbsF src));
 3568   ins_cost(150);
 3569   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3570   ins_encode %{
 3571     int vlen_enc = Assembler::AVX_128bit;
 3572     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3573               ExternalAddress(float_signmask()), vlen_enc);
 3574   %}
 3575   ins_pipe(pipe_slow);
 3576 %}
 3577 
 3578 instruct absD_reg(regD dst) %{
 3579   predicate((UseSSE>=2) && (UseAVX == 0));
 3580   match(Set dst (AbsD dst));
 3581   ins_cost(150);
 3582   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3583             "# abs double by sign masking" %}
 3584   ins_encode %{
 3585     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3586   %}
 3587   ins_pipe(pipe_slow);
 3588 %}
 3589 
 3590 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3591   predicate(UseAVX > 0);
 3592   match(Set dst (AbsD src));
 3593   ins_cost(150);
 3594   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3595             "# abs double by sign masking" %}
 3596   ins_encode %{
 3597     int vlen_enc = Assembler::AVX_128bit;
 3598     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3599               ExternalAddress(double_signmask()), vlen_enc);
 3600   %}
 3601   ins_pipe(pipe_slow);
 3602 %}
 3603 
 3604 instruct negF_reg(regF dst) %{
 3605   predicate((UseSSE>=1) && (UseAVX == 0));
 3606   match(Set dst (NegF dst));
 3607   ins_cost(150);
 3608   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3609   ins_encode %{
 3610     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3611   %}
 3612   ins_pipe(pipe_slow);
 3613 %}
 3614 
 3615 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3616   predicate(UseAVX > 0);
 3617   match(Set dst (NegF src));
 3618   ins_cost(150);
 3619   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3620   ins_encode %{
 3621     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3622                  ExternalAddress(float_signflip()));
 3623   %}
 3624   ins_pipe(pipe_slow);
 3625 %}
 3626 
 3627 instruct negD_reg(regD dst) %{
 3628   predicate((UseSSE>=2) && (UseAVX == 0));
 3629   match(Set dst (NegD dst));
 3630   ins_cost(150);
 3631   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3632             "# neg double by sign flipping" %}
 3633   ins_encode %{
 3634     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3635   %}
 3636   ins_pipe(pipe_slow);
 3637 %}
 3638 
 3639 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3640   predicate(UseAVX > 0);
 3641   match(Set dst (NegD src));
 3642   ins_cost(150);
 3643   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3644             "# neg double by sign flipping" %}
 3645   ins_encode %{
 3646     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3647                  ExternalAddress(double_signflip()));
 3648   %}
 3649   ins_pipe(pipe_slow);
 3650 %}
 3651 
// The sqrtss instruction needs its destination register to be pre-initialized
// for best performance. Therefore, only the instruct rule where the input is
// pre-loaded into the dst register is defined below.
 3654 instruct sqrtF_reg(regF dst) %{
 3655   predicate(UseSSE>=1);
 3656   match(Set dst (SqrtF dst));
 3657   format %{ "sqrtss  $dst, $dst" %}
 3658   ins_encode %{
 3659     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3660   %}
 3661   ins_pipe(pipe_slow);
 3662 %}
 3663 
// The sqrtsd instruction needs its destination register to be pre-initialized
// for best performance. Therefore, only the instruct rule where the input is
// pre-loaded into the dst register is defined below.
 3666 instruct sqrtD_reg(regD dst) %{
 3667   predicate(UseSSE>=2);
 3668   match(Set dst (SqrtD dst));
 3669   format %{ "sqrtsd  $dst, $dst" %}
 3670   ins_encode %{
 3671     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3672   %}
 3673   ins_pipe(pipe_slow);
 3674 %}
 3675 
 3676 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3677   effect(TEMP tmp);
 3678   match(Set dst (ConvF2HF src));
 3679   ins_cost(125);
 3680   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3681   ins_encode %{
 3682     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3683   %}
 3684   ins_pipe( pipe_slow );
 3685 %}
 3686 
 3687 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3688   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3689   effect(TEMP ktmp, TEMP rtmp);
 3690   match(Set mem (StoreC mem (ConvF2HF src)));
 3691   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3692   ins_encode %{
 3693     __ movl($rtmp$$Register, 0x1);
 3694     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3695     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3696   %}
 3697   ins_pipe( pipe_slow );
 3698 %}
 3699 
 3700 instruct vconvF2HF(vec dst, vec src) %{
 3701   match(Set dst (VectorCastF2HF src));
 3702   format %{ "vector_conv_F2HF $dst $src" %}
 3703   ins_encode %{
 3704     int vlen_enc = vector_length_encoding(this, $src);
 3705     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3706   %}
 3707   ins_pipe( pipe_slow );
 3708 %}
 3709 
 3710 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3711   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3712   format %{ "vcvtps2ph $mem,$src" %}
 3713   ins_encode %{
 3714     int vlen_enc = vector_length_encoding(this, $src);
 3715     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3716   %}
 3717   ins_pipe( pipe_slow );
 3718 %}
 3719 
 3720 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3721   match(Set dst (ConvHF2F src));
 3722   format %{ "vcvtph2ps $dst,$src" %}
 3723   ins_encode %{
 3724     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3725   %}
 3726   ins_pipe( pipe_slow );
 3727 %}
 3728 
 3729 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3730   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3731   format %{ "vcvtph2ps $dst,$mem" %}
 3732   ins_encode %{
 3733     int vlen_enc = vector_length_encoding(this);
 3734     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3735   %}
 3736   ins_pipe( pipe_slow );
 3737 %}
 3738 
 3739 instruct vconvHF2F(vec dst, vec src) %{
 3740   match(Set dst (VectorCastHF2F src));
 3741   ins_cost(125);
 3742   format %{ "vector_conv_HF2F $dst,$src" %}
 3743   ins_encode %{
 3744     int vlen_enc = vector_length_encoding(this);
 3745     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3746   %}
 3747   ins_pipe( pipe_slow );
 3748 %}
 3749 
 3750 // ---------------------------------------- VectorReinterpret ------------------------------------
 3751 instruct reinterpret_mask(kReg dst) %{
 3752   predicate(n->bottom_type()->isa_vectmask() &&
 3753             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3754   match(Set dst (VectorReinterpret dst));
 3755   ins_cost(125);
 3756   format %{ "vector_reinterpret $dst\t!" %}
 3757   ins_encode %{
 3758     // empty
 3759   %}
 3760   ins_pipe( pipe_slow );
 3761 %}
 3762 
 3763 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3764   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3765             n->bottom_type()->isa_vectmask() &&
 3766             n->in(1)->bottom_type()->isa_vectmask() &&
 3767             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes match in bytes
 3769   match(Set dst (VectorReinterpret src));
 3770   effect(TEMP xtmp);
 3771   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3772   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_SHORT);
    int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3779   %}
 3780   ins_pipe( pipe_slow );
 3781 %}
 3782 
 3783 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3784   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3785             n->bottom_type()->isa_vectmask() &&
 3786             n->in(1)->bottom_type()->isa_vectmask() &&
 3787             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3788              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes match in bytes
 3790   match(Set dst (VectorReinterpret src));
 3791   effect(TEMP xtmp);
 3792   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3793   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_INT);
    int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3800   %}
 3801   ins_pipe( pipe_slow );
 3802 %}
 3803 
 3804 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3805   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3806             n->bottom_type()->isa_vectmask() &&
 3807             n->in(1)->bottom_type()->isa_vectmask() &&
 3808             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3809              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes match in bytes
 3811   match(Set dst (VectorReinterpret src));
 3812   effect(TEMP xtmp);
 3813   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3814   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_LONG);
    int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3821   %}
 3822   ins_pipe( pipe_slow );
 3823 %}
 3824 
 3825 instruct reinterpret(vec dst) %{
 3826   predicate(!n->bottom_type()->isa_vectmask() &&
 3827             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3828   match(Set dst (VectorReinterpret dst));
 3829   ins_cost(125);
 3830   format %{ "vector_reinterpret $dst\t!" %}
 3831   ins_encode %{
 3832     // empty
 3833   %}
 3834   ins_pipe( pipe_slow );
 3835 %}
 3836 
 3837 instruct reinterpret_expand(vec dst, vec src) %{
 3838   predicate(UseAVX == 0 &&
 3839             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3840   match(Set dst (VectorReinterpret src));
 3841   ins_cost(125);
 3842   effect(TEMP dst);
 3843   format %{ "vector_reinterpret_expand $dst,$src" %}
 3844   ins_encode %{
 3845     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3846     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3847 
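    // The widened value must read as the source zero-extended: AND the source
    // with a constant whose low 4 (resp. 8) bytes are all ones so any stale
    // bits above the original vector length are cleared.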
 3848     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3849     if (src_vlen_in_bytes == 4) {
 3850       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3851     } else {
 3852       assert(src_vlen_in_bytes == 8, "");
 3853       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3854     }
 3855     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3856   %}
 3857   ins_pipe( pipe_slow );
 3858 %}
 3859 
 3860 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3861   predicate(UseAVX > 0 &&
 3862             !n->bottom_type()->isa_vectmask() &&
 3863             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3864             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3865   match(Set dst (VectorReinterpret src));
 3866   ins_cost(125);
 3867   format %{ "vector_reinterpret_expand $dst,$src" %}
 3868   ins_encode %{
 3869     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3870   %}
 3871   ins_pipe( pipe_slow );
 3872 %}
 3873 
 3875 instruct vreinterpret_expand(legVec dst, vec src) %{
 3876   predicate(UseAVX > 0 &&
 3877             !n->bottom_type()->isa_vectmask() &&
 3878             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3879             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3880   match(Set dst (VectorReinterpret src));
 3881   ins_cost(125);
 3882   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3883   ins_encode %{
 3884     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3885       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3886       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3887       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3888       default: ShouldNotReachHere();
 3889     }
 3890   %}
 3891   ins_pipe( pipe_slow );
 3892 %}
 3893 
 3894 instruct reinterpret_shrink(vec dst, legVec src) %{
 3895   predicate(!n->bottom_type()->isa_vectmask() &&
 3896             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3897   match(Set dst (VectorReinterpret src));
 3898   ins_cost(125);
 3899   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3900   ins_encode %{
 3901     switch (Matcher::vector_length_in_bytes(this)) {
 3902       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3903       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3904       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3905       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3906       default: ShouldNotReachHere();
 3907     }
 3908   %}
 3909   ins_pipe( pipe_slow );
 3910 %}
 3911 
 3912 // ----------------------------------------------------------------------------------------------------
 3913 
 3914 #ifdef _LP64
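// The RoundDoubleMode immediate is passed straight through as the
// roundsd/vroundpd rounding-control: 0 = round to nearest even (rint),
// 1 = round down (floor), 2 = round up (ceil).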
 3915 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3916   match(Set dst (RoundDoubleMode src rmode));
 3917   format %{ "roundsd $dst,$src" %}
 3918   ins_cost(150);
 3919   ins_encode %{
 3920     assert(UseSSE >= 4, "required");
 3921     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3922   %}
 3923   ins_pipe(pipe_slow);
 3924 %}
 3925 
 3926 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3927   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3928   format %{ "roundsd $dst,$src" %}
 3929   ins_cost(150);
 3930   ins_encode %{
 3931     assert(UseSSE >= 4, "required");
 3932     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3933   %}
 3934   ins_pipe(pipe_slow);
 3935 %}
 3936 
 3937 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3938   match(Set dst (RoundDoubleMode con rmode));
 3939   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3940   ins_cost(150);
 3941   ins_encode %{
 3942     assert(UseSSE >= 4, "required");
 3943     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3944   %}
 3945   ins_pipe(pipe_slow);
 3946 %}
 3947 
 3948 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3949   predicate(Matcher::vector_length(n) < 8);
 3950   match(Set dst (RoundDoubleModeV src rmode));
 3951   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3952   ins_encode %{
 3953     assert(UseAVX > 0, "required");
 3954     int vlen_enc = vector_length_encoding(this);
 3955     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3956   %}
 3957   ins_pipe( pipe_slow );
 3958 %}
 3959 
 3960 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3961   predicate(Matcher::vector_length(n) == 8);
 3962   match(Set dst (RoundDoubleModeV src rmode));
 3963   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3964   ins_encode %{
 3965     assert(UseAVX > 2, "required");
 3966     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3967   %}
 3968   ins_pipe( pipe_slow );
 3969 %}
 3970 
 3971 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3972   predicate(Matcher::vector_length(n) < 8);
 3973   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3974   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3975   ins_encode %{
 3976     assert(UseAVX > 0, "required");
 3977     int vlen_enc = vector_length_encoding(this);
 3978     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3979   %}
 3980   ins_pipe( pipe_slow );
 3981 %}
 3982 
 3983 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3984   predicate(Matcher::vector_length(n) == 8);
 3985   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3986   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3987   ins_encode %{
 3988     assert(UseAVX > 2, "required");
 3989     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3990   %}
 3991   ins_pipe( pipe_slow );
 3992 %}
 3993 #endif // _LP64
 3994 
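// OnSpinWait lowers to PAUSE, the spin-loop hint: it de-pipelines the loop,
// reducing power and avoiding the memory-order mis-speculation flush that
// otherwise occurs when the spun-on location is finally written.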
 3995 instruct onspinwait() %{
 3996   match(OnSpinWait);
 3997   ins_cost(200);
 3998 
 3999   format %{
 4000     $$template
 4001     $$emit$$"pause\t! membar_onspinwait"
 4002   %}
 4003   ins_encode %{
 4004     __ pause();
 4005   %}
 4006   ins_pipe(pipe_slow);
 4007 %}
 4008 
 4009 // a * b + c
 4010 instruct fmaD_reg(regD a, regD b, regD c) %{
 4011   match(Set c (FmaD  c (Binary a b)));
 4012   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4013   ins_cost(150);
 4014   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
 4016     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4017   %}
 4018   ins_pipe( pipe_slow );
 4019 %}
 4020 
 4021 // a * b + c
 4022 instruct fmaF_reg(regF a, regF b, regF c) %{
 4023   match(Set c (FmaF  c (Binary a b)));
 4024   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4025   ins_cost(150);
 4026   ins_encode %{
    assert(UseFMA, "Needs FMA instruction support.");
 4028     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4029   %}
 4030   ins_pipe( pipe_slow );
 4031 %}
 4032 
 4033 // ====================VECTOR INSTRUCTIONS=====================================
 4034 
 4035 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4036 instruct MoveVec2Leg(legVec dst, vec src) %{
 4037   match(Set dst src);
 4038   format %{ "" %}
 4039   ins_encode %{
 4040     ShouldNotReachHere();
 4041   %}
 4042   ins_pipe( fpu_reg_reg );
 4043 %}
 4044 
 4045 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4046   match(Set dst src);
 4047   format %{ "" %}
 4048   ins_encode %{
 4049     ShouldNotReachHere();
 4050   %}
 4051   ins_pipe( fpu_reg_reg );
 4052 %}
 4053 
 4054 // ============================================================================
 4055 
// Load vector (generic operand pattern)
 4057 instruct loadV(vec dst, memory mem) %{
 4058   match(Set dst (LoadVector mem));
 4059   ins_cost(125);
 4060   format %{ "load_vector $dst,$mem" %}
 4061   ins_encode %{
 4062     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4063   %}
 4064   ins_pipe( pipe_slow );
 4065 %}
 4066 
// Store vector (generic operand pattern).
 4068 instruct storeV(memory mem, vec src) %{
 4069   match(Set mem (StoreVector mem src));
 4070   ins_cost(145);
 4071   format %{ "store_vector $mem,$src\n\t" %}
 4072   ins_encode %{
 4073     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4074       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4075       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4076       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4077       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4078       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4079       default: ShouldNotReachHere();
 4080     }
 4081   %}
 4082   ins_pipe( pipe_slow );
 4083 %}
 4084 
 4085 // ---------------------------------------- Gather ------------------------------------
 4086 
 4087 // Gather INT, LONG, FLOAT, DOUBLE
 4088 
 4089 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4090   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4091   match(Set dst (LoadVectorGather mem idx));
 4092   effect(TEMP dst, TEMP tmp, TEMP mask);
 4093   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4094   ins_encode %{
 4095     assert(UseAVX >= 2, "sanity");
 4096 
 4097     int vlen_enc = vector_length_encoding(this);
 4098     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4099 
 4100     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4101     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4102 
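    // AVX2 vpgather* selects lanes through the sign bit of each mask element
    // and clears the mask as elements complete, so an all-ones mask is
    // materialized in a TEMP vector for this unpredicated gather.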
 4103     if (vlen_enc == Assembler::AVX_128bit) {
 4104       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4105     } else {
 4106       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4107     }
 4108     __ lea($tmp$$Register, $mem$$Address);
 4109     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4110   %}
 4111   ins_pipe( pipe_slow );
 4112 %}
 4113 
 4114 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4115   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4116   match(Set dst (LoadVectorGather mem idx));
 4117   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4118   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 4119   ins_encode %{
 4120     assert(UseAVX > 2, "sanity");
 4121 
 4122     int vlen_enc = vector_length_encoding(this);
 4123     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4124 
 4125     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4126 
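    // The AVX-512 gather is predicated by an opmask that the instruction
    // progressively clears, so a freshly loaded all-ones mask is used here.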
 4127     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4128     __ lea($tmp$$Register, $mem$$Address);
 4129     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4130   %}
 4131   ins_pipe( pipe_slow );
 4132 %}
 4133 
 4134 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4135   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4136   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4137   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4138   ins_encode %{
 4139     assert(UseAVX > 2, "sanity");
 4140     int vlen_enc = vector_length_encoding(this);
 4141     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4142     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: the gather instruction partially updates the opmask register used
    // for predication, so the mask operand is first copied to a temporary.
 4145     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
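    // Zero dst up front so lanes left unset by the mask read as zero rather
    // than stale register contents.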
 4146     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4147     __ lea($tmp$$Register, $mem$$Address);
 4148     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4149   %}
 4150   ins_pipe( pipe_slow );
 4151 %}
 4152 // ====================Scatter=======================================
 4153 
 4154 // Scatter INT, LONG, FLOAT, DOUBLE
 4155 
 4156 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4157   predicate(UseAVX > 2);
 4158   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4159   effect(TEMP tmp, TEMP ktmp);
 4160   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
 4161   ins_encode %{
 4162     int vlen_enc = vector_length_encoding(this, $src);
 4163     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4164 
 4165     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4166     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4167 
 4168     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4169     __ lea($tmp$$Register, $mem$$Address);
 4170     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4171   %}
 4172   ins_pipe( pipe_slow );
 4173 %}
 4174 
 4175 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4176   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4177   effect(TEMP tmp, TEMP ktmp);
 4178   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4179   ins_encode %{
 4180     int vlen_enc = vector_length_encoding(this, $src);
 4181     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4182     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4183     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: the scatter instruction partially updates the opmask register used
    // for predication, so the mask operand is first copied to a temporary.
 4186     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4187     __ lea($tmp$$Register, $mem$$Address);
 4188     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4189   %}
 4190   ins_pipe( pipe_slow );
 4191 %}
 4192 
 4193 // ====================REPLICATE=======================================
 4194 
 4195 // Replicate byte scalar to be vector
 4196 instruct vReplB_reg(vec dst, rRegI src) %{
 4197   predicate(UseAVX >= 2);
 4198   match(Set dst (ReplicateB src));
 4199   format %{ "replicateB $dst,$src" %}
 4200   ins_encode %{
 4201     uint vlen = Matcher::vector_length(this);
 4202     int vlen_enc = vector_length_encoding(this);
 4203     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4204       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4205       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4206     } else {
 4207       __ movdl($dst$$XMMRegister, $src$$Register);
 4208       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4209     }
 4210   %}
 4211   ins_pipe( pipe_slow );
 4212 %}
 4213 
 4214 instruct ReplB_reg(vec dst, rRegI src) %{
 4215   predicate(UseAVX < 2);
 4216   match(Set dst (ReplicateB src));
 4217   format %{ "replicateB $dst,$src" %}
 4218   ins_encode %{
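    // SSE2 broadcast: duplicate the byte into a word (punpcklbw with self),
    // splat that word across the low four words (pshuflw 0x00), then copy the
    // low qword into the high qword (punpcklqdq) for 16-byte vectors.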
 4219     uint vlen = Matcher::vector_length(this);
 4220     __ movdl($dst$$XMMRegister, $src$$Register);
 4221     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4222     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4223     if (vlen >= 16) {
 4224       assert(vlen == 16, "");
 4225       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4226     }
 4227   %}
 4228   ins_pipe( pipe_slow );
 4229 %}
 4230 
 4231 instruct ReplB_mem(vec dst, memory mem) %{
 4232   predicate(UseAVX >= 2);
 4233   match(Set dst (ReplicateB (LoadB mem)));
 4234   format %{ "replicateB $dst,$mem" %}
 4235   ins_encode %{
 4236     int vlen_enc = vector_length_encoding(this);
 4237     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4238   %}
 4239   ins_pipe( pipe_slow );
 4240 %}
 4241 
 4242 // ====================ReplicateS=======================================
 4243 
 4244 instruct vReplS_reg(vec dst, rRegI src) %{
 4245   predicate(UseAVX >= 2);
 4246   match(Set dst (ReplicateS src));
 4247   format %{ "replicateS $dst,$src" %}
 4248   ins_encode %{
 4249     uint vlen = Matcher::vector_length(this);
 4250     int vlen_enc = vector_length_encoding(this);
 4251     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4252       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4253       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4254     } else {
 4255       __ movdl($dst$$XMMRegister, $src$$Register);
 4256       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4257     }
 4258   %}
 4259   ins_pipe( pipe_slow );
 4260 %}
 4261 
 4262 instruct ReplS_reg(vec dst, rRegI src) %{
 4263   predicate(UseAVX < 2);
 4264   match(Set dst (ReplicateS src));
 4265   format %{ "replicateS $dst,$src" %}
 4266   ins_encode %{
 4267     uint vlen = Matcher::vector_length(this);
 4269     __ movdl($dst$$XMMRegister, $src$$Register);
 4270     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4271     if (vlen >= 8) {
 4272       assert(vlen == 8, "");
 4273       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4274     }
 4275   %}
 4276   ins_pipe( pipe_slow );
 4277 %}
 4278 
 4279 instruct ReplS_mem(vec dst, memory mem) %{
 4280   predicate(UseAVX >= 2);
 4281   match(Set dst (ReplicateS (LoadS mem)));
 4282   format %{ "replicateS $dst,$mem" %}
 4283   ins_encode %{
 4284     int vlen_enc = vector_length_encoding(this);
 4285     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4286   %}
 4287   ins_pipe( pipe_slow );
 4288 %}
 4289 
 4290 // ====================ReplicateI=======================================
 4291 
 4292 instruct ReplI_reg(vec dst, rRegI src) %{
 4293   match(Set dst (ReplicateI src));
 4294   format %{ "replicateI $dst,$src" %}
 4295   ins_encode %{
 4296     uint vlen = Matcher::vector_length(this);
 4297     int vlen_enc = vector_length_encoding(this);
 4298     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4299       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4300     } else if (VM_Version::supports_avx2()) {
 4301       __ movdl($dst$$XMMRegister, $src$$Register);
 4302       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4303     } else {
 4304       __ movdl($dst$$XMMRegister, $src$$Register);
 4305       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4306     }
 4307   %}
 4308   ins_pipe( pipe_slow );
 4309 %}
 4310 
 4311 instruct ReplI_mem(vec dst, memory mem) %{
 4312   match(Set dst (ReplicateI (LoadI mem)));
 4313   format %{ "replicateI $dst,$mem" %}
 4314   ins_encode %{
 4315     int vlen_enc = vector_length_encoding(this);
 4316     if (VM_Version::supports_avx2()) {
 4317       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4318     } else if (VM_Version::supports_avx()) {
 4319       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4320     } else {
 4321       __ movdl($dst$$XMMRegister, $mem$$Address);
 4322       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4323     }
 4324   %}
 4325   ins_pipe( pipe_slow );
 4326 %}
 4327 
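// Replicate scalar immediate to be vector by loading from const table.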
 4328 instruct ReplI_imm(vec dst, immI con) %{
 4329   match(Set dst (ReplicateB con));
 4330   match(Set dst (ReplicateS con));
 4331   match(Set dst (ReplicateI con));
 4332   format %{ "replicateI $dst,$con" %}
 4333   ins_encode %{
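    // Emit just enough copies of the immediate into the constant table to fill
    // a 4-byte chunk when AVX can broadcast it (8 bytes otherwise);
    // load_constant_vector then expands that chunk to the full vector length.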
 4334     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4335         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4336             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4337                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4338     BasicType bt = Matcher::vector_element_basic_type(this);
 4339     int vlen = Matcher::vector_length_in_bytes(this);
 4340     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4341   %}
 4342   ins_pipe( pipe_slow );
 4343 %}
 4344 
 4345 // Replicate scalar zero to be vector
 4346 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4347   match(Set dst (ReplicateB zero));
 4348   match(Set dst (ReplicateS zero));
 4349   match(Set dst (ReplicateI zero));
 4350   format %{ "replicateI $dst,$zero" %}
 4351   ins_encode %{
 4352     int vlen_enc = vector_length_encoding(this);
 4353     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4354       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4355     } else {
 4356       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4357     }
 4358   %}
 4359   ins_pipe( fpu_reg_reg );
 4360 %}
 4361 
 4362 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4363   predicate(UseSSE >= 2);
 4364   match(Set dst (ReplicateB con));
 4365   match(Set dst (ReplicateS con));
 4366   match(Set dst (ReplicateI con));
 4367   format %{ "vallones $dst" %}
 4368   ins_encode %{
 4369     int vector_len = vector_length_encoding(this);
 4370     __ vallones($dst$$XMMRegister, vector_len);
 4371   %}
 4372   ins_pipe( pipe_slow );
 4373 %}
 4374 
 4375 // ====================ReplicateL=======================================
 4376 
 4377 #ifdef _LP64
 4378 // Replicate long (8 byte) scalar to be vector
 4379 instruct ReplL_reg(vec dst, rRegL src) %{
 4380   match(Set dst (ReplicateL src));
 4381   format %{ "replicateL $dst,$src" %}
 4382   ins_encode %{
 4383     int vlen = Matcher::vector_length(this);
 4384     int vlen_enc = vector_length_encoding(this);
 4385     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4386       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4387     } else if (VM_Version::supports_avx2()) {
 4388       __ movdq($dst$$XMMRegister, $src$$Register);
 4389       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4390     } else {
 4391       __ movdq($dst$$XMMRegister, $src$$Register);
 4392       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4393     }
 4394   %}
 4395   ins_pipe( pipe_slow );
 4396 %}
 4397 #else // _LP64
 4398 // Replicate long (8 byte) scalar to be vector
 4399 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4400   predicate(Matcher::vector_length(n) <= 4);
 4401   match(Set dst (ReplicateL src));
 4402   effect(TEMP dst, USE src, TEMP tmp);
 4403   format %{ "replicateL $dst,$src" %}
 4404   ins_encode %{
 4405     uint vlen = Matcher::vector_length(this);
 4406     if (vlen == 2) {
 4407       __ movdl($dst$$XMMRegister, $src$$Register);
 4408       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4409       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4410       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4411     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4412       int vlen_enc = Assembler::AVX_256bit;
 4413       __ movdl($dst$$XMMRegister, $src$$Register);
 4414       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4415       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4416       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4417     } else {
 4418       __ movdl($dst$$XMMRegister, $src$$Register);
 4419       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4420       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4421       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4422       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4423     }
 4424   %}
 4425   ins_pipe( pipe_slow );
 4426 %}
 4427 
 4428 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4429   predicate(Matcher::vector_length(n) == 8);
 4430   match(Set dst (ReplicateL src));
 4431   effect(TEMP dst, USE src, TEMP tmp);
 4432   format %{ "replicateL $dst,$src" %}
 4433   ins_encode %{
 4434     if (VM_Version::supports_avx512vl()) {
 4435       __ movdl($dst$$XMMRegister, $src$$Register);
 4436       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4437       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4438       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4439       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4440       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4441     } else {
 4442       int vlen_enc = Assembler::AVX_512bit;
 4443       __ movdl($dst$$XMMRegister, $src$$Register);
 4444       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4445       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4446       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4447     }
 4448   %}
 4449   ins_pipe( pipe_slow );
 4450 %}
 4451 #endif // _LP64
 4452 
 4453 instruct ReplL_mem(vec dst, memory mem) %{
 4454   match(Set dst (ReplicateL (LoadL mem)));
 4455   format %{ "replicateL $dst,$mem" %}
 4456   ins_encode %{
 4457     int vlen_enc = vector_length_encoding(this);
 4458     if (VM_Version::supports_avx2()) {
 4459       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4460     } else if (VM_Version::supports_sse3()) {
 4461       __ movddup($dst$$XMMRegister, $mem$$Address);
 4462     } else {
 4463       __ movq($dst$$XMMRegister, $mem$$Address);
 4464       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4465     }
 4466   %}
 4467   ins_pipe( pipe_slow );
 4468 %}
 4469 
 4470 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4471 instruct ReplL_imm(vec dst, immL con) %{
 4472   match(Set dst (ReplicateL con));
 4473   format %{ "replicateL $dst,$con" %}
 4474   ins_encode %{
 4475     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4476     int vlen = Matcher::vector_length_in_bytes(this);
 4477     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4478   %}
 4479   ins_pipe( pipe_slow );
 4480 %}
 4481 
 4482 instruct ReplL_zero(vec dst, immL0 zero) %{
 4483   match(Set dst (ReplicateL zero));
 4484   format %{ "replicateL $dst,$zero" %}
 4485   ins_encode %{
 4486     int vlen_enc = vector_length_encoding(this);
 4487     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4488       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4489     } else {
 4490       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4491     }
 4492   %}
 4493   ins_pipe( fpu_reg_reg );
 4494 %}
 4495 
 4496 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4497   predicate(UseSSE >= 2);
 4498   match(Set dst (ReplicateL con));
 4499   format %{ "vallones $dst" %}
 4500   ins_encode %{
 4501     int vector_len = vector_length_encoding(this);
 4502     __ vallones($dst$$XMMRegister, vector_len);
 4503   %}
 4504   ins_pipe( pipe_slow );
 4505 %}
 4506 
 4507 // ====================ReplicateF=======================================
 4508 
 4509 instruct vReplF_reg(vec dst, vlRegF src) %{
 4510   predicate(UseAVX > 0);
 4511   match(Set dst (ReplicateF src));
 4512   format %{ "replicateF $dst,$src" %}
 4513   ins_encode %{
 4514     uint vlen = Matcher::vector_length(this);
 4515     int vlen_enc = vector_length_encoding(this);
 4516     if (vlen <= 4) {
 4517       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4518     } else if (VM_Version::supports_avx2()) {
 4519       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4520     } else {
 4521       assert(vlen == 8, "sanity");
 4522       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4523       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4524     }
 4525   %}
 4526   ins_pipe( pipe_slow );
 4527 %}
 4528 
 4529 instruct ReplF_reg(vec dst, vlRegF src) %{
 4530   predicate(UseAVX == 0);
 4531   match(Set dst (ReplicateF src));
 4532   format %{ "replicateF $dst,$src" %}
 4533   ins_encode %{
 4534     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4535   %}
 4536   ins_pipe( pipe_slow );
 4537 %}
 4538 
 4539 instruct ReplF_mem(vec dst, memory mem) %{
 4540   predicate(UseAVX > 0);
 4541   match(Set dst (ReplicateF (LoadF mem)));
 4542   format %{ "replicateF $dst,$mem" %}
 4543   ins_encode %{
 4544     int vlen_enc = vector_length_encoding(this);
 4545     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4546   %}
 4547   ins_pipe( pipe_slow );
 4548 %}
 4549 
 4550 // Replicate float scalar immediate to be vector by loading from const table.
 4551 instruct ReplF_imm(vec dst, immF con) %{
 4552   match(Set dst (ReplicateF con));
 4553   format %{ "replicateF $dst,$con" %}
 4554   ins_encode %{
 4555     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4556         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4557     int vlen = Matcher::vector_length_in_bytes(this);
 4558     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4559   %}
 4560   ins_pipe( pipe_slow );
 4561 %}
 4562 
 4563 instruct ReplF_zero(vec dst, immF0 zero) %{
 4564   match(Set dst (ReplicateF zero));
 4565   format %{ "replicateF $dst,$zero" %}
 4566   ins_encode %{
 4567     int vlen_enc = vector_length_encoding(this);
 4568     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4569       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4570     } else {
 4571       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4572     }
 4573   %}
 4574   ins_pipe( fpu_reg_reg );
 4575 %}
 4576 
 4577 // ====================ReplicateD=======================================
 4578 
 4579 // Replicate double (8 bytes) scalar to be vector
 4580 instruct vReplD_reg(vec dst, vlRegD src) %{
 4581   predicate(UseSSE >= 3);
 4582   match(Set dst (ReplicateD src));
 4583   format %{ "replicateD $dst,$src" %}
 4584   ins_encode %{
 4585     uint vlen = Matcher::vector_length(this);
 4586     int vlen_enc = vector_length_encoding(this);
 4587     if (vlen <= 2) {
 4588       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4589     } else if (VM_Version::supports_avx2()) {
 4590       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4591     } else {
 4592       assert(vlen == 4, "sanity");
 4593       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4594       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4595     }
 4596   %}
 4597   ins_pipe( pipe_slow );
 4598 %}
 4599 
 4600 instruct ReplD_reg(vec dst, vlRegD src) %{
 4601   predicate(UseSSE < 3);
 4602   match(Set dst (ReplicateD src));
 4603   format %{ "replicateD $dst,$src" %}
 4604   ins_encode %{
 4605     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4606   %}
 4607   ins_pipe( pipe_slow );
 4608 %}
 4609 
 4610 instruct ReplD_mem(vec dst, memory mem) %{
 4611   predicate(UseSSE >= 3);
 4612   match(Set dst (ReplicateD (LoadD mem)));
 4613   format %{ "replicateD $dst,$mem" %}
 4614   ins_encode %{
 4615     if (Matcher::vector_length(this) >= 4) {
 4616       int vlen_enc = vector_length_encoding(this);
 4617       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4618     } else {
 4619       __ movddup($dst$$XMMRegister, $mem$$Address);
 4620     }
 4621   %}
 4622   ins_pipe( pipe_slow );
 4623 %}
 4624 
 4625 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4626 instruct ReplD_imm(vec dst, immD con) %{
 4627   match(Set dst (ReplicateD con));
 4628   format %{ "replicateD $dst,$con" %}
 4629   ins_encode %{
 4630     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4631     int vlen = Matcher::vector_length_in_bytes(this);
 4632     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4633   %}
 4634   ins_pipe( pipe_slow );
 4635 %}
 4636 
 4637 instruct ReplD_zero(vec dst, immD0 zero) %{
 4638   match(Set dst (ReplicateD zero));
 4639   format %{ "replicateD $dst,$zero" %}
 4640   ins_encode %{
 4641     int vlen_enc = vector_length_encoding(this);
 4642     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4643       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4644     } else {
 4645       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4646     }
 4647   %}
 4648   ins_pipe( fpu_reg_reg );
 4649 %}
 4650 
 4651 // ====================VECTOR INSERT=======================================
 4652 
 4653 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4654   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4655   match(Set dst (VectorInsert (Binary dst val) idx));
 4656   format %{ "vector_insert $dst,$val,$idx" %}
 4657   ins_encode %{
 4658     assert(UseSSE >= 4, "required");
 4659     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4660 
 4661     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4662 
 4663     assert(is_integral_type(elem_bt), "");
 4664     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4665 
 4666     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4667   %}
 4668   ins_pipe( pipe_slow );
 4669 %}
 4670 
 4671 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4672   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4673   match(Set dst (VectorInsert (Binary src val) idx));
 4674   effect(TEMP vtmp);
 4675   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4676   ins_encode %{
 4678     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4679     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4680     int log2epr = log2(elem_per_lane);
 4681 
 4682     assert(is_integral_type(elem_bt), "sanity");
 4683     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4684 
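    // Split the flat index into a 128-bit lane number (y_idx) and a position
    // within that lane (x_idx): there is no cross-lane scalar insert, so the
    // affected lane is extracted, updated, and re-inserted.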
 4685     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4686     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4687     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4688     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4689     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4690   %}
 4691   ins_pipe( pipe_slow );
 4692 %}
 4693 
 4694 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4695   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4696   match(Set dst (VectorInsert (Binary src val) idx));
 4697   effect(TEMP vtmp);
 4698   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4699   ins_encode %{
 4700     assert(UseAVX > 2, "sanity");
 4701 
 4702     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4703     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4704     int log2epr = log2(elem_per_lane);
 4705 
 4706     assert(is_integral_type(elem_bt), "");
 4707     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4708 
 4709     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4710     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4711     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4712     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4713     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4714   %}
 4715   ins_pipe( pipe_slow );
 4716 %}
 4717 
 4718 #ifdef _LP64
 4719 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4720   predicate(Matcher::vector_length(n) == 2);
 4721   match(Set dst (VectorInsert (Binary dst val) idx));
 4722   format %{ "vector_insert $dst,$val,$idx" %}
 4723   ins_encode %{
 4724     assert(UseSSE >= 4, "required");
 4725     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4726     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4727 
 4728     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4729   %}
 4730   ins_pipe( pipe_slow );
 4731 %}
 4732 
 4733 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4734   predicate(Matcher::vector_length(n) == 4);
 4735   match(Set dst (VectorInsert (Binary src val) idx));
 4736   effect(TEMP vtmp);
 4737   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4738   ins_encode %{
 4739     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4740     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4741 
 4742     uint x_idx = $idx$$constant & right_n_bits(1);
 4743     uint y_idx = ($idx$$constant >> 1) & 1;
 4745     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4746     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4747     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4748   %}
 4749   ins_pipe( pipe_slow );
 4750 %}
 4751 
 4752 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4753   predicate(Matcher::vector_length(n) == 8);
 4754   match(Set dst (VectorInsert (Binary src val) idx));
 4755   effect(TEMP vtmp);
 4756   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4757   ins_encode %{
 4758     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4759     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4760 
 4761     uint x_idx = $idx$$constant & right_n_bits(1);
 4762     uint y_idx = ($idx$$constant >> 1) & 3;
 4763     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4764     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4765     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4766   %}
 4767   ins_pipe( pipe_slow );
 4768 %}
#endif // _LP64
 4770 
 4771 instruct insertF(vec dst, regF val, immU8 idx) %{
 4772   predicate(Matcher::vector_length(n) < 8);
 4773   match(Set dst (VectorInsert (Binary dst val) idx));
 4774   format %{ "vector_insert $dst,$val,$idx" %}
 4775   ins_encode %{
 4776     assert(UseSSE >= 4, "sanity");
 4777 
 4778     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4779     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4780 
 4781     uint x_idx = $idx$$constant & right_n_bits(2);
 4782     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4783   %}
 4784   ins_pipe( pipe_slow );
 4785 %}
 4786 
 4787 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4788   predicate(Matcher::vector_length(n) >= 8);
 4789   match(Set dst (VectorInsert (Binary src val) idx));
 4790   effect(TEMP vtmp);
 4791   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4792   ins_encode %{
 4793     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4794     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4795 
 4796     int vlen = Matcher::vector_length(this);
 4797     uint x_idx = $idx$$constant & right_n_bits(2);
 4798     if (vlen == 8) {
 4799       uint y_idx = ($idx$$constant >> 2) & 1;
 4801       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4802       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4803       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4804     } else {
 4805       assert(vlen == 16, "sanity");
 4806       uint y_idx = ($idx$$constant >> 2) & 3;
 4807       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4808       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4809       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4810     }
 4811   %}
 4812   ins_pipe( pipe_slow );
 4813 %}
 4814 
 4815 #ifdef _LP64
 4816 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4817   predicate(Matcher::vector_length(n) == 2);
 4818   match(Set dst (VectorInsert (Binary dst val) idx));
 4819   effect(TEMP tmp);
 4820   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4821   ins_encode %{
 4822     assert(UseSSE >= 4, "sanity");
 4823     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4824     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4825 
 4826     __ movq($tmp$$Register, $val$$XMMRegister);
 4827     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4828   %}
 4829   ins_pipe( pipe_slow );
 4830 %}
 4831 
 4832 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4833   predicate(Matcher::vector_length(n) == 4);
 4834   match(Set dst (VectorInsert (Binary src val) idx));
 4835   effect(TEMP vtmp, TEMP tmp);
 4836   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4837   ins_encode %{
 4838     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4839     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4840 
 4841     uint x_idx = $idx$$constant & right_n_bits(1);
 4842     uint y_idx = ($idx$$constant >> 1) & 1;
 4844     __ movq($tmp$$Register, $val$$XMMRegister);
 4845     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4846     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4847     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4848   %}
 4849   ins_pipe( pipe_slow );
 4850 %}
 4851 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 4853   predicate(Matcher::vector_length(n) == 8);
 4854   match(Set dst (VectorInsert (Binary src val) idx));
 4855   effect(TEMP tmp, TEMP vtmp);
 4856   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4857   ins_encode %{
 4858     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4859     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4860 
 4861     uint x_idx = $idx$$constant & right_n_bits(1);
 4862     uint y_idx = ($idx$$constant >> 1) & 3;
 4863     __ movq($tmp$$Register, $val$$XMMRegister);
 4864     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4865     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4866     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4867   %}
 4868   ins_pipe( pipe_slow );
 4869 %}
#endif // _LP64
 4871 
 4872 // ====================REDUCTION ARITHMETIC=======================================
 4873 
 4874 // =======================Int Reduction==========================================
 4875 
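// The reduce* macro-assembler helpers fold src2 pairwise (upper half onto
// lower half) until a single lane remains, then combine that lane with the
// scalar input src1.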
 4876 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4877   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4878   match(Set dst (AddReductionVI src1 src2));
 4879   match(Set dst (MulReductionVI src1 src2));
 4880   match(Set dst (AndReductionV  src1 src2));
 4881   match(Set dst ( OrReductionV  src1 src2));
 4882   match(Set dst (XorReductionV  src1 src2));
 4883   match(Set dst (MinReductionV  src1 src2));
 4884   match(Set dst (MaxReductionV  src1 src2));
 4885   effect(TEMP vtmp1, TEMP vtmp2);
 4886   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4887   ins_encode %{
 4888     int opcode = this->ideal_Opcode();
 4889     int vlen = Matcher::vector_length(this, $src2);
 4890     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4891   %}
 4892   ins_pipe( pipe_slow );
 4893 %}
 4894 
 4895 // =======================Long Reduction==========================================
 4896 
 4897 #ifdef _LP64
 4898 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4899   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4900   match(Set dst (AddReductionVL src1 src2));
 4901   match(Set dst (MulReductionVL src1 src2));
 4902   match(Set dst (AndReductionV  src1 src2));
 4903   match(Set dst ( OrReductionV  src1 src2));
 4904   match(Set dst (XorReductionV  src1 src2));
 4905   match(Set dst (MinReductionV  src1 src2));
 4906   match(Set dst (MaxReductionV  src1 src2));
 4907   effect(TEMP vtmp1, TEMP vtmp2);
 4908   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4909   ins_encode %{
 4910     int opcode = this->ideal_Opcode();
 4911     int vlen = Matcher::vector_length(this, $src2);
 4912     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4913   %}
 4914   ins_pipe( pipe_slow );
 4915 %}
 4916 
 4917 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4918   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4919   match(Set dst (AddReductionVL src1 src2));
 4920   match(Set dst (MulReductionVL src1 src2));
 4921   match(Set dst (AndReductionV  src1 src2));
 4922   match(Set dst ( OrReductionV  src1 src2));
 4923   match(Set dst (XorReductionV  src1 src2));
 4924   match(Set dst (MinReductionV  src1 src2));
 4925   match(Set dst (MaxReductionV  src1 src2));
 4926   effect(TEMP vtmp1, TEMP vtmp2);
 4927   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4928   ins_encode %{
 4929     int opcode = this->ideal_Opcode();
 4930     int vlen = Matcher::vector_length(this, $src2);
 4931     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4932   %}
 4933   ins_pipe( pipe_slow );
 4934 %}
 4935 #endif // _LP64
 4936 
 4937 // =======================Float Reduction==========================================
 4938 
 4939 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4940   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4941   match(Set dst (AddReductionVF dst src));
 4942   match(Set dst (MulReductionVF dst src));
 4943   effect(TEMP dst, TEMP vtmp);
 4944   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4945   ins_encode %{
 4946     int opcode = this->ideal_Opcode();
 4947     int vlen = Matcher::vector_length(this, $src);
 4948     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4949   %}
 4950   ins_pipe( pipe_slow );
 4951 %}
 4952 
 4953 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4954   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4955   match(Set dst (AddReductionVF dst src));
 4956   match(Set dst (MulReductionVF dst src));
 4957   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4958   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4959   ins_encode %{
 4960     int opcode = this->ideal_Opcode();
 4961     int vlen = Matcher::vector_length(this, $src);
 4962     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4963   %}
 4964   ins_pipe( pipe_slow );
 4965 %}
 4966 
 4967 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4968   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4969   match(Set dst (AddReductionVF dst src));
 4970   match(Set dst (MulReductionVF dst src));
 4971   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4972   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4973   ins_encode %{
 4974     int opcode = this->ideal_Opcode();
 4975     int vlen = Matcher::vector_length(this, $src);
 4976     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4977   %}
 4978   ins_pipe( pipe_slow );
 4979 %}
 4980 
 4981 // =======================Double Reduction==========================================
 4982 
 4983 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4984   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4985   match(Set dst (AddReductionVD dst src));
 4986   match(Set dst (MulReductionVD dst src));
 4987   effect(TEMP dst, TEMP vtmp);
 4988   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4989   ins_encode %{
 4990     int opcode = this->ideal_Opcode();
 4991     int vlen = Matcher::vector_length(this, $src);
 4992     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4993 %}
 4994   ins_pipe( pipe_slow );
 4995 %}
 4996 
 4997 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4998   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4999   match(Set dst (AddReductionVD dst src));
 5000   match(Set dst (MulReductionVD dst src));
 5001   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5002   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5003   ins_encode %{
 5004     int opcode = this->ideal_Opcode();
 5005     int vlen = Matcher::vector_length(this, $src);
 5006     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5007   %}
 5008   ins_pipe( pipe_slow );
 5009 %}
 5010 
 5011 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5012   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 5013   match(Set dst (AddReductionVD dst src));
 5014   match(Set dst (MulReductionVD dst src));
 5015   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5016   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5017   ins_encode %{
 5018     int opcode = this->ideal_Opcode();
 5019     int vlen = Matcher::vector_length(this, $src);
 5020     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5021   %}
 5022   ins_pipe( pipe_slow );
 5023 %}
 5024 
 5025 // =======================Byte Reduction==========================================
 5026 
 5027 #ifdef _LP64
 5028 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5029   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5030   match(Set dst (AddReductionVI src1 src2));
 5031   match(Set dst (AndReductionV  src1 src2));
 5032   match(Set dst ( OrReductionV  src1 src2));
 5033   match(Set dst (XorReductionV  src1 src2));
 5034   match(Set dst (MinReductionV  src1 src2));
 5035   match(Set dst (MaxReductionV  src1 src2));
 5036   effect(TEMP vtmp1, TEMP vtmp2);
 5037   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5038   ins_encode %{
 5039     int opcode = this->ideal_Opcode();
 5040     int vlen = Matcher::vector_length(this, $src2);
 5041     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5042   %}
 5043   ins_pipe( pipe_slow );
 5044 %}
 5045 
 5046 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5047   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5048   match(Set dst (AddReductionVI src1 src2));
 5049   match(Set dst (AndReductionV  src1 src2));
 5050   match(Set dst ( OrReductionV  src1 src2));
 5051   match(Set dst (XorReductionV  src1 src2));
 5052   match(Set dst (MinReductionV  src1 src2));
 5053   match(Set dst (MaxReductionV  src1 src2));
 5054   effect(TEMP vtmp1, TEMP vtmp2);
 5055   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5056   ins_encode %{
 5057     int opcode = this->ideal_Opcode();
 5058     int vlen = Matcher::vector_length(this, $src2);
 5059     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5060   %}
 5061   ins_pipe( pipe_slow );
 5062 %}
#endif // _LP64
 5064 
 5065 // =======================Short Reduction==========================================
 5066 
 5067 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5068   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5069   match(Set dst (AddReductionVI src1 src2));
 5070   match(Set dst (MulReductionVI src1 src2));
 5071   match(Set dst (AndReductionV  src1 src2));
 5072   match(Set dst ( OrReductionV  src1 src2));
 5073   match(Set dst (XorReductionV  src1 src2));
 5074   match(Set dst (MinReductionV  src1 src2));
 5075   match(Set dst (MaxReductionV  src1 src2));
 5076   effect(TEMP vtmp1, TEMP vtmp2);
 5077   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5078   ins_encode %{
 5079     int opcode = this->ideal_Opcode();
 5080     int vlen = Matcher::vector_length(this, $src2);
 5081     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5082   %}
 5083   ins_pipe( pipe_slow );
 5084 %}
 5085 
 5086 // =======================Mul Reduction==========================================
 5087 
 5088 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5089   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5090             Matcher::vector_length(n->in(2)) <= 32); // src2
 5091   match(Set dst (MulReductionVI src1 src2));
 5092   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5093   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5094   ins_encode %{
 5095     int opcode = this->ideal_Opcode();
 5096     int vlen = Matcher::vector_length(this, $src2);
 5097     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5098   %}
 5099   ins_pipe( pipe_slow );
 5100 %}
 5101 
 5102 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5103   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5104             Matcher::vector_length(n->in(2)) == 64); // src2
 5105   match(Set dst (MulReductionVI src1 src2));
 5106   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5107   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5108   ins_encode %{
 5109     int opcode = this->ideal_Opcode();
 5110     int vlen = Matcher::vector_length(this, $src2);
 5111     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5112   %}
 5113   ins_pipe( pipe_slow );
 5114 %}
 5115 
 5116 //--------------------Min/Max Float Reduction --------------------
 5117 // Float Min/Max Reduction
 5118 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5119                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5120   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5121             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5122              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5123             Matcher::vector_length(n->in(2)) == 2);
 5124   match(Set dst (MinReductionV src1 src2));
 5125   match(Set dst (MaxReductionV src1 src2));
 5126   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5127   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5128   ins_encode %{
 5129     assert(UseAVX > 0, "sanity");
 5130 
 5131     int opcode = this->ideal_Opcode();
 5132     int vlen = Matcher::vector_length(this, $src2);
 5133     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5134                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5135   %}
 5136   ins_pipe( pipe_slow );
 5137 %}
 5138 
 5139 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5140                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5141   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5142             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5143              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5144             Matcher::vector_length(n->in(2)) >= 4);
 5145   match(Set dst (MinReductionV src1 src2));
 5146   match(Set dst (MaxReductionV src1 src2));
 5147   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5148   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5149   ins_encode %{
 5150     assert(UseAVX > 0, "sanity");
 5151 
 5152     int opcode = this->ideal_Opcode();
 5153     int vlen = Matcher::vector_length(this, $src2);
 5154     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5155                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5156   %}
 5157   ins_pipe( pipe_slow );
 5158 %}
 5159 
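      // The _av rules below match the accumulator form of the reduction,
      // where the running value in $dst is itself an input (i.e. the scalar
      // start value is not known to be the +Inf/-Inf identity handled above).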
 5160 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5161                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5162   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5163             Matcher::vector_length(n->in(2)) == 2);
 5164   match(Set dst (MinReductionV dst src));
 5165   match(Set dst (MaxReductionV dst src));
 5166   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5167   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5168   ins_encode %{
 5169     assert(UseAVX > 0, "sanity");
 5170 
 5171     int opcode = this->ideal_Opcode();
 5172     int vlen = Matcher::vector_length(this, $src);
 5173     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5174                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5175   %}
 5176   ins_pipe( pipe_slow );
 5177 %}
 5178 
 5179 
 5180 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5181                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5182   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5183             Matcher::vector_length(n->in(2)) >= 4);
 5184   match(Set dst (MinReductionV dst src));
 5185   match(Set dst (MaxReductionV dst src));
 5186   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5187   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5188   ins_encode %{
 5189     assert(UseAVX > 0, "sanity");
 5190 
 5191     int opcode = this->ideal_Opcode();
 5192     int vlen = Matcher::vector_length(this, $src);
 5193     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5194                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5195   %}
 5196   ins_pipe( pipe_slow );
 5197 %}
 5198 
 5199 
 5200 //--------------------Min/Max Double Reduction --------------------
 5201 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5202                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5203                             rFlagsReg cr) %{
 5204   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5205             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5206              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5207             Matcher::vector_length(n->in(2)) == 2);
 5208   match(Set dst (MinReductionV src1 src2));
 5209   match(Set dst (MaxReductionV src1 src2));
 5210   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5211   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5212   ins_encode %{
 5213     assert(UseAVX > 0, "sanity");
 5214 
 5215     int opcode = this->ideal_Opcode();
 5216     int vlen = Matcher::vector_length(this, $src2);
 5217     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5218                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5219   %}
 5220   ins_pipe( pipe_slow );
 5221 %}
 5222 
 5223 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5224                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5225                            rFlagsReg cr) %{
 5226   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5227             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5228              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5229             Matcher::vector_length(n->in(2)) >= 4);
 5230   match(Set dst (MinReductionV src1 src2));
 5231   match(Set dst (MaxReductionV src1 src2));
 5232   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5233   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5234   ins_encode %{
 5235     assert(UseAVX > 0, "sanity");
 5236 
 5237     int opcode = this->ideal_Opcode();
 5238     int vlen = Matcher::vector_length(this, $src2);
 5239     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5240                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5241   %}
 5242   ins_pipe( pipe_slow );
 5243 %}
 5244 
 5245 
 5246 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5247                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5248                                rFlagsReg cr) %{
 5249   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5250             Matcher::vector_length(n->in(2)) == 2);
 5251   match(Set dst (MinReductionV dst src));
 5252   match(Set dst (MaxReductionV dst src));
 5253   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5254   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5255   ins_encode %{
 5256     assert(UseAVX > 0, "sanity");
 5257 
 5258     int opcode = this->ideal_Opcode();
 5259     int vlen = Matcher::vector_length(this, $src);
 5260     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5261                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5262   %}
 5263   ins_pipe( pipe_slow );
 5264 %}
 5265 
 5266 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5267                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5268                               rFlagsReg cr) %{
 5269   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5270             Matcher::vector_length(n->in(2)) >= 4);
 5271   match(Set dst (MinReductionV dst src));
 5272   match(Set dst (MaxReductionV dst src));
 5273   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5274   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5275   ins_encode %{
 5276     assert(UseAVX > 0, "sanity");
 5277 
 5278     int opcode = this->ideal_Opcode();
 5279     int vlen = Matcher::vector_length(this, $src);
 5280     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5281                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5282   %}
 5283   ins_pipe( pipe_slow );
 5284 %}
 5285 
 5286 // ====================VECTOR ARITHMETIC=======================================
 5287 
 5288 // --------------------------------- ADD --------------------------------------
 5289 
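      // Most arithmetic rules in this section come in three flavors: a
      // two-operand SSE form that updates $dst in place, a three-operand AVX
      // form, and an AVX form with a folded vector load. The folded load is
      // restricted to vectors larger than 8 bytes, since a full-width memory
      // operand would read past the end of a smaller vector.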
 5290 // Bytes vector add
 5291 instruct vaddB(vec dst, vec src) %{
 5292   predicate(UseAVX == 0);
 5293   match(Set dst (AddVB dst src));
 5294   format %{ "paddb   $dst,$src\t! add packedB" %}
 5295   ins_encode %{
 5296     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5297   %}
 5298   ins_pipe( pipe_slow );
 5299 %}
 5300 
 5301 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5302   predicate(UseAVX > 0);
 5303   match(Set dst (AddVB src1 src2));
 5304   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5305   ins_encode %{
 5306     int vlen_enc = vector_length_encoding(this);
 5307     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5308   %}
 5309   ins_pipe( pipe_slow );
 5310 %}
 5311 
 5312 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5313   predicate((UseAVX > 0) &&
 5314             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5315   match(Set dst (AddVB src (LoadVector mem)));
 5316   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5317   ins_encode %{
 5318     int vlen_enc = vector_length_encoding(this);
 5319     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5320   %}
 5321   ins_pipe( pipe_slow );
 5322 %}
 5323 
 5324 // Shorts/Chars vector add
 5325 instruct vaddS(vec dst, vec src) %{
 5326   predicate(UseAVX == 0);
 5327   match(Set dst (AddVS dst src));
 5328   format %{ "paddw   $dst,$src\t! add packedS" %}
 5329   ins_encode %{
 5330     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5331   %}
 5332   ins_pipe( pipe_slow );
 5333 %}
 5334 
 5335 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5336   predicate(UseAVX > 0);
 5337   match(Set dst (AddVS src1 src2));
 5338   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5339   ins_encode %{
 5340     int vlen_enc = vector_length_encoding(this);
 5341     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5342   %}
 5343   ins_pipe( pipe_slow );
 5344 %}
 5345 
 5346 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5347   predicate((UseAVX > 0) &&
 5348             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5349   match(Set dst (AddVS src (LoadVector mem)));
 5350   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5351   ins_encode %{
 5352     int vlen_enc = vector_length_encoding(this);
 5353     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5354   %}
 5355   ins_pipe( pipe_slow );
 5356 %}
 5357 
 5358 // Integers vector add
 5359 instruct vaddI(vec dst, vec src) %{
 5360   predicate(UseAVX == 0);
 5361   match(Set dst (AddVI dst src));
 5362   format %{ "paddd   $dst,$src\t! add packedI" %}
 5363   ins_encode %{
 5364     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5365   %}
 5366   ins_pipe( pipe_slow );
 5367 %}
 5368 
 5369 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5370   predicate(UseAVX > 0);
 5371   match(Set dst (AddVI src1 src2));
 5372   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5373   ins_encode %{
 5374     int vlen_enc = vector_length_encoding(this);
 5375     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5376   %}
 5377   ins_pipe( pipe_slow );
 5378 %}
 5379 
 5380 
 5381 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5382   predicate((UseAVX > 0) &&
 5383             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5384   match(Set dst (AddVI src (LoadVector mem)));
 5385   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5386   ins_encode %{
 5387     int vlen_enc = vector_length_encoding(this);
 5388     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5389   %}
 5390   ins_pipe( pipe_slow );
 5391 %}
 5392 
 5393 // Longs vector add
 5394 instruct vaddL(vec dst, vec src) %{
 5395   predicate(UseAVX == 0);
 5396   match(Set dst (AddVL dst src));
 5397   format %{ "paddq   $dst,$src\t! add packedL" %}
 5398   ins_encode %{
 5399     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5400   %}
 5401   ins_pipe( pipe_slow );
 5402 %}
 5403 
 5404 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5405   predicate(UseAVX > 0);
 5406   match(Set dst (AddVL src1 src2));
 5407   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5408   ins_encode %{
 5409     int vlen_enc = vector_length_encoding(this);
 5410     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5411   %}
 5412   ins_pipe( pipe_slow );
 5413 %}
 5414 
 5415 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5416   predicate((UseAVX > 0) &&
 5417             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5418   match(Set dst (AddVL src (LoadVector mem)));
 5419   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5420   ins_encode %{
 5421     int vlen_enc = vector_length_encoding(this);
 5422     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5423   %}
 5424   ins_pipe( pipe_slow );
 5425 %}
 5426 
 5427 // Floats vector add
 5428 instruct vaddF(vec dst, vec src) %{
 5429   predicate(UseAVX == 0);
 5430   match(Set dst (AddVF dst src));
 5431   format %{ "addps   $dst,$src\t! add packedF" %}
 5432   ins_encode %{
 5433     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5434   %}
 5435   ins_pipe( pipe_slow );
 5436 %}
 5437 
 5438 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5439   predicate(UseAVX > 0);
 5440   match(Set dst (AddVF src1 src2));
 5441   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5442   ins_encode %{
 5443     int vlen_enc = vector_length_encoding(this);
 5444     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5450   predicate((UseAVX > 0) &&
 5451             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5452   match(Set dst (AddVF src (LoadVector mem)));
 5453   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5454   ins_encode %{
 5455     int vlen_enc = vector_length_encoding(this);
 5456     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5457   %}
 5458   ins_pipe( pipe_slow );
 5459 %}
 5460 
 5461 // Doubles vector add
 5462 instruct vaddD(vec dst, vec src) %{
 5463   predicate(UseAVX == 0);
 5464   match(Set dst (AddVD dst src));
 5465   format %{ "addpd   $dst,$src\t! add packedD" %}
 5466   ins_encode %{
 5467     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5468   %}
 5469   ins_pipe( pipe_slow );
 5470 %}
 5471 
 5472 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5473   predicate(UseAVX > 0);
 5474   match(Set dst (AddVD src1 src2));
 5475   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5476   ins_encode %{
 5477     int vlen_enc = vector_length_encoding(this);
 5478     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5479   %}
 5480   ins_pipe( pipe_slow );
 5481 %}
 5482 
 5483 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5484   predicate((UseAVX > 0) &&
 5485             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5486   match(Set dst (AddVD src (LoadVector mem)));
 5487   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5488   ins_encode %{
 5489     int vlen_enc = vector_length_encoding(this);
 5490     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5491   %}
 5492   ins_pipe( pipe_slow );
 5493 %}
 5494 
 5495 // --------------------------------- SUB --------------------------------------
 5496 
 5497 // Bytes vector sub
 5498 instruct vsubB(vec dst, vec src) %{
 5499   predicate(UseAVX == 0);
 5500   match(Set dst (SubVB dst src));
 5501   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5502   ins_encode %{
 5503     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5509   predicate(UseAVX > 0);
 5510   match(Set dst (SubVB src1 src2));
 5511   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5512   ins_encode %{
 5513     int vlen_enc = vector_length_encoding(this);
 5514     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5515   %}
 5516   ins_pipe( pipe_slow );
 5517 %}
 5518 
 5519 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5520   predicate((UseAVX > 0) &&
 5521             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5522   match(Set dst (SubVB src (LoadVector mem)));
 5523   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5524   ins_encode %{
 5525     int vlen_enc = vector_length_encoding(this);
 5526     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5527   %}
 5528   ins_pipe( pipe_slow );
 5529 %}
 5530 
 5531 // Shorts/Chars vector sub
 5532 instruct vsubS(vec dst, vec src) %{
 5533   predicate(UseAVX == 0);
 5534   match(Set dst (SubVS dst src));
 5535   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5536   ins_encode %{
 5537     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 
 5543 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5544   predicate(UseAVX > 0);
 5545   match(Set dst (SubVS src1 src2));
 5546   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5547   ins_encode %{
 5548     int vlen_enc = vector_length_encoding(this);
 5549     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5550   %}
 5551   ins_pipe( pipe_slow );
 5552 %}
 5553 
 5554 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5555   predicate((UseAVX > 0) &&
 5556             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5557   match(Set dst (SubVS src (LoadVector mem)));
 5558   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5559   ins_encode %{
 5560     int vlen_enc = vector_length_encoding(this);
 5561     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5562   %}
 5563   ins_pipe( pipe_slow );
 5564 %}
 5565 
 5566 // Integers vector sub
 5567 instruct vsubI(vec dst, vec src) %{
 5568   predicate(UseAVX == 0);
 5569   match(Set dst (SubVI dst src));
 5570   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5571   ins_encode %{
 5572     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5573   %}
 5574   ins_pipe( pipe_slow );
 5575 %}
 5576 
 5577 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5578   predicate(UseAVX > 0);
 5579   match(Set dst (SubVI src1 src2));
 5580   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5581   ins_encode %{
 5582     int vlen_enc = vector_length_encoding(this);
 5583     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5584   %}
 5585   ins_pipe( pipe_slow );
 5586 %}
 5587 
 5588 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5589   predicate((UseAVX > 0) &&
 5590             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5591   match(Set dst (SubVI src (LoadVector mem)));
 5592   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5593   ins_encode %{
 5594     int vlen_enc = vector_length_encoding(this);
 5595     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5596   %}
 5597   ins_pipe( pipe_slow );
 5598 %}
 5599 
 5600 // Longs vector sub
 5601 instruct vsubL(vec dst, vec src) %{
 5602   predicate(UseAVX == 0);
 5603   match(Set dst (SubVL dst src));
 5604   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5605   ins_encode %{
 5606     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5612   predicate(UseAVX > 0);
 5613   match(Set dst (SubVL src1 src2));
 5614   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5615   ins_encode %{
 5616     int vlen_enc = vector_length_encoding(this);
 5617     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5618   %}
 5619   ins_pipe( pipe_slow );
 5620 %}
 5621 
 5622 
 5623 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5624   predicate((UseAVX > 0) &&
 5625             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5626   match(Set dst (SubVL src (LoadVector mem)));
 5627   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5628   ins_encode %{
 5629     int vlen_enc = vector_length_encoding(this);
 5630     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5631   %}
 5632   ins_pipe( pipe_slow );
 5633 %}
 5634 
 5635 // Floats vector sub
 5636 instruct vsubF(vec dst, vec src) %{
 5637   predicate(UseAVX == 0);
 5638   match(Set dst (SubVF dst src));
 5639   format %{ "subps   $dst,$src\t! sub packedF" %}
 5640   ins_encode %{
 5641     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5642   %}
 5643   ins_pipe( pipe_slow );
 5644 %}
 5645 
 5646 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5647   predicate(UseAVX > 0);
 5648   match(Set dst (SubVF src1 src2));
 5649   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5650   ins_encode %{
 5651     int vlen_enc = vector_length_encoding(this);
 5652     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5653   %}
 5654   ins_pipe( pipe_slow );
 5655 %}
 5656 
 5657 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5658   predicate((UseAVX > 0) &&
 5659             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5660   match(Set dst (SubVF src (LoadVector mem)));
 5661   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5662   ins_encode %{
 5663     int vlen_enc = vector_length_encoding(this);
 5664     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5665   %}
 5666   ins_pipe( pipe_slow );
 5667 %}
 5668 
 5669 // Doubles vector sub
 5670 instruct vsubD(vec dst, vec src) %{
 5671   predicate(UseAVX == 0);
 5672   match(Set dst (SubVD dst src));
 5673   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5674   ins_encode %{
 5675     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5676   %}
 5677   ins_pipe( pipe_slow );
 5678 %}
 5679 
 5680 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5681   predicate(UseAVX > 0);
 5682   match(Set dst (SubVD src1 src2));
 5683   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5684   ins_encode %{
 5685     int vlen_enc = vector_length_encoding(this);
 5686     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5687   %}
 5688   ins_pipe( pipe_slow );
 5689 %}
 5690 
 5691 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5692   predicate((UseAVX > 0) &&
 5693             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5694   match(Set dst (SubVD src (LoadVector mem)));
 5695   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5696   ins_encode %{
 5697     int vlen_enc = vector_length_encoding(this);
 5698     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5699   %}
 5700   ins_pipe( pipe_slow );
 5701 %}
 5702 
 5703 // --------------------------------- MUL --------------------------------------
 5704 
 5705 // Byte vector mul
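      // There is no byte multiply instruction in SSE/AVX. For vectors of up
      // to 8 bytes the lanes are sign-extended to words, multiplied with
      // pmullw, truncated back to the low 8 bits and re-packed.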
 5706 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5707   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5708   match(Set dst (MulVB src1 src2));
 5709   effect(TEMP dst, TEMP xtmp);
 5710   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5711   ins_encode %{
 5712     assert(UseSSE > 3, "required");
 5713     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5714     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5715     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5716     __ psllw($dst$$XMMRegister, 8);
 5717     __ psrlw($dst$$XMMRegister, 8);
 5718     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5719   %}
 5720   ins_pipe( pipe_slow );
 5721 %}
 5722 
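      // For wider vectors the widened words would no longer fit in a single
      // register, so the odd and even byte lanes are handled separately: the
      // odd lanes are isolated with a 16-bit right shift, multiplied as words
      // and shifted back into place; the even lanes are multiplied in place
      // and masked down to 8 bits; a final OR recombines the two halves.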
 5723 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5724   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5725   match(Set dst (MulVB src1 src2));
 5726   effect(TEMP dst, TEMP xtmp);
 5727   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5728   ins_encode %{
 5729     assert(UseSSE > 3, "required");
 5730     // Odd-index elements
 5731     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5732     __ psrlw($dst$$XMMRegister, 8);
 5733     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5734     __ psrlw($xtmp$$XMMRegister, 8);
 5735     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5736     __ psllw($dst$$XMMRegister, 8);
 5737     // Even-index elements
 5738     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5739     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5740     __ psllw($xtmp$$XMMRegister, 8);
 5741     __ psrlw($xtmp$$XMMRegister, 8);
 5742     // Combine
 5743     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5744   %}
 5745   ins_pipe( pipe_slow );
 5746 %}
 5747 
 5748 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5749   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5750   match(Set dst (MulVB src1 src2));
 5751   effect(TEMP xtmp1, TEMP xtmp2);
 5752   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5753   ins_encode %{
 5754     int vlen_enc = vector_length_encoding(this);
 5755     // Odd-index elements
 5756     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5757     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5758     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5759     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5760     // Even-index elements
 5761     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5762     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5763     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5764     // Combine
 5765     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5766   %}
 5767   ins_pipe( pipe_slow );
 5768 %}
 5769 
 5770 // Shorts/Chars vector mul
 5771 instruct vmulS(vec dst, vec src) %{
 5772   predicate(UseAVX == 0);
 5773   match(Set dst (MulVS dst src));
 5774   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5775   ins_encode %{
 5776     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5777   %}
 5778   ins_pipe( pipe_slow );
 5779 %}
 5780 
 5781 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5782   predicate(UseAVX > 0);
 5783   match(Set dst (MulVS src1 src2));
 5784   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5785   ins_encode %{
 5786     int vlen_enc = vector_length_encoding(this);
 5787     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5788   %}
 5789   ins_pipe( pipe_slow );
 5790 %}
 5791 
 5792 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5793   predicate((UseAVX > 0) &&
 5794             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5795   match(Set dst (MulVS src (LoadVector mem)));
 5796   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5797   ins_encode %{
 5798     int vlen_enc = vector_length_encoding(this);
 5799     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5800   %}
 5801   ins_pipe( pipe_slow );
 5802 %}
 5803 
 5804 // Integers vector mul
 5805 instruct vmulI(vec dst, vec src) %{
 5806   predicate(UseAVX == 0);
 5807   match(Set dst (MulVI dst src));
 5808   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5809   ins_encode %{
 5810     assert(UseSSE > 3, "required");
 5811     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5812   %}
 5813   ins_pipe( pipe_slow );
 5814 %}
 5815 
 5816 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5817   predicate(UseAVX > 0);
 5818   match(Set dst (MulVI src1 src2));
 5819   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5820   ins_encode %{
 5821     int vlen_enc = vector_length_encoding(this);
 5822     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5823   %}
 5824   ins_pipe( pipe_slow );
 5825 %}
 5826 
 5827 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5828   predicate((UseAVX > 0) &&
 5829             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5830   match(Set dst (MulVI src (LoadVector mem)));
 5831   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5832   ins_encode %{
 5833     int vlen_enc = vector_length_encoding(this);
 5834     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5835   %}
 5836   ins_pipe( pipe_slow );
 5837 %}
 5838 
 5839 // Longs vector mul
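      // vpmullq, the only true packed 64-bit multiply, requires AVX512DQ
      // (plus AVX512VL for the sub-512-bit forms); without it the multiply
      // is synthesized from 32-bit multiplies further below.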
 5840 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5841   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5842              VM_Version::supports_avx512dq()) ||
 5843             VM_Version::supports_avx512vldq());
 5844   match(Set dst (MulVL src1 src2));
 5845   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5846   ins_encode %{
 5847     assert(UseAVX > 2, "required");
 5848     int vlen_enc = vector_length_encoding(this);
 5849     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5850   %}
 5851   ins_pipe( pipe_slow );
 5852 %}
 5853 
 5854 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5855   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5856              VM_Version::supports_avx512dq()) ||
 5857             (Matcher::vector_length_in_bytes(n) > 8 &&
 5858              VM_Version::supports_avx512vldq()));
 5859   match(Set dst (MulVL src (LoadVector mem)));
 5860   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5861   ins_encode %{
 5862     assert(UseAVX > 2, "required");
 5863     int vlen_enc = vector_length_encoding(this);
 5864     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5865   %}
 5866   ins_pipe( pipe_slow );
 5867 %}
 5868 
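      // The synthesized forms rely on the identity
      //   a * b mod 2^64 = (((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo) mod 2^64
      // pshufd swaps the 32-bit halves of each long so that a single packed
      // 32-bit multiply yields both cross products, which are summed and
      // shifted into the high half; pmuludq supplies the unsigned lo*lo term.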
 5869 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5870   predicate(UseAVX == 0);
 5871   match(Set dst (MulVL src1 src2));
 5872   effect(TEMP dst, TEMP xtmp);
 5873   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5874   ins_encode %{
 5875     assert(VM_Version::supports_sse4_1(), "required");
 5876     // Get the lo-hi cross products; only their lower 32 bits are needed
 5877     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5878     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5879     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5880     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5881     __ psllq($dst$$XMMRegister, 32);
 5882     // Get the lo-lo products
 5883     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5884     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5885     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5886   %}
 5887   ins_pipe( pipe_slow );
 5888 %}
 5889 
 5890 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5891   predicate(UseAVX > 0 &&
 5892             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5893               !VM_Version::supports_avx512dq()) ||
 5894              (Matcher::vector_length_in_bytes(n) < 64 &&
 5895               !VM_Version::supports_avx512vldq())));
 5896   match(Set dst (MulVL src1 src2));
 5897   effect(TEMP xtmp1, TEMP xtmp2);
 5898   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5899   ins_encode %{
 5900     int vlen_enc = vector_length_encoding(this);
 5901     // Get the lo-hi cross products; only their lower 32 bits are needed
 5902     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5903     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5904     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5905     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5906     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5907     // Get the lo-lo products
 5908     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5909     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5910   %}
 5911   ins_pipe( pipe_slow );
 5912 %}
 5913 
 5914 // Floats vector mul
 5915 instruct vmulF(vec dst, vec src) %{
 5916   predicate(UseAVX == 0);
 5917   match(Set dst (MulVF dst src));
 5918   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5919   ins_encode %{
 5920     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5921   %}
 5922   ins_pipe( pipe_slow );
 5923 %}
 5924 
 5925 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5926   predicate(UseAVX > 0);
 5927   match(Set dst (MulVF src1 src2));
 5928   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5929   ins_encode %{
 5930     int vlen_enc = vector_length_encoding(this);
 5931     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5932   %}
 5933   ins_pipe( pipe_slow );
 5934 %}
 5935 
 5936 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5937   predicate((UseAVX > 0) &&
 5938             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5939   match(Set dst (MulVF src (LoadVector mem)));
 5940   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5941   ins_encode %{
 5942     int vlen_enc = vector_length_encoding(this);
 5943     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5944   %}
 5945   ins_pipe( pipe_slow );
 5946 %}
 5947 
 5948 // Doubles vector mul
 5949 instruct vmulD(vec dst, vec src) %{
 5950   predicate(UseAVX == 0);
 5951   match(Set dst (MulVD dst src));
 5952   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5953   ins_encode %{
 5954     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5955   %}
 5956   ins_pipe( pipe_slow );
 5957 %}
 5958 
 5959 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5960   predicate(UseAVX > 0);
 5961   match(Set dst (MulVD src1 src2));
 5962   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5963   ins_encode %{
 5964     int vlen_enc = vector_length_encoding(this);
 5965     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5966   %}
 5967   ins_pipe( pipe_slow );
 5968 %}
 5969 
 5970 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5971   predicate((UseAVX > 0) &&
 5972             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5973   match(Set dst (MulVD src (LoadVector mem)));
 5974   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5975   ins_encode %{
 5976     int vlen_enc = vector_length_encoding(this);
 5977     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5978   %}
 5979   ins_pipe( pipe_slow );
 5980 %}
 5981 
 5982 // --------------------------------- DIV --------------------------------------
 5983 
 5984 // Floats vector div
 5985 instruct vdivF(vec dst, vec src) %{
 5986   predicate(UseAVX == 0);
 5987   match(Set dst (DivVF dst src));
 5988   format %{ "divps   $dst,$src\t! div packedF" %}
 5989   ins_encode %{
 5990     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5991   %}
 5992   ins_pipe( pipe_slow );
 5993 %}
 5994 
 5995 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5996   predicate(UseAVX > 0);
 5997   match(Set dst (DivVF src1 src2));
 5998   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5999   ins_encode %{
 6000     int vlen_enc = vector_length_encoding(this);
 6001     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6002   %}
 6003   ins_pipe( pipe_slow );
 6004 %}
 6005 
 6006 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6007   predicate((UseAVX > 0) &&
 6008             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6009   match(Set dst (DivVF src (LoadVector mem)));
 6010   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6011   ins_encode %{
 6012     int vlen_enc = vector_length_encoding(this);
 6013     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6014   %}
 6015   ins_pipe( pipe_slow );
 6016 %}
 6017 
 6018 // Doubles vector div
 6019 instruct vdivD(vec dst, vec src) %{
 6020   predicate(UseAVX == 0);
 6021   match(Set dst (DivVD dst src));
 6022   format %{ "divpd   $dst,$src\t! div packedD" %}
 6023   ins_encode %{
 6024     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6025   %}
 6026   ins_pipe( pipe_slow );
 6027 %}
 6028 
 6029 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6030   predicate(UseAVX > 0);
 6031   match(Set dst (DivVD src1 src2));
 6032   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6033   ins_encode %{
 6034     int vlen_enc = vector_length_encoding(this);
 6035     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6036   %}
 6037   ins_pipe( pipe_slow );
 6038 %}
 6039 
 6040 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6041   predicate((UseAVX > 0) &&
 6042             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6043   match(Set dst (DivVD src (LoadVector mem)));
 6044   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6045   ins_encode %{
 6046     int vlen_enc = vector_length_encoding(this);
 6047     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6048   %}
 6049   ins_pipe( pipe_slow );
 6050 %}
 6051 
 6052 // ------------------------------ MinMax ---------------------------------------
 6053 
 6054 // Byte, Short, Int vector Min/Max
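      // pminmax() maps to the packed pmins*/pmaxs* instructions; the signed
      // byte and dword forms only exist from SSE4.1 on, hence the
      // UseSSE >= 4 assert below.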
 6055 instruct minmax_reg_sse(vec dst, vec src) %{
 6056   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6057             UseAVX == 0);
 6058   match(Set dst (MinV dst src));
 6059   match(Set dst (MaxV dst src));
 6060   format %{ "vector_minmax  $dst,$src\t!  " %}
 6061   ins_encode %{
 6062     assert(UseSSE >= 4, "required");
 6063 
 6064     int opcode = this->ideal_Opcode();
 6065     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6066     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6067   %}
 6068   ins_pipe( pipe_slow );
 6069 %}
 6070 
 6071 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6072   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6073             UseAVX > 0);
 6074   match(Set dst (MinV src1 src2));
 6075   match(Set dst (MaxV src1 src2));
 6076   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6077   ins_encode %{
 6078     int opcode = this->ideal_Opcode();
 6079     int vlen_enc = vector_length_encoding(this);
 6080     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6081 
 6082     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6083   %}
 6084   ins_pipe( pipe_slow );
 6085 %}
 6086 
 6087 // Long vector Min/Max
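      // There is no packed 64-bit min/max before AVX-512 (vpminsq/vpmaxsq),
      // so the pre-EVEX rules synthesize it with a compare-and-blend
      // sequence.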
 6088 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6089   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6090             UseAVX == 0);
 6091   match(Set dst (MinV dst src));
 6092   match(Set dst (MaxV src dst));
 6093   effect(TEMP dst, TEMP tmp);
 6094   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6095   ins_encode %{
 6096     assert(UseSSE >= 4, "required");
 6097 
 6098     int opcode = this->ideal_Opcode();
 6099     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6100     assert(elem_bt == T_LONG, "sanity");
 6101 
 6102     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6103   %}
 6104   ins_pipe( pipe_slow );
 6105 %}
 6106 
 6107 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6108   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6109             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6110   match(Set dst (MinV src1 src2));
 6111   match(Set dst (MaxV src1 src2));
 6112   effect(TEMP dst);
 6113   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6114   ins_encode %{
 6115     int vlen_enc = vector_length_encoding(this);
 6116     int opcode = this->ideal_Opcode();
 6117     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6118     assert(elem_bt == T_LONG, "sanity");
 6119 
 6120     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6121   %}
 6122   ins_pipe( pipe_slow );
 6123 %}
 6124 
 6125 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6126   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6127             Matcher::vector_element_basic_type(n) == T_LONG);
 6128   match(Set dst (MinV src1 src2));
 6129   match(Set dst (MaxV src1 src2));
 6130   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6131   ins_encode %{
 6132     assert(UseAVX > 2, "required");
 6133 
 6134     int vlen_enc = vector_length_encoding(this);
 6135     int opcode = this->ideal_Opcode();
 6136     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6137     assert(elem_bt == T_LONG, "sanity");
 6138 
 6139     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6140   %}
 6141   ins_pipe( pipe_slow );
 6142 %}
 6143 
 6144 // Float/Double vector Min/Max
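      // Java's Math.min/max semantics differ from raw minps/maxps: -0.0
      // orders below +0.0 and a NaN in either input must produce NaN.
      // vminmax_fp() and evminmax_fp() implement those rules with blend
      // sequences, which is why the extra temporaries (and, on EVEX, the
      // mask register) are needed.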
 6145 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6146   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6147             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6148             UseAVX > 0);
 6149   match(Set dst (MinV a b));
 6150   match(Set dst (MaxV a b));
 6151   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6152   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6153   ins_encode %{
 6154     assert(UseAVX > 0, "required");
 6155 
 6156     int opcode = this->ideal_Opcode();
 6157     int vlen_enc = vector_length_encoding(this);
 6158     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6159 
 6160     __ vminmax_fp(opcode, elem_bt,
 6161                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6162                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6163   %}
 6164   ins_pipe( pipe_slow );
 6165 %}
 6166 
 6167 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6168   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6169             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6170   match(Set dst (MinV a b));
 6171   match(Set dst (MaxV a b));
 6172   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6173   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6174   ins_encode %{
 6175     assert(UseAVX > 2, "required");
 6176 
 6177     int opcode = this->ideal_Opcode();
 6178     int vlen_enc = vector_length_encoding(this);
 6179     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6180 
 6181     __ evminmax_fp(opcode, elem_bt,
 6182                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6183                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6184   %}
 6185   ins_pipe( pipe_slow );
 6186 %}
 6187 
 6188 // --------------------------------- Signum/CopySign ---------------------------
 6189 
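      // signum_fp() implements Math.signum: -1.0 for negative inputs, 1.0
      // for positive ones, while ±0.0 and NaN are returned unchanged.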
 6190 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6191   match(Set dst (SignumF dst (Binary zero one)));
 6192   effect(KILL cr);
 6193   format %{ "signumF $dst, $dst" %}
 6194   ins_encode %{
 6195     int opcode = this->ideal_Opcode();
 6196     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6197   %}
 6198   ins_pipe( pipe_slow );
 6199 %}
 6200 
 6201 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6202   match(Set dst (SignumD dst (Binary zero one)));
 6203   effect(KILL cr);
 6204   format %{ "signumD $dst, $dst" %}
 6205   ins_encode %{
 6206     int opcode = this->ideal_Opcode();
 6207     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6208   %}
 6209   ins_pipe( pipe_slow );
 6210 %}
 6211 
 6212 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6213   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6214   match(Set dst (SignumVF src (Binary zero one)));
 6215   match(Set dst (SignumVD src (Binary zero one)));
 6216   effect(TEMP dst, TEMP xtmp1);
 6217   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6218   ins_encode %{
 6219     int opcode = this->ideal_Opcode();
 6220     int vec_enc = vector_length_encoding(this);
 6221     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6222                          $xtmp1$$XMMRegister, vec_enc);
 6223   %}
 6224   ins_pipe( pipe_slow );
 6225 %}
 6226 
 6227 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6228   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6229   match(Set dst (SignumVF src (Binary zero one)));
 6230   match(Set dst (SignumVD src (Binary zero one)));
 6231   effect(TEMP dst, TEMP ktmp1);
 6232   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6233   ins_encode %{
 6234     int opcode = this->ideal_Opcode();
 6235     int vec_enc = vector_length_encoding(this);
 6236     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6237                           $ktmp1$$KRegister, vec_enc);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 // ---------------------------------------
 6243 // For CopySign, 0xE4 is used as the imm8 truth-table selector for vpternlog
 6244 // Desired truth table: A -> dst bit (magnitude), B -> src bit (sign), C -> mask bit
 6245 // C (the mask) is set to 0x7FFFFFFF, i.e. every bit except the sign bit
 6246 // Wherever C is 0 (the sign bit), pick from B (the sign source)
 6247 // Wherever C is 1 (the magnitude bits), pick from A (the magnitude source)
 6248 //
 6249 // A B C Result
 6250 // 0 0 0 0
 6251 // 0 0 1 0
 6252 // 0 1 0 1
 6253 // 0 1 1 0
 6254 // 1 0 0 0
 6255 // 1 0 1 1
 6256 // 1 1 0 1
 6257 // 1 1 1 1
 6258 //
 6259 // Result read from the high bit down to the low bit is 0b11100100 = 0xE4
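      // In other words the selector computes result = (A & C) | (B & ~C):
      // magnitude bits from A, the sign bit from B.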
 6260 // ---------------------------------------
 6261 
 6262 #ifdef _LP64
 6263 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6264   match(Set dst (CopySignF dst src));
 6265   effect(TEMP tmp1, TEMP tmp2);
 6266   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6267   ins_encode %{
 6268     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6269     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6270     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6271   %}
 6272   ins_pipe( pipe_slow );
 6273 %}
 6274 
 6275 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6276   match(Set dst (CopySignD dst (Binary src zero)));
 6277   ins_cost(100);
 6278   effect(TEMP tmp1, TEMP tmp2);
 6279   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6280   ins_encode %{
 6281     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6282     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6283     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6284   %}
 6285   ins_pipe( pipe_slow );
 6286 %}
 6287 
 6288 #endif // _LP64
 6289 
 6290 //----------------------------- CompressBits/ExpandBits ------------------------
 6291 
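      // Worked example (illustrative values): pext(src=0b10110100,
      // mask=0b11110000) gathers the four src bits under the set mask bits
      // into 0b00001011, and pdep(src=0b00001011, mask=0b11110000) scatters
      // them back out to 0b10110000.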
 6292 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6293   predicate(n->bottom_type()->isa_int());
 6294   match(Set dst (CompressBits src mask));
 6295   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6296   ins_encode %{
 6297     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6298   %}
 6299   ins_pipe( pipe_slow );
 6300 %}
 6301 
 6302 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6303   predicate(n->bottom_type()->isa_int());
 6304   match(Set dst (ExpandBits src mask));
 6305   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6306   ins_encode %{
 6307     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6308   %}
 6309   ins_pipe( pipe_slow );
 6310 %}
 6311 
 6312 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6313   predicate(n->bottom_type()->isa_int());
 6314   match(Set dst (CompressBits src (LoadI mask)));
 6315   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6316   ins_encode %{
 6317     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6318   %}
 6319   ins_pipe( pipe_slow );
 6320 %}
 6321 
 6322 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6323   predicate(n->bottom_type()->isa_int());
 6324   match(Set dst (ExpandBits src (LoadI mask)));
 6325   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6326   ins_encode %{
 6327     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6328   %}
 6329   ins_pipe( pipe_slow );
 6330 %}
 6331 
 6332 // --------------------------------- Sqrt --------------------------------------
 6333 
 6334 instruct vsqrtF_reg(vec dst, vec src) %{
 6335   match(Set dst (SqrtVF src));
 6336   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6337   ins_encode %{
 6338     assert(UseAVX > 0, "required");
 6339     int vlen_enc = vector_length_encoding(this);
 6340     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6341   %}
 6342   ins_pipe( pipe_slow );
 6343 %}
 6344 
 6345 instruct vsqrtF_mem(vec dst, memory mem) %{
 6346   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6347   match(Set dst (SqrtVF (LoadVector mem)));
 6348   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6349   ins_encode %{
 6350     assert(UseAVX > 0, "required");
 6351     int vlen_enc = vector_length_encoding(this);
 6352     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6353   %}
 6354   ins_pipe( pipe_slow );
 6355 %}
 6356 
 6357 // Doubles vector sqrt
 6358 instruct vsqrtD_reg(vec dst, vec src) %{
 6359   match(Set dst (SqrtVD src));
 6360   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6361   ins_encode %{
 6362     assert(UseAVX > 0, "required");
 6363     int vlen_enc = vector_length_encoding(this);
 6364     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6365   %}
 6366   ins_pipe( pipe_slow );
 6367 %}
 6368 
 6369 instruct vsqrtD_mem(vec dst, memory mem) %{
 6370   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6371   match(Set dst (SqrtVD (LoadVector mem)));
 6372   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6373   ins_encode %{
 6374     assert(UseAVX > 0, "required");
 6375     int vlen_enc = vector_length_encoding(this);
 6376     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6377   %}
 6378   ins_pipe( pipe_slow );
 6379 %}
 6380 
 6381 // ------------------------------ Shift ---------------------------------------
 6382 
 6383 // Left and right shift count vectors are the same on x86
 6384 // (only lowest bits of xmm reg are used for count).
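      // movdl zero-extends the 32-bit count, and the packed shifts read their
      // count from the low 64 bits of the xmm register, so one count register
      // serves every lane width.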
 6385 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6386   match(Set dst (LShiftCntV cnt));
 6387   match(Set dst (RShiftCntV cnt));
 6388   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6389   ins_encode %{
 6390     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6391   %}
 6392   ins_pipe( pipe_slow );
 6393 %}
 6394 
 6395 // Byte vector shift
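      // There are no per-byte shift instructions; the bytes are sign- or
      // zero-extended to 16-bit lanes, shifted as words, masked back down to
      // 8 bits and re-packed.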
 6396 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6397   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6398   match(Set dst ( LShiftVB src shift));
 6399   match(Set dst ( RShiftVB src shift));
 6400   match(Set dst (URShiftVB src shift));
 6401   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6402   format %{"vector_byte_shift $dst,$src,$shift" %}
 6403   ins_encode %{
 6404     assert(UseSSE > 3, "required");
 6405     int opcode = this->ideal_Opcode();
 6406     bool sign = (opcode != Op_URShiftVB);
 6407     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6408     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6409     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6410     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6411     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6412   %}
 6413   ins_pipe( pipe_slow );
 6414 %}
 6415 
 6416 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6417   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6418             UseAVX <= 1);
 6419   match(Set dst ( LShiftVB src shift));
 6420   match(Set dst ( RShiftVB src shift));
 6421   match(Set dst (URShiftVB src shift));
 6422   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6423   format %{"vector_byte_shift $dst,$src,$shift" %}
 6424   ins_encode %{
 6425     assert(UseSSE > 3, "required");
 6426     int opcode = this->ideal_Opcode();
 6427     bool sign = (opcode != Op_URShiftVB);
 6428     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6429     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6430     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6431     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6432     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6433     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6434     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6435     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6436     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6437   %}
 6438   ins_pipe( pipe_slow );
 6439 %}
 6440 
 6441 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6442   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6443             UseAVX > 1);
 6444   match(Set dst ( LShiftVB src shift));
 6445   match(Set dst ( RShiftVB src shift));
 6446   match(Set dst (URShiftVB src shift));
 6447   effect(TEMP dst, TEMP tmp);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6449   ins_encode %{
 6450     int opcode = this->ideal_Opcode();
 6451     bool sign = (opcode != Op_URShiftVB);
 6452     int vlen_enc = Assembler::AVX_256bit;
 6453     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6454     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6455     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6456     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6457     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6458   %}
 6459   ins_pipe( pipe_slow );
 6460 %}
 6461 
 6462 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6463   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6464   match(Set dst ( LShiftVB src shift));
 6465   match(Set dst ( RShiftVB src shift));
 6466   match(Set dst (URShiftVB src shift));
 6467   effect(TEMP dst, TEMP tmp);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6469   ins_encode %{
 6470     assert(UseAVX > 1, "required");
 6471     int opcode = this->ideal_Opcode();
 6472     bool sign = (opcode != Op_URShiftVB);
 6473     int vlen_enc = Assembler::AVX_256bit;
 6474     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6475     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6476     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6477     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6478     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6479     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6480     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6481     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
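    // imm 0xD8 selects quadwords 0,2,1,3, undoing the per-128-bit-lane
    // interleaving introduced by vpackuswb.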
 6482     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6483   %}
 6484   ins_pipe( pipe_slow );
 6485 %}
 6486 
 6487 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6488   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6489   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
 6491   match(Set dst (URShiftVB src shift));
 6492   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6494   ins_encode %{
 6495     assert(UseAVX > 2, "required");
 6496     int opcode = this->ideal_Opcode();
 6497     bool sign = (opcode != Op_URShiftVB);
 6498     int vlen_enc = Assembler::AVX_512bit;
 6499     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6500     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6501     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6502     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6503     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6504     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6505     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6506     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6507     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6508     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
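    // vpackuswb packs within 128-bit lanes; the table-driven vpermq below
    // restores linear element order.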
 6509     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6510     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6511   %}
 6512   ins_pipe( pipe_slow );
 6513 %}
 6514 
// Short vector logical right shift produces an incorrect Java result for
// negative data, because Java promotes a short to an int with sign
// extension before the shift. Char vectors are fine, though, since chars
// are unsigned.
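// For example, with short s = -4 (0xFFFC), Java evaluates s >>> 1 as
// 0xFFFFFFFC >>> 1 = 0x7FFFFFFE, which truncates to (short)0xFFFE, whereas
// a 16-bit lane-wise logical shift would produce 0xFFFC >>> 1 = 0x7FFE.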
// Shorts/Chars vector shift
 6520 instruct vshiftS(vec dst, vec src, vec shift) %{
 6521   predicate(!n->as_ShiftV()->is_var_shift());
 6522   match(Set dst ( LShiftVS src shift));
 6523   match(Set dst ( RShiftVS src shift));
 6524   match(Set dst (URShiftVS src shift));
 6525   effect(TEMP dst, USE src, USE shift);
 6526   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6527   ins_encode %{
 6528     int opcode = this->ideal_Opcode();
 6529     if (UseAVX > 0) {
 6530       int vlen_enc = vector_length_encoding(this);
 6531       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6532     } else {
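      // SSE shifts are two-operand (destructive), so copy src into dst first;
      // the copy width matches the vector size in bytes.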
 6533       int vlen = Matcher::vector_length(this);
 6534       if (vlen == 2) {
 6535         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6536         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6537       } else if (vlen == 4) {
 6538         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6539         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6540       } else {
 6541         assert (vlen == 8, "sanity");
 6542         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6543         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6544       }
 6545     }
 6546   %}
 6547   ins_pipe( pipe_slow );
 6548 %}
 6549 
// Integers vector shift
 6551 instruct vshiftI(vec dst, vec src, vec shift) %{
 6552   predicate(!n->as_ShiftV()->is_var_shift());
 6553   match(Set dst ( LShiftVI src shift));
 6554   match(Set dst ( RShiftVI src shift));
 6555   match(Set dst (URShiftVI src shift));
 6556   effect(TEMP dst, USE src, USE shift);
 6557   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6558   ins_encode %{
 6559     int opcode = this->ideal_Opcode();
 6560     if (UseAVX > 0) {
 6561       int vlen_enc = vector_length_encoding(this);
 6562       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6563     } else {
 6564       int vlen = Matcher::vector_length(this);
 6565       if (vlen == 2) {
 6566         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6567         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6568       } else {
 6569         assert(vlen == 4, "sanity");
 6570         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6571         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6572       }
 6573     }
 6574   %}
 6575   ins_pipe( pipe_slow );
 6576 %}
 6577 
// Integers vector constant shift
 6579 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6580   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6581   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6582   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6583   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6584   ins_encode %{
 6585     int opcode = this->ideal_Opcode();
 6586     if (UseAVX > 0) {
 6587       int vector_len = vector_length_encoding(this);
 6588       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6589     } else {
 6590       int vlen = Matcher::vector_length(this);
 6591       if (vlen == 2) {
 6592         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6593         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6594       } else {
 6595         assert(vlen == 4, "sanity");
 6596         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6597         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6598       }
 6599     }
 6600   %}
 6601   ins_pipe( pipe_slow );
 6602 %}
 6603 
 6604 // Longs vector shift
 6605 instruct vshiftL(vec dst, vec src, vec shift) %{
 6606   predicate(!n->as_ShiftV()->is_var_shift());
 6607   match(Set dst ( LShiftVL src shift));
 6608   match(Set dst (URShiftVL src shift));
 6609   effect(TEMP dst, USE src, USE shift);
 6610   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6611   ins_encode %{
 6612     int opcode = this->ideal_Opcode();
 6613     if (UseAVX > 0) {
 6614       int vlen_enc = vector_length_encoding(this);
 6615       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6616     } else {
 6617       assert(Matcher::vector_length(this) == 2, "");
 6618       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6619       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6620     }
 6621   %}
 6622   ins_pipe( pipe_slow );
 6623 %}
 6624 
 6625 // Longs vector constant shift
 6626 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6627   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6628   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6629   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6630   ins_encode %{
 6631     int opcode = this->ideal_Opcode();
 6632     if (UseAVX > 0) {
 6633       int vector_len = vector_length_encoding(this);
 6634       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6635     } else {
 6636       assert(Matcher::vector_length(this) == 2, "");
 6637       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6638       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6639     }
 6640   %}
 6641   ins_pipe( pipe_slow );
 6642 %}
 6643 
 6644 // -------------------ArithmeticRightShift -----------------------------------
 6645 // Long vector arithmetic right shift
 6646 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6647   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6648   match(Set dst (RShiftVL src shift));
 6649   effect(TEMP dst, TEMP tmp);
 6650   format %{ "vshiftq $dst,$src,$shift" %}
 6651   ins_encode %{
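    // There is no arithmetic right shift for packed longs before AVX-512
    // (no psraq). Emulate x >> n as ((x >>> n) ^ m) - m, where
    // m = (0x8000000000000000 >>> n) re-extends the sign bit.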
 6652     uint vlen = Matcher::vector_length(this);
 6653     if (vlen == 2) {
 6654       assert(UseSSE >= 2, "required");
 6655       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6656       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6657       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6658       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6659       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6660       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6661     } else {
 6662       assert(vlen == 4, "sanity");
 6663       assert(UseAVX > 1, "required");
 6664       int vlen_enc = Assembler::AVX_256bit;
 6665       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6666       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6667       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6668       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6669       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6670     }
 6671   %}
 6672   ins_pipe( pipe_slow );
 6673 %}
 6674 
 6675 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6676   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6677   match(Set dst (RShiftVL src shift));
 6678   format %{ "vshiftq $dst,$src,$shift" %}
 6679   ins_encode %{
 6680     int vlen_enc = vector_length_encoding(this);
 6681     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6682   %}
 6683   ins_pipe( pipe_slow );
 6684 %}
 6685 
 6686 // ------------------- Variable Shift -----------------------------
 6687 // Byte variable shift
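// AVX2 provides variable (per-element) shifts only at dword/qword
// granularity, and AVX512BW adds word granularity; byte elements are
// therefore widened, shifted at the wider width, and packed back down.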
 6688 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6689   predicate(Matcher::vector_length(n) <= 8 &&
 6690             n->as_ShiftV()->is_var_shift() &&
 6691             !VM_Version::supports_avx512bw());
 6692   match(Set dst ( LShiftVB src shift));
 6693   match(Set dst ( RShiftVB src shift));
 6694   match(Set dst (URShiftVB src shift));
 6695   effect(TEMP dst, TEMP vtmp);
 6696   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6697   ins_encode %{
 6698     assert(UseAVX >= 2, "required");
 6699 
 6700     int opcode = this->ideal_Opcode();
 6701     int vlen_enc = Assembler::AVX_128bit;
 6702     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6703     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6704   %}
 6705   ins_pipe( pipe_slow );
 6706 %}
 6707 
 6708 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6709   predicate(Matcher::vector_length(n) == 16 &&
 6710             n->as_ShiftV()->is_var_shift() &&
 6711             !VM_Version::supports_avx512bw());
 6712   match(Set dst ( LShiftVB src shift));
 6713   match(Set dst ( RShiftVB src shift));
 6714   match(Set dst (URShiftVB src shift));
 6715   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6716   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6717   ins_encode %{
 6718     assert(UseAVX >= 2, "required");
 6719 
 6720     int opcode = this->ideal_Opcode();
 6721     int vlen_enc = Assembler::AVX_128bit;
 6722     // Shift lower half and get word result in dst
 6723     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6724 
 6725     // Shift upper half and get word result in vtmp1
 6726     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6727     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6728     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6729 
 6730     // Merge and down convert the two word results to byte in dst
 6731     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6732   %}
 6733   ins_pipe( pipe_slow );
 6734 %}
 6735 
 6736 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6737   predicate(Matcher::vector_length(n) == 32 &&
 6738             n->as_ShiftV()->is_var_shift() &&
 6739             !VM_Version::supports_avx512bw());
 6740   match(Set dst ( LShiftVB src shift));
 6741   match(Set dst ( RShiftVB src shift));
 6742   match(Set dst (URShiftVB src shift));
 6743   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6745   ins_encode %{
 6746     assert(UseAVX >= 2, "required");
 6747 
 6748     int opcode = this->ideal_Opcode();
 6749     int vlen_enc = Assembler::AVX_128bit;
 6750     // Process lower 128 bits and get result in dst
 6751     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6752     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6753     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6754     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6755     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6756 
 6757     // Process higher 128 bits and get result in vtmp3
 6758     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6759     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6760     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6761     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6762     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6763     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6764     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6765 
 6766     // Merge the two results in dst
 6767     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6768   %}
 6769   ins_pipe( pipe_slow );
 6770 %}
 6771 
 6772 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6773   predicate(Matcher::vector_length(n) <= 32 &&
 6774             n->as_ShiftV()->is_var_shift() &&
 6775             VM_Version::supports_avx512bw());
 6776   match(Set dst ( LShiftVB src shift));
 6777   match(Set dst ( RShiftVB src shift));
 6778   match(Set dst (URShiftVB src shift));
 6779   effect(TEMP dst, TEMP vtmp);
 6780   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6781   ins_encode %{
 6782     assert(UseAVX > 2, "required");
 6783 
 6784     int opcode = this->ideal_Opcode();
 6785     int vlen_enc = vector_length_encoding(this);
 6786     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6787   %}
 6788   ins_pipe( pipe_slow );
 6789 %}
 6790 
 6791 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6792   predicate(Matcher::vector_length(n) == 64 &&
 6793             n->as_ShiftV()->is_var_shift() &&
 6794             VM_Version::supports_avx512bw());
 6795   match(Set dst ( LShiftVB src shift));
 6796   match(Set dst ( RShiftVB src shift));
 6797   match(Set dst (URShiftVB src shift));
 6798   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6799   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6800   ins_encode %{
 6801     assert(UseAVX > 2, "required");
 6802 
 6803     int opcode = this->ideal_Opcode();
 6804     int vlen_enc = Assembler::AVX_256bit;
 6805     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6806     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6807     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6808     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6809     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6810   %}
 6811   ins_pipe( pipe_slow );
 6812 %}
 6813 
 6814 // Short variable shift
 6815 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6816   predicate(Matcher::vector_length(n) <= 8 &&
 6817             n->as_ShiftV()->is_var_shift() &&
 6818             !VM_Version::supports_avx512bw());
 6819   match(Set dst ( LShiftVS src shift));
 6820   match(Set dst ( RShiftVS src shift));
 6821   match(Set dst (URShiftVS src shift));
 6822   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6824   ins_encode %{
 6825     assert(UseAVX >= 2, "required");
 6826 
 6827     int opcode = this->ideal_Opcode();
 6828     bool sign = (opcode != Op_URShiftVS);
 6829     int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6832     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6833     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6834     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6835     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6836   %}
 6837   ins_pipe( pipe_slow );
 6838 %}
 6839 
 6840 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6841   predicate(Matcher::vector_length(n) == 16 &&
 6842             n->as_ShiftV()->is_var_shift() &&
 6843             !VM_Version::supports_avx512bw());
 6844   match(Set dst ( LShiftVS src shift));
 6845   match(Set dst ( RShiftVS src shift));
 6846   match(Set dst (URShiftVS src shift));
 6847   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6849   ins_encode %{
 6850     assert(UseAVX >= 2, "required");
 6851 
 6852     int opcode = this->ideal_Opcode();
 6853     bool sign = (opcode != Op_URShiftVS);
 6854     int vlen_enc = Assembler::AVX_256bit;
 6855     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6856     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6857     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6858     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6859     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6860 
 6861     // Shift upper half, with result in dst using vtmp1 as TEMP
 6862     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6863     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6864     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6865     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6866     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6867     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6868 
 6869     // Merge lower and upper half result into dst
 6870     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6871     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6872   %}
 6873   ins_pipe( pipe_slow );
 6874 %}
 6875 
 6876 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6877   predicate(n->as_ShiftV()->is_var_shift() &&
 6878             VM_Version::supports_avx512bw());
 6879   match(Set dst ( LShiftVS src shift));
 6880   match(Set dst ( RShiftVS src shift));
 6881   match(Set dst (URShiftVS src shift));
 6882   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6883   ins_encode %{
 6884     assert(UseAVX > 2, "required");
 6885 
 6886     int opcode = this->ideal_Opcode();
 6887     int vlen_enc = vector_length_encoding(this);
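    // The 128/256-bit encodings of the word variable-shift require AVX512VL;
    // without it, fall back to the 512-bit encoding.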
 6888     if (!VM_Version::supports_avx512vl()) {
 6889       vlen_enc = Assembler::AVX_512bit;
 6890     }
 6891     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6892   %}
 6893   ins_pipe( pipe_slow );
 6894 %}
 6895 
// Integer variable shift
 6897 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6898   predicate(n->as_ShiftV()->is_var_shift());
 6899   match(Set dst ( LShiftVI src shift));
 6900   match(Set dst ( RShiftVI src shift));
 6901   match(Set dst (URShiftVI src shift));
 6902   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6903   ins_encode %{
 6904     assert(UseAVX >= 2, "required");
 6905 
 6906     int opcode = this->ideal_Opcode();
 6907     int vlen_enc = vector_length_encoding(this);
 6908     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6909   %}
 6910   ins_pipe( pipe_slow );
 6911 %}
 6912 
// Long variable shift
 6914 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6915   predicate(n->as_ShiftV()->is_var_shift());
 6916   match(Set dst ( LShiftVL src shift));
 6917   match(Set dst (URShiftVL src shift));
 6918   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6919   ins_encode %{
 6920     assert(UseAVX >= 2, "required");
 6921 
 6922     int opcode = this->ideal_Opcode();
 6923     int vlen_enc = vector_length_encoding(this);
 6924     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6925   %}
 6926   ins_pipe( pipe_slow );
 6927 %}
 6928 
// Long variable arithmetic right shift
 6930 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6931   predicate(Matcher::vector_length(n) <= 4 &&
 6932             n->as_ShiftV()->is_var_shift() &&
 6933             UseAVX == 2);
 6934   match(Set dst (RShiftVL src shift));
 6935   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6937   ins_encode %{
 6938     int opcode = this->ideal_Opcode();
 6939     int vlen_enc = vector_length_encoding(this);
 6940     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6941                  $vtmp$$XMMRegister);
 6942   %}
 6943   ins_pipe( pipe_slow );
 6944 %}
 6945 
 6946 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6947   predicate(n->as_ShiftV()->is_var_shift() &&
 6948             UseAVX > 2);
 6949   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6951   ins_encode %{
 6952     int opcode = this->ideal_Opcode();
 6953     int vlen_enc = vector_length_encoding(this);
 6954     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6955   %}
 6956   ins_pipe( pipe_slow );
 6957 %}
 6958 
 6959 // --------------------------------- AND --------------------------------------
 6960 
 6961 instruct vand(vec dst, vec src) %{
 6962   predicate(UseAVX == 0);
 6963   match(Set dst (AndV dst src));
 6964   format %{ "pand    $dst,$src\t! and vectors" %}
 6965   ins_encode %{
 6966     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6967   %}
 6968   ins_pipe( pipe_slow );
 6969 %}
 6970 
 6971 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6972   predicate(UseAVX > 0);
 6973   match(Set dst (AndV src1 src2));
 6974   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6975   ins_encode %{
 6976     int vlen_enc = vector_length_encoding(this);
 6977     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6978   %}
 6979   ins_pipe( pipe_slow );
 6980 %}
 6981 
 6982 instruct vand_mem(vec dst, vec src, memory mem) %{
 6983   predicate((UseAVX > 0) &&
 6984             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6985   match(Set dst (AndV src (LoadVector mem)));
 6986   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6987   ins_encode %{
 6988     int vlen_enc = vector_length_encoding(this);
 6989     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6990   %}
 6991   ins_pipe( pipe_slow );
 6992 %}
 6993 
 6994 // --------------------------------- OR ---------------------------------------
 6995 
 6996 instruct vor(vec dst, vec src) %{
 6997   predicate(UseAVX == 0);
 6998   match(Set dst (OrV dst src));
 6999   format %{ "por     $dst,$src\t! or vectors" %}
 7000   ins_encode %{
 7001     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7002   %}
 7003   ins_pipe( pipe_slow );
 7004 %}
 7005 
 7006 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7007   predicate(UseAVX > 0);
 7008   match(Set dst (OrV src1 src2));
 7009   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7010   ins_encode %{
 7011     int vlen_enc = vector_length_encoding(this);
 7012     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7013   %}
 7014   ins_pipe( pipe_slow );
 7015 %}
 7016 
 7017 instruct vor_mem(vec dst, vec src, memory mem) %{
 7018   predicate((UseAVX > 0) &&
 7019             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7020   match(Set dst (OrV src (LoadVector mem)));
 7021   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7022   ins_encode %{
 7023     int vlen_enc = vector_length_encoding(this);
 7024     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7025   %}
 7026   ins_pipe( pipe_slow );
 7027 %}
 7028 
 7029 // --------------------------------- XOR --------------------------------------
 7030 
 7031 instruct vxor(vec dst, vec src) %{
 7032   predicate(UseAVX == 0);
 7033   match(Set dst (XorV dst src));
 7034   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7035   ins_encode %{
 7036     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7037   %}
 7038   ins_pipe( pipe_slow );
 7039 %}
 7040 
 7041 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7042   predicate(UseAVX > 0);
 7043   match(Set dst (XorV src1 src2));
 7044   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7045   ins_encode %{
 7046     int vlen_enc = vector_length_encoding(this);
 7047     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7048   %}
 7049   ins_pipe( pipe_slow );
 7050 %}
 7051 
 7052 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7053   predicate((UseAVX > 0) &&
 7054             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7055   match(Set dst (XorV src (LoadVector mem)));
 7056   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7057   ins_encode %{
 7058     int vlen_enc = vector_length_encoding(this);
 7059     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7060   %}
 7061   ins_pipe( pipe_slow );
 7062 %}
 7063 
 7064 // --------------------------------- VectorCast --------------------------------------
 7065 
 7066 instruct vcastBtoX(vec dst, vec src) %{
 7067   match(Set dst (VectorCastB2X src));
 7068   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7069   ins_encode %{
 7070     assert(UseAVX > 0, "required");
 7071 
 7072     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7073     int vlen_enc = vector_length_encoding(this);
 7074     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7075   %}
 7076   ins_pipe( pipe_slow );
 7077 %}
 7078 
 7079 instruct castStoX(vec dst, vec src) %{
 7080   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7081             Matcher::vector_length(n->in(1)) <= 8 && // src
 7082             Matcher::vector_element_basic_type(n) == T_BYTE);
 7083   match(Set dst (VectorCastS2X src));
 7084   format %{ "vector_cast_s2x $dst,$src" %}
 7085   ins_encode %{
 7086     assert(UseAVX > 0, "required");
 7087 
 7088     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7089     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7090   %}
 7091   ins_pipe( pipe_slow );
 7092 %}
 7093 
 7094 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7095   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7096             Matcher::vector_length(n->in(1)) == 16 && // src
 7097             Matcher::vector_element_basic_type(n) == T_BYTE);
 7098   effect(TEMP dst, TEMP vtmp);
 7099   match(Set dst (VectorCastS2X src));
 7100   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7101   ins_encode %{
 7102     assert(UseAVX > 0, "required");
 7103 
 7104     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7105     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7106     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7107     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7108   %}
 7109   ins_pipe( pipe_slow );
 7110 %}
 7111 
 7112 instruct vcastStoX_evex(vec dst, vec src) %{
 7113   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7114             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7115   match(Set dst (VectorCastS2X src));
 7116   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7117   ins_encode %{
 7118     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7119     int src_vlen_enc = vector_length_encoding(this, $src);
 7120     int vlen_enc = vector_length_encoding(this);
 7121     switch (to_elem_bt) {
 7122       case T_BYTE:
 7123         if (!VM_Version::supports_avx512vl()) {
 7124           vlen_enc = Assembler::AVX_512bit;
 7125         }
 7126         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7127         break;
 7128       case T_INT:
 7129         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7130         break;
 7131       case T_FLOAT:
 7132         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7133         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7134         break;
 7135       case T_LONG:
 7136         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7137         break;
 7138       case T_DOUBLE: {
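        // Sign-extend the shorts to dwords at half the final vector width,
        // then widen those dwords to doubles.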
 7139         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7140         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7141         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7142         break;
 7143       }
 7144       default:
 7145         ShouldNotReachHere();
 7146     }
 7147   %}
 7148   ins_pipe( pipe_slow );
 7149 %}
 7150 
 7151 instruct castItoX(vec dst, vec src) %{
 7152   predicate(UseAVX <= 2 &&
 7153             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7154             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7155   match(Set dst (VectorCastI2X src));
 7156   format %{ "vector_cast_i2x $dst,$src" %}
 7157   ins_encode %{
 7158     assert(UseAVX > 0, "required");
 7159 
 7160     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7161     int vlen_enc = vector_length_encoding(this, $src);
 7162 
 7163     if (to_elem_bt == T_BYTE) {
 7164       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7165       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7166       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7167     } else {
 7168       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7169       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7170       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7171     }
 7172   %}
 7173   ins_pipe( pipe_slow );
 7174 %}
 7175 
 7176 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7177   predicate(UseAVX <= 2 &&
 7178             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7179             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7180   match(Set dst (VectorCastI2X src));
 7181   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7182   effect(TEMP dst, TEMP vtmp);
 7183   ins_encode %{
 7184     assert(UseAVX > 0, "required");
 7185 
 7186     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7187     int vlen_enc = vector_length_encoding(this, $src);
 7188 
 7189     if (to_elem_bt == T_BYTE) {
 7190       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7191       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7192       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7193       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7194     } else {
 7195       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7196       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7197       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7198       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7199     }
 7200   %}
 7201   ins_pipe( pipe_slow );
 7202 %}
 7203 
 7204 instruct vcastItoX_evex(vec dst, vec src) %{
 7205   predicate(UseAVX > 2 ||
 7206             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7207   match(Set dst (VectorCastI2X src));
 7208   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7209   ins_encode %{
 7210     assert(UseAVX > 0, "required");
 7211 
 7212     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7213     int src_vlen_enc = vector_length_encoding(this, $src);
 7214     int dst_vlen_enc = vector_length_encoding(this);
 7215     switch (dst_elem_bt) {
 7216       case T_BYTE:
 7217         if (!VM_Version::supports_avx512vl()) {
 7218           src_vlen_enc = Assembler::AVX_512bit;
 7219         }
 7220         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7221         break;
 7222       case T_SHORT:
 7223         if (!VM_Version::supports_avx512vl()) {
 7224           src_vlen_enc = Assembler::AVX_512bit;
 7225         }
 7226         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7227         break;
 7228       case T_FLOAT:
 7229         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7230         break;
 7231       case T_LONG:
 7232         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7233         break;
 7234       case T_DOUBLE:
 7235         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7236         break;
 7237       default:
 7238         ShouldNotReachHere();
 7239     }
 7240   %}
 7241   ins_pipe( pipe_slow );
 7242 %}
 7243 
 7244 instruct vcastLtoBS(vec dst, vec src) %{
 7245   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7246             UseAVX <= 2);
 7247   match(Set dst (VectorCastL2X src));
 7248   format %{ "vector_cast_l2x  $dst,$src" %}
 7249   ins_encode %{
 7250     assert(UseAVX > 0, "required");
 7251 
 7252     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7253     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7254     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7255                                                       : ExternalAddress(vector_int_to_short_mask());
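    // Pick the low 32 bits of each long element, then mask and pack down to
    // the target element size.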
 7256     if (vlen <= 16) {
 7257       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7258       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7259       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7260     } else {
 7261       assert(vlen <= 32, "required");
 7262       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7263       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7264       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7265       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7266     }
 7267     if (to_elem_bt == T_BYTE) {
 7268       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7269     }
 7270   %}
 7271   ins_pipe( pipe_slow );
 7272 %}
 7273 
 7274 instruct vcastLtoX_evex(vec dst, vec src) %{
 7275   predicate(UseAVX > 2 ||
 7276             (Matcher::vector_element_basic_type(n) == T_INT ||
 7277              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7278              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7279   match(Set dst (VectorCastL2X src));
 7280   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7281   ins_encode %{
 7282     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7283     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7284     int vlen_enc = vector_length_encoding(this, $src);
 7285     switch (to_elem_bt) {
 7286       case T_BYTE:
 7287         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7288           vlen_enc = Assembler::AVX_512bit;
 7289         }
 7290         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7291         break;
 7292       case T_SHORT:
 7293         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7294           vlen_enc = Assembler::AVX_512bit;
 7295         }
 7296         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7297         break;
 7298       case T_INT:
 7299         if (vlen == 8) {
 7300           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7301             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7302           }
 7303         } else if (vlen == 16) {
 7304           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7305         } else if (vlen == 32) {
 7306           if (UseAVX > 2) {
 7307             if (!VM_Version::supports_avx512vl()) {
 7308               vlen_enc = Assembler::AVX_512bit;
 7309             }
 7310             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7311           } else {
 7312             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7313             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7314           }
 7315         } else { // vlen == 64
 7316           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7317         }
 7318         break;
 7319       case T_FLOAT:
 7320         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7321         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7322         break;
 7323       case T_DOUBLE:
 7324         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7325         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7326         break;
 7327 
 7328       default: assert(false, "%s", type2name(to_elem_bt));
 7329     }
 7330   %}
 7331   ins_pipe( pipe_slow );
 7332 %}
 7333 
 7334 instruct vcastFtoD_reg(vec dst, vec src) %{
 7335   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7336   match(Set dst (VectorCastF2X src));
 7337   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7338   ins_encode %{
 7339     int vlen_enc = vector_length_encoding(this);
 7340     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7341   %}
 7342   ins_pipe( pipe_slow );
 7343 %}
 7344 
 7346 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7347   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7348             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7349   match(Set dst (VectorCastF2X src));
 7350   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7351   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7352   ins_encode %{
 7353     int vlen_enc = vector_length_encoding(this, $src);
 7354     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register to load
    // addresses wider than 32 bits for register-indirect addressing, since
    // stub constants are part of the code cache and ReservedCodeCacheSize is
    // currently capped at 2G. Targets are free to raise that limit, but a
    // code cache larger than 2G looks unreasonable in practice. On the upside,
    // with the given cap we save a temporary register allocation, which in the
    // limiting case can prevent spilling in blocks with high register pressure.
 7362     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7363                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7364                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7365   %}
 7366   ins_pipe( pipe_slow );
 7367 %}
 7368 
 7369 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7370   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7371             is_integral_type(Matcher::vector_element_basic_type(n)));
 7372   match(Set dst (VectorCastF2X src));
 7373   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7374   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7375   ins_encode %{
 7376     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7377     if (to_elem_bt == T_LONG) {
 7378       int vlen_enc = vector_length_encoding(this);
 7379       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7380                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7381                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7382     } else {
 7383       int vlen_enc = vector_length_encoding(this, $src);
 7384       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7385                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7386                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7387     }
 7388   %}
 7389   ins_pipe( pipe_slow );
 7390 %}
 7391 
 7392 instruct vcastDtoF_reg(vec dst, vec src) %{
 7393   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7394   match(Set dst (VectorCastD2X src));
 7395   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7396   ins_encode %{
 7397     int vlen_enc = vector_length_encoding(this, $src);
 7398     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7399   %}
 7400   ins_pipe( pipe_slow );
 7401 %}
 7402 
 7403 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7404   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7405             is_integral_type(Matcher::vector_element_basic_type(n)));
 7406   match(Set dst (VectorCastD2X src));
 7407   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7408   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7409   ins_encode %{
 7410     int vlen_enc = vector_length_encoding(this, $src);
 7411     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7412     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7413                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7414                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7415   %}
 7416   ins_pipe( pipe_slow );
 7417 %}
 7418 
 7419 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7420   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7421             is_integral_type(Matcher::vector_element_basic_type(n)));
 7422   match(Set dst (VectorCastD2X src));
 7423   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7424   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7425   ins_encode %{
 7426     int vlen_enc = vector_length_encoding(this, $src);
 7427     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7428     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7429                               ExternalAddress(vector_float_signflip());
 7430     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7431                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7432   %}
 7433   ins_pipe( pipe_slow );
 7434 %}
 7435 
 7436 instruct vucast(vec dst, vec src) %{
 7437   match(Set dst (VectorUCastB2X src));
 7438   match(Set dst (VectorUCastS2X src));
 7439   match(Set dst (VectorUCastI2X src));
 7440   format %{ "vector_ucast $dst,$src\t!" %}
 7441   ins_encode %{
 7442     assert(UseAVX > 0, "required");
 7443 
 7444     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7445     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7446     int vlen_enc = vector_length_encoding(this);
 7447     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7448   %}
 7449   ins_pipe( pipe_slow );
 7450 %}
 7451 
 7452 #ifdef _LP64
 7453 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7454   predicate(!VM_Version::supports_avx512vl() &&
 7455             Matcher::vector_length_in_bytes(n) < 64 &&
 7456             Matcher::vector_element_basic_type(n) == T_INT);
 7457   match(Set dst (RoundVF src));
 7458   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7459   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7460   ins_encode %{
 7461     int vlen_enc = vector_length_encoding(this);
 7462     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
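    // 0x3F80 is the default MXCSR value (0x1F80, all exceptions masked) with
    // RC = 01 (round toward negative infinity), matching Math.round's
    // floor(x + 0.5) semantics.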
 7463     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7464                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7465                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7466   %}
 7467   ins_pipe( pipe_slow );
 7468 %}
 7469 
 7470 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7471   predicate((VM_Version::supports_avx512vl() ||
 7472              Matcher::vector_length_in_bytes(n) == 64) &&
 7473              Matcher::vector_element_basic_type(n) == T_INT);
 7474   match(Set dst (RoundVF src));
 7475   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7476   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7477   ins_encode %{
 7478     int vlen_enc = vector_length_encoding(this);
 7479     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7480     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7481                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7482                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7483   %}
 7484   ins_pipe( pipe_slow );
 7485 %}
 7486 
 7487 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7488   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7489   match(Set dst (RoundVD src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7491   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7492   ins_encode %{
 7493     int vlen_enc = vector_length_encoding(this);
 7494     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7495     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7496                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7497                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7498   %}
 7499   ins_pipe( pipe_slow );
 7500 %}
 7501 
 7502 #endif // _LP64
 7503 
 7504 // --------------------------------- VectorMaskCmp --------------------------------------
 7505 
 7506 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7507   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7508             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7509             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7510             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7511   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7512   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7513   ins_encode %{
 7514     int vlen_enc = vector_length_encoding(this, $src1);
 7515     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7516     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7517       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7518     } else {
 7519       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7520     }
 7521   %}
 7522   ins_pipe( pipe_slow );
 7523 %}
 7524 
 7525 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7526   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7527             n->bottom_type()->isa_vectmask() == nullptr &&
 7528             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7529   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7530   effect(TEMP ktmp);
 7531   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7532   ins_encode %{
 7533     int vlen_enc = Assembler::AVX_512bit;
 7534     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7535     KRegister mask = k0; // The comparison itself is not being masked.
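    // The masked moves below expand the k-register result into a vector of
    // all-ones / all-zero lanes (merge == false selects zero-masking).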
 7536     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7537       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7538       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7539     } else {
 7540       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7541       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7542     }
 7543   %}
 7544   ins_pipe( pipe_slow );
 7545 %}
 7546 
 7547 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7548   predicate(n->bottom_type()->isa_vectmask() &&
 7549             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7550   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7551   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7552   ins_encode %{
 7553     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7554     int vlen_enc = vector_length_encoding(this, $src1);
 7555     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7556     KRegister mask = k0; // The comparison itself is not being masked.
 7557     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7558       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7559     } else {
 7560       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7561     }
 7562   %}
 7563   ins_pipe( pipe_slow );
 7564 %}
 7565 
 7566 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7567   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7568             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7569             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7570             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7571             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7572             (n->in(2)->get_int() == BoolTest::eq ||
 7573              n->in(2)->get_int() == BoolTest::lt ||
 7574              n->in(2)->get_int() == BoolTest::gt)); // cond
 7575   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7576   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7577   ins_encode %{
 7578     int vlen_enc = vector_length_encoding(this, $src1);
 7579     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7580     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7581     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7582   %}
 7583   ins_pipe( pipe_slow );
 7584 %}
 7585 
 7586 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7587   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7588             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7589             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7590             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7591             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7592             (n->in(2)->get_int() == BoolTest::ne ||
 7593              n->in(2)->get_int() == BoolTest::le ||
 7594              n->in(2)->get_int() == BoolTest::ge)); // cond
 7595   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7596   effect(TEMP dst, TEMP xtmp);
 7597   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7598   ins_encode %{
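    // Only eq/lt/gt have a direct packed-compare encoding; ne/le/ge are
    // handled by emitting the complementary compare and inverting the result,
    // with $xtmp as scratch (see MacroAssembler::vpcmpCCW).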
 7599     int vlen_enc = vector_length_encoding(this, $src1);
 7600     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7601     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7602     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7603   %}
 7604   ins_pipe( pipe_slow );
 7605 %}
 7606 
 7607 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7608   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7609             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7610             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7611             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7612             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7613   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7614   effect(TEMP dst, TEMP xtmp);
 7615   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7616   ins_encode %{
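    // There is no unsigned packed compare below AVX-512, so flip the sign bit
    // of both operands and compare the biased values signed:
    // x <u y  iff  (x ^ MIN_VALUE) <s (y ^ MIN_VALUE).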
 7617     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7618     int vlen_enc = vector_length_encoding(this, $src1);
 7619     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7620     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7621 
 7622     if (vlen_enc == Assembler::AVX_128bit) {
 7623       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7624     } else {
 7625       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7626     }
 7627     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7628     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7629     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7630   %}
 7631   ins_pipe( pipe_slow );
 7632 %}
 7633 
 7634 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7635   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 7636              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7637              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7638   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7639   effect(TEMP ktmp);
 7640   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7641   ins_encode %{
 7642     assert(UseAVX > 2, "required");
 7643 
 7644     int vlen_enc = vector_length_encoding(this, $src1);
 7645     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7646     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7647     KRegister mask = k0; // The comparison itself is not being masked.
 7648     bool merge = false;
 7649     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7650 
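    // The EVEX compare writes its predicate into $ktmp; expand it back into a
    // vector mask with a zero-masked load of all-ones (selected lanes become
    // -1, all others 0).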
 7651     switch (src1_elem_bt) {
 7652       case T_INT: {
 7653         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7654         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7655         break;
 7656       }
 7657       case T_LONG: {
 7658         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7659         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7660         break;
 7661       }
 7662       default: assert(false, "%s", type2name(src1_elem_bt));
 7663     }
 7664   %}
 7665   ins_pipe( pipe_slow );
 7666 %}
 7667 
 7669 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7670   predicate(n->bottom_type()->isa_vectmask() &&
 7671             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7672   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7673   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7674   ins_encode %{
 7675     assert(UseAVX > 2, "required");
 7676     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7677 
 7678     int vlen_enc = vector_length_encoding(this, $src1);
 7679     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7680     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7681     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7682 
    // The EVEX compare writes its result directly into the destination mask register.
 7684     switch (src1_elem_bt) {
 7685       case T_BYTE: {
 7686         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7687         break;
 7688       }
 7689       case T_SHORT: {
 7690         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7691         break;
 7692       }
 7693       case T_INT: {
 7694         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7695         break;
 7696       }
 7697       case T_LONG: {
 7698         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7699         break;
 7700       }
 7701       default: assert(false, "%s", type2name(src1_elem_bt));
 7702     }
 7703   %}
 7704   ins_pipe( pipe_slow );
 7705 %}
 7706 
 7707 // Extract
 7708 
 7709 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7710   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7711   match(Set dst (ExtractI src idx));
 7712   match(Set dst (ExtractS src idx));
 7713 #ifdef _LP64
 7714   match(Set dst (ExtractB src idx));
 7715 #endif
 7716   format %{ "extractI $dst,$src,$idx\t!" %}
 7717   ins_encode %{
 7718     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7719 
 7720     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7721     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7722   %}
 7723   ins_pipe( pipe_slow );
 7724 %}
 7725 
 7726 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7727   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7728             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7729   match(Set dst (ExtractI src idx));
 7730   match(Set dst (ExtractS src idx));
 7731 #ifdef _LP64
 7732   match(Set dst (ExtractB src idx));
 7733 #endif
 7734   effect(TEMP vtmp);
 7735   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7736   ins_encode %{
 7737     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7738 
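    // First isolate the 128-bit lane holding the element in $vtmp, then
    // extract the element from within that lane.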
 7739     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7740     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7741     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7742   %}
 7743   ins_pipe( pipe_slow );
 7744 %}
 7745 
 7746 #ifdef _LP64
 7747 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7748   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7749   match(Set dst (ExtractL src idx));
 7750   format %{ "extractL $dst,$src,$idx\t!" %}
 7751   ins_encode %{
 7752     assert(UseSSE >= 4, "required");
 7753     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7754 
 7755     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7756   %}
 7757   ins_pipe( pipe_slow );
 7758 %}
 7759 
 7760 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7761   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7762             Matcher::vector_length(n->in(1)) == 8);  // src
 7763   match(Set dst (ExtractL src idx));
 7764   effect(TEMP vtmp);
 7765   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7766   ins_encode %{
 7767     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7768 
 7769     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7770     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7771   %}
 7772   ins_pipe( pipe_slow );
 7773 %}
 7774 #endif
 7775 
 7776 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7777   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7778   match(Set dst (ExtractF src idx));
 7779   effect(TEMP dst, TEMP vtmp);
 7780   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7781   ins_encode %{
 7782     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7783 
 7784     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7785   %}
 7786   ins_pipe( pipe_slow );
 7787 %}
 7788 
 7789 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7790   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7791             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7792   match(Set dst (ExtractF src idx));
 7793   effect(TEMP vtmp);
 7794   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7795   ins_encode %{
 7796     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7797 
 7798     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7799     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7800   %}
 7801   ins_pipe( pipe_slow );
 7802 %}
 7803 
 7804 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7805   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7806   match(Set dst (ExtractD src idx));
 7807   format %{ "extractD $dst,$src,$idx\t!" %}
 7808   ins_encode %{
 7809     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7810 
 7811     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7812   %}
 7813   ins_pipe( pipe_slow );
 7814 %}
 7815 
 7816 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7817   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7818             Matcher::vector_length(n->in(1)) == 8);  // src
 7819   match(Set dst (ExtractD src idx));
 7820   effect(TEMP vtmp);
 7821   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7822   ins_encode %{
 7823     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7824 
 7825     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7826     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7827   %}
 7828   ins_pipe( pipe_slow );
 7829 %}
 7830 
 7831 // --------------------------------- Vector Blend --------------------------------------
 7832 
 7833 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7834   predicate(UseAVX == 0);
 7835   match(Set dst (VectorBlend (Binary dst src) mask));
 7836   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7837   effect(TEMP tmp);
 7838   ins_encode %{
 7839     assert(UseSSE >= 4, "required");
 7840 
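    // SSE4.1 pblendvb takes its mask implicitly in xmm0; the rxmm0 operand
    // pins $tmp there so the mask can be copied into place first.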
 7841     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7842       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7843     }
 7844     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7845   %}
 7846   ins_pipe( pipe_slow );
 7847 %}
 7848 
 7849 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7850   predicate(UseAVX > 0 &&
 7851             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7852             Matcher::vector_length_in_bytes(n) <= 32 &&
 7853             is_integral_type(Matcher::vector_element_basic_type(n)));
 7854   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7855   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7856   ins_encode %{
 7857     int vlen_enc = vector_length_encoding(this);
 7858     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7859   %}
 7860   ins_pipe( pipe_slow );
 7861 %}
 7862 
 7863 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7864   predicate(UseAVX > 0 &&
 7865             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 7866             Matcher::vector_length_in_bytes(n) <= 32 &&
 7867             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7868   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7869   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7870   ins_encode %{
 7871     int vlen_enc = vector_length_encoding(this);
 7872     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7873   %}
 7874   ins_pipe( pipe_slow );
 7875 %}
 7876 
 7877 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7878   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7879             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 7880   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7881   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7882   effect(TEMP ktmp);
 7883   ins_encode %{
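    // Turn the byte-wise vector mask into a k-register predicate by comparing
    // it against all-ones, then blend $src1/$src2 under that predicate.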
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7886     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7887     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7888   %}
 7889   ins_pipe( pipe_slow );
 7890 %}
 7891 
 7893 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7894   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7895             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7896              VM_Version::supports_avx512bw()));
 7897   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7898   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7899   ins_encode %{
 7900     int vlen_enc = vector_length_encoding(this);
 7901     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7902     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7903   %}
 7904   ins_pipe( pipe_slow );
 7905 %}
 7906 
 7907 // --------------------------------- ABS --------------------------------------
 7908 // a = |a|
 7909 instruct vabsB_reg(vec dst, vec src) %{
 7910   match(Set dst (AbsVB  src));
 7911   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7912   ins_encode %{
 7913     uint vlen = Matcher::vector_length(this);
 7914     if (vlen <= 16) {
 7915       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7916     } else {
 7917       int vlen_enc = vector_length_encoding(this);
 7918       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7919     }
 7920   %}
 7921   ins_pipe( pipe_slow );
 7922 %}
 7923 
 7924 instruct vabsS_reg(vec dst, vec src) %{
 7925   match(Set dst (AbsVS  src));
 7926   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7927   ins_encode %{
 7928     uint vlen = Matcher::vector_length(this);
 7929     if (vlen <= 8) {
 7930       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7931     } else {
 7932       int vlen_enc = vector_length_encoding(this);
 7933       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7934     }
 7935   %}
 7936   ins_pipe( pipe_slow );
 7937 %}
 7938 
 7939 instruct vabsI_reg(vec dst, vec src) %{
 7940   match(Set dst (AbsVI  src));
 7941   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7942   ins_encode %{
 7943     uint vlen = Matcher::vector_length(this);
 7944     if (vlen <= 4) {
 7945       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7946     } else {
 7947       int vlen_enc = vector_length_encoding(this);
 7948       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7949     }
 7950   %}
 7951   ins_pipe( pipe_slow );
 7952 %}
 7953 
 7954 instruct vabsL_reg(vec dst, vec src) %{
 7955   match(Set dst (AbsVL  src));
 7956   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7957   ins_encode %{
 7958     assert(UseAVX > 2, "required");
 7959     int vlen_enc = vector_length_encoding(this);
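    // evpabsq needs AVX512VL for its 128/256-bit encodings; fall back to the
    // 512-bit encoding when VL is unavailable.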
 7960     if (!VM_Version::supports_avx512vl()) {
 7961       vlen_enc = Assembler::AVX_512bit;
 7962     }
 7963     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7964   %}
 7965   ins_pipe( pipe_slow );
 7966 %}
 7967 
 7968 // --------------------------------- ABSNEG --------------------------------------
 7969 
 7970 instruct vabsnegF(vec dst, vec src) %{
 7971   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7972   match(Set dst (AbsVF src));
 7973   match(Set dst (NegVF src));
 7974   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7975   ins_cost(150);
 7976   ins_encode %{
 7977     int opcode = this->ideal_Opcode();
 7978     int vlen = Matcher::vector_length(this);
 7979     if (vlen == 2) {
 7980       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7981     } else {
 7982       assert(vlen == 8 || vlen == 16, "required");
 7983       int vlen_enc = vector_length_encoding(this);
 7984       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7985     }
 7986   %}
 7987   ins_pipe( pipe_slow );
 7988 %}
 7989 
 7990 instruct vabsneg4F(vec dst) %{
 7991   predicate(Matcher::vector_length(n) == 4);
 7992   match(Set dst (AbsVF dst));
 7993   match(Set dst (NegVF dst));
 7994   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7995   ins_cost(150);
 7996   ins_encode %{
 7997     int opcode = this->ideal_Opcode();
 7998     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 7999   %}
 8000   ins_pipe( pipe_slow );
 8001 %}
 8002 
 8003 instruct vabsnegD(vec dst, vec src) %{
 8004   match(Set dst (AbsVD  src));
 8005   match(Set dst (NegVD  src));
 8006   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8007   ins_encode %{
 8008     int opcode = this->ideal_Opcode();
 8009     uint vlen = Matcher::vector_length(this);
 8010     if (vlen == 2) {
 8011       assert(UseSSE >= 2, "required");
 8012       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8013     } else {
 8014       int vlen_enc = vector_length_encoding(this);
 8015       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8016     }
 8017   %}
 8018   ins_pipe( pipe_slow );
 8019 %}
 8020 
 8021 //------------------------------------- VectorTest --------------------------------------------
 8022 
 8023 #ifdef _LP64
 8024 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8025   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8026   match(Set cr (VectorTest src1 src2));
 8027   effect(TEMP vtmp);
 8028   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8029   ins_encode %{
 8030     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8031     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8032     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8033   %}
 8034   ins_pipe( pipe_slow );
 8035 %}
 8036 
 8037 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8038   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8039   match(Set cr (VectorTest src1 src2));
 8040   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8041   ins_encode %{
 8042     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8043     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8044     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8045   %}
 8046   ins_pipe( pipe_slow );
 8047 %}
 8048 
 8049 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8050   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8051              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8052             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8053   match(Set cr (VectorTest src1 src2));
 8054   effect(TEMP tmp);
 8055   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8056   ins_encode %{
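    // All-true: keep only the low masklen bits of the mask and compare them
    // against the all-ones pattern of the same width.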
 8057     uint masklen = Matcher::vector_length(this, $src1);
 8058     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8059     __ andl($tmp$$Register, (1 << masklen) - 1);
 8060     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8061   %}
 8062   ins_pipe( pipe_slow );
 8063 %}
 8064 
 8065 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8066   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8067              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8068             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8069   match(Set cr (VectorTest src1 src2));
 8070   effect(TEMP tmp);
 8071   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8072   ins_encode %{
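    // Any-true: the AND sets ZF exactly when none of the low masklen bits is set.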
 8073     uint masklen = Matcher::vector_length(this, $src1);
 8074     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8075     __ andl($tmp$$Register, (1 << masklen) - 1);
 8076   %}
 8077   ins_pipe( pipe_slow );
 8078 %}
 8079 
 8080 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8081   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8082             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8083   match(Set cr (VectorTest src1 src2));
 8084   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8085   ins_encode %{
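    // kortest ORs the mask with itself: ZF is set when no bit is set, CF when
    // all bits of the operand width are set, covering any-true and all-true.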
 8086     uint masklen = Matcher::vector_length(this, $src1);
 8087     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8088   %}
 8089   ins_pipe( pipe_slow );
 8090 %}
 8091 #endif
 8092 
 8093 //------------------------------------- LoadMask --------------------------------------------
 8094 
 8095 instruct loadMask(legVec dst, legVec src) %{
 8096   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8097   match(Set dst (VectorLoadMask src));
 8098   effect(TEMP dst);
 8099   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8100   ins_encode %{
 8101     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8102     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8103     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8104   %}
 8105   ins_pipe( pipe_slow );
 8106 %}
 8107 
 8108 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8109   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8110   match(Set dst (VectorLoadMask src));
 8111   effect(TEMP xtmp);
 8112   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8113   ins_encode %{
 8114     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8115                         true, Assembler::AVX_512bit);
 8116   %}
 8117   ins_pipe( pipe_slow );
 8118 %}
 8119 
instruct loadMask_evex(kReg dst, vec src, vec xtmp) %{
 8121   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8122   match(Set dst (VectorLoadMask src));
 8123   effect(TEMP xtmp);
 8124   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8125   ins_encode %{
 8126     int vlen_enc = vector_length_encoding(in(1));
 8127     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8128                         false, vlen_enc);
 8129   %}
 8130   ins_pipe( pipe_slow );
 8131 %}
 8132 
 8133 //------------------------------------- StoreMask --------------------------------------------
 8134 
 8135 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8136   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8137   match(Set dst (VectorStoreMask src size));
 8138   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8139   ins_encode %{
 8140     int vlen = Matcher::vector_length(this);
 8141     if (vlen <= 16 && UseAVX <= 2) {
 8142       assert(UseSSE >= 3, "required");
 8143       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8144     } else {
 8145       assert(UseAVX > 0, "required");
 8146       int src_vlen_enc = vector_length_encoding(this, $src);
 8147       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8148     }
 8149   %}
 8150   ins_pipe( pipe_slow );
 8151 %}
 8152 
 8153 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8154   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8155   match(Set dst (VectorStoreMask src size));
 8156   effect(TEMP_DEF dst, TEMP xtmp);
 8157   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8158   ins_encode %{
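    // Normalize each 16-bit mask lane (-1/0) to a 1/0 byte and pack down to
    // bytes; on AVX the upper 128-bit lane is folded in with vextracti128 +
    // vpacksswb first.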
 8159     int vlen_enc = Assembler::AVX_128bit;
 8160     int vlen = Matcher::vector_length(this);
 8161     if (vlen <= 8) {
 8162       assert(UseSSE >= 3, "required");
 8163       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8164       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8165       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8166     } else {
 8167       assert(UseAVX > 0, "required");
 8168       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8169       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8170       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8171     }
 8172   %}
 8173   ins_pipe( pipe_slow );
 8174 %}
 8175 
 8176 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8177   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8178   match(Set dst (VectorStoreMask src size));
 8179   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8180   effect(TEMP_DEF dst, TEMP xtmp);
 8181   ins_encode %{
 8182     int vlen_enc = Assembler::AVX_128bit;
 8183     int vlen = Matcher::vector_length(this);
 8184     if (vlen <= 4) {
 8185       assert(UseSSE >= 3, "required");
 8186       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8187       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8188       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8189       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8190     } else {
 8191       assert(UseAVX > 0, "required");
 8192       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8193       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8194       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8195       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8196       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8197     }
 8198   %}
 8199   ins_pipe( pipe_slow );
 8200 %}
 8201 
 8202 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8203   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8204   match(Set dst (VectorStoreMask src size));
 8205   effect(TEMP_DEF dst, TEMP xtmp);
 8206   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8207   ins_encode %{
 8208     assert(UseSSE >= 3, "required");
 8209     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8210     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8211     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8212     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8213     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8214   %}
 8215   ins_pipe( pipe_slow );
 8216 %}
 8217 
 8218 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8219   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8220   match(Set dst (VectorStoreMask src size));
 8221   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8222   effect(TEMP_DEF dst, TEMP vtmp);
 8223   ins_encode %{
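    // Gather the low dword of each 64-bit lane into the bottom 128 bits via
    // shuffle + extract + blend, then pack the dwords down to 1/0 bytes.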
 8224     int vlen_enc = Assembler::AVX_128bit;
 8225     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8226     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8227     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8228     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8229     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8230     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8231     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8232   %}
 8233   ins_pipe( pipe_slow );
 8234 %}
 8235 
 8236 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8237   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8238   match(Set dst (VectorStoreMask src size));
 8239   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8240   ins_encode %{
 8241     int src_vlen_enc = vector_length_encoding(this, $src);
 8242     int dst_vlen_enc = vector_length_encoding(this);
 8243     if (!VM_Version::supports_avx512vl()) {
 8244       src_vlen_enc = Assembler::AVX_512bit;
 8245     }
 8246     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8247     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8248   %}
 8249   ins_pipe( pipe_slow );
 8250 %}
 8251 
 8252 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8253   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8254   match(Set dst (VectorStoreMask src size));
 8255   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8256   ins_encode %{
 8257     int src_vlen_enc = vector_length_encoding(this, $src);
 8258     int dst_vlen_enc = vector_length_encoding(this);
 8259     if (!VM_Version::supports_avx512vl()) {
 8260       src_vlen_enc = Assembler::AVX_512bit;
 8261     }
 8262     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8263     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8264   %}
 8265   ins_pipe( pipe_slow );
 8266 %}
 8267 
 8268 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8269   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8270   match(Set dst (VectorStoreMask mask size));
 8271   effect(TEMP_DEF dst);
 8272   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8273   ins_encode %{
    assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "expected a 512-bit vector mask");
 8275     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8276                  false, Assembler::AVX_512bit, noreg);
 8277     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8278   %}
 8279   ins_pipe( pipe_slow );
 8280 %}
 8281 
 8282 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8283   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8284   match(Set dst (VectorStoreMask mask size));
 8285   effect(TEMP_DEF dst);
 8286   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8287   ins_encode %{
 8288     int dst_vlen_enc = vector_length_encoding(this);
 8289     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8290     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8291   %}
 8292   ins_pipe( pipe_slow );
 8293 %}
 8294 
 8295 instruct vmaskcast_evex(kReg dst) %{
 8296   match(Set dst (VectorMaskCast dst));
 8297   ins_cost(0);
 8298   format %{ "vector_mask_cast $dst" %}
 8299   ins_encode %{
 8300     // empty
 8301   %}
 8302   ins_pipe(empty);
 8303 %}
 8304 
 8305 instruct vmaskcast(vec dst) %{
 8306   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8307   match(Set dst (VectorMaskCast dst));
 8308   ins_cost(0);
 8309   format %{ "vector_mask_cast $dst" %}
 8310   ins_encode %{
 8311     // empty
 8312   %}
 8313   ins_pipe(empty);
 8314 %}
 8315 
 8316 instruct vmaskcast_avx(vec dst, vec src) %{
 8317   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8318   match(Set dst (VectorMaskCast src));
 8319   format %{ "vector_mask_cast $dst, $src" %}
 8320   ins_encode %{
 8321     int vlen = Matcher::vector_length(this);
 8322     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8323     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8324     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8325   %}
 8326   ins_pipe(pipe_slow);
 8327 %}
 8328 
 8329 //-------------------------------- Load Iota Indices ----------------------------------
 8330 
 8331 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8332   match(Set dst (VectorLoadConst src));
 8333   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8334   ins_encode %{
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8338   %}
 8339   ins_pipe( pipe_slow );
 8340 %}
 8341 
 8342 #ifdef _LP64
 8343 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8344   match(Set dst (PopulateIndex src1 src2));
 8345   effect(TEMP dst, TEMP vtmp);
 8346   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8347   ins_encode %{
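    // PopulateIndex yields [src1, src1+1, src1+2, ...]: broadcast the start
    // value and add the iota constant (the step $src2 is asserted to be 1).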
    assert($src2$$constant == 1, "required");
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
    __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
    __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8355   %}
 8356   ins_pipe( pipe_slow );
 8357 %}
 8358 
 8359 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8360   match(Set dst (PopulateIndex src1 src2));
 8361   effect(TEMP dst, TEMP vtmp);
 8362   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8363   ins_encode %{
    assert($src2$$constant == 1, "required");
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
    __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
    __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8371   %}
 8372   ins_pipe( pipe_slow );
 8373 %}
 8374 #endif
 8375 //-------------------------------- Rearrange ----------------------------------
 8376 
 8377 // LoadShuffle/Rearrange for Byte
 8378 
 8379 instruct loadShuffleB(vec dst) %{
 8380   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8381   match(Set dst (VectorLoadShuffle dst));
 8382   format %{ "vector_load_shuffle $dst, $dst" %}
 8383   ins_encode %{
 8384     // empty
 8385   %}
 8386   ins_pipe( pipe_slow );
 8387 %}
 8388 
 8389 instruct rearrangeB(vec dst, vec shuffle) %{
 8390   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8391             Matcher::vector_length(n) < 32);
 8392   match(Set dst (VectorRearrange dst shuffle));
 8393   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8394   ins_encode %{
 8395     assert(UseSSE >= 4, "required");
 8396     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8397   %}
 8398   ins_pipe( pipe_slow );
 8399 %}
 8400 
 8401 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8402   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8403             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8404   match(Set dst (VectorRearrange src shuffle));
 8405   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8406   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8407   ins_encode %{
 8408     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8410     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8411     // Shuffle swapped src to get entries from other 128 bit lane
 8412     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8413     // Shuffle original src to get entries from self 128 bit lane
 8414     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8415     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8416     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8417     // Perform the blend
 8418     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8424 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8425   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8426             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8427   match(Set dst (VectorRearrange src shuffle));
 8428   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8429   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8430   ins_encode %{
 8431     int vlen_enc = vector_length_encoding(this);
 8432     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8433                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8434                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8435   %}
 8436   ins_pipe( pipe_slow );
 8437 %}
 8438 
 8439 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8440   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8441             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8442   match(Set dst (VectorRearrange src shuffle));
 8443   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8444   ins_encode %{
 8445     int vlen_enc = vector_length_encoding(this);
 8446     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8447   %}
 8448   ins_pipe( pipe_slow );
 8449 %}
 8450 
 8451 // LoadShuffle/Rearrange for Short
 8452 
 8453 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8454   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8455             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8456   match(Set dst (VectorLoadShuffle src));
 8457   effect(TEMP dst, TEMP vtmp);
 8458   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8459   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8462     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8463     if (UseAVX == 0) {
 8464       assert(vlen_in_bytes <= 16, "required");
 8465       // Multiply each shuffle by two to get byte index
 8466       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8467       __ psllw($vtmp$$XMMRegister, 1);
 8468 
 8469       // Duplicate to create 2 copies of byte index
 8470       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8471       __ psllw($dst$$XMMRegister, 8);
 8472       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8473 
 8474       // Add one to get alternate byte index
 8475       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8476       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8477     } else {
 8478       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8479       int vlen_enc = vector_length_encoding(this);
 8480       // Multiply each shuffle by two to get byte index
 8481       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8482       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8483 
 8484       // Duplicate to create 2 copies of byte index
 8485       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8486       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8487 
 8488       // Add one to get alternate byte index
 8489       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8490     }
 8491   %}
 8492   ins_pipe( pipe_slow );
 8493 %}
 8494 
 8495 instruct rearrangeS(vec dst, vec shuffle) %{
 8496   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8497             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8498   match(Set dst (VectorRearrange dst shuffle));
 8499   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8500   ins_encode %{
 8501     assert(UseSSE >= 4, "required");
 8502     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8503   %}
 8504   ins_pipe( pipe_slow );
 8505 %}
 8506 
 8507 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8508   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8509             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8510   match(Set dst (VectorRearrange src shuffle));
 8511   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8512   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8513   ins_encode %{
 8514     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
 8516     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8517     // Shuffle swapped src to get entries from other 128 bit lane
 8518     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8519     // Shuffle original src to get entries from self 128 bit lane
 8520     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8521     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8522     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8523     // Perform the blend
 8524     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8525   %}
 8526   ins_pipe( pipe_slow );
 8527 %}
 8528 
 8529 instruct loadShuffleS_evex(vec dst, vec src) %{
 8530   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8531             VM_Version::supports_avx512bw());
 8532   match(Set dst (VectorLoadShuffle src));
 8533   format %{ "vector_load_shuffle $dst, $src" %}
 8534   ins_encode %{
 8535     int vlen_enc = vector_length_encoding(this);
 8536     if (!VM_Version::supports_avx512vl()) {
 8537       vlen_enc = Assembler::AVX_512bit;
 8538     }
 8539     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8540   %}
 8541   ins_pipe( pipe_slow );
 8542 %}
 8543 
 8544 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8545   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8546             VM_Version::supports_avx512bw());
 8547   match(Set dst (VectorRearrange src shuffle));
 8548   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8549   ins_encode %{
 8550     int vlen_enc = vector_length_encoding(this);
 8551     if (!VM_Version::supports_avx512vl()) {
 8552       vlen_enc = Assembler::AVX_512bit;
 8553     }
 8554     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8555   %}
 8556   ins_pipe( pipe_slow );
 8557 %}
 8558 
 8559 // LoadShuffle/Rearrange for Integer and Float
 8560 
 8561 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8562   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8563             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8564   match(Set dst (VectorLoadShuffle src));
 8565   effect(TEMP dst, TEMP vtmp);
 8566   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8567   ins_encode %{
 8568     assert(UseSSE >= 4, "required");
 8569 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8572 
 8573     // Duplicate and multiply each shuffle by 4
 8574     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8575     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8576     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8577     __ psllw($vtmp$$XMMRegister, 2);
 8578 
 8579     // Duplicate again to create 4 copies of byte index
 8580     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8581     __ psllw($dst$$XMMRegister, 8);
 8582     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8583 
 8584     // Add 3,2,1,0 to get alternate byte index
 8585     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8586     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8587   %}
 8588   ins_pipe( pipe_slow );
 8589 %}
 8590 
 8591 instruct rearrangeI(vec dst, vec shuffle) %{
 8592   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8593             UseAVX == 0);
 8594   match(Set dst (VectorRearrange dst shuffle));
 8595   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8596   ins_encode %{
 8597     assert(UseSSE >= 4, "required");
 8598     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8599   %}
 8600   ins_pipe( pipe_slow );
 8601 %}
 8602 
 8603 instruct loadShuffleI_avx(vec dst, vec src) %{
 8604   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8605             UseAVX > 0);
 8606   match(Set dst (VectorLoadShuffle src));
 8607   format %{ "vector_load_shuffle $dst, $src" %}
 8608   ins_encode %{
 8609     int vlen_enc = vector_length_encoding(this);
 8610     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8611   %}
 8612   ins_pipe( pipe_slow );
 8613 %}
 8614 
 8615 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8616   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8617             UseAVX > 0);
 8618   match(Set dst (VectorRearrange src shuffle));
 8619   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8620   ins_encode %{
 8621     int vlen_enc = vector_length_encoding(this);
 8622     BasicType bt = Matcher::vector_element_basic_type(this);
 8623     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8624   %}
 8625   ins_pipe( pipe_slow );
 8626 %}
 8627 
 8628 // LoadShuffle/Rearrange for Long and Double
 8629 
 8630 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8631   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8632             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8633   match(Set dst (VectorLoadShuffle src));
 8634   effect(TEMP dst, TEMP vtmp);
 8635   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8636   ins_encode %{
 8637     assert(UseAVX >= 2, "required");
 8638 
 8639     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
 8642 
 8643     // Multiply each shuffle by two to get double word index
 8644     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8645     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8646 
 8647     // Duplicate each double word shuffle
 8648     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8649     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8650 
 8651     // Add one to get alternate double word index
 8652     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8653   %}
 8654   ins_pipe( pipe_slow );
 8655 %}
 8656 
 8657 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8658   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8659             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8660   match(Set dst (VectorRearrange src shuffle));
 8661   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8662   ins_encode %{
 8663     assert(UseAVX >= 2, "required");
 8664 
 8665     int vlen_enc = vector_length_encoding(this);
 8666     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8667   %}
 8668   ins_pipe( pipe_slow );
 8669 %}
 8670 
 8671 instruct loadShuffleL_evex(vec dst, vec src) %{
 8672   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8673             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8674   match(Set dst (VectorLoadShuffle src));
 8675   format %{ "vector_load_shuffle $dst, $src" %}
 8676   ins_encode %{
 8677     assert(UseAVX > 2, "required");
 8678 
 8679     int vlen_enc = vector_length_encoding(this);
 8680     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8681   %}
 8682   ins_pipe( pipe_slow );
 8683 %}
 8684 
 8685 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8686   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8687             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8688   match(Set dst (VectorRearrange src shuffle));
 8689   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8690   ins_encode %{
 8691     assert(UseAVX > 2, "required");
 8692 
 8693     int vlen_enc = vector_length_encoding(this);
 8694     if (vlen_enc == Assembler::AVX_128bit) {
 8695       vlen_enc = Assembler::AVX_256bit;
 8696     }
 8697     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8698   %}
 8699   ins_pipe( pipe_slow );
 8700 %}
 8701 
 8702 // --------------------------------- FMA --------------------------------------
 8703 // a * b + c
 8704 
 8705 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8706   match(Set c (FmaVF  c (Binary a b)));
 8707   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8708   ins_cost(150);
 8709   ins_encode %{
 8710     assert(UseFMA, "not enabled");
 8711     int vlen_enc = vector_length_encoding(this);
 8712     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8713   %}
 8714   ins_pipe( pipe_slow );
 8715 %}
 8716 
 8717 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8718   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8719   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8720   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8721   ins_cost(150);
 8722   ins_encode %{
 8723     assert(UseFMA, "not enabled");
 8724     int vlen_enc = vector_length_encoding(this);
 8725     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8726   %}
 8727   ins_pipe( pipe_slow );
 8728 %}
 8729 
 8730 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8731   match(Set c (FmaVD  c (Binary a b)));
 8732   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8733   ins_cost(150);
 8734   ins_encode %{
 8735     assert(UseFMA, "not enabled");
 8736     int vlen_enc = vector_length_encoding(this);
 8737     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8738   %}
 8739   ins_pipe( pipe_slow );
 8740 %}
 8741 
 8742 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8743   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8744   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8745   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8746   ins_cost(150);
 8747   ins_encode %{
 8748     assert(UseFMA, "not enabled");
 8749     int vlen_enc = vector_length_encoding(this);
 8750     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8751   %}
 8752   ins_pipe( pipe_slow );
 8753 %}
 8754 
 8755 // --------------------------------- Vector Multiply Add --------------------------------------
 8756 
 8757 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8758   predicate(UseAVX == 0);
 8759   match(Set dst (MulAddVS2VI dst src1));
 8760   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8761   ins_encode %{
 8762     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8763   %}
 8764   ins_pipe( pipe_slow );
 8765 %}
 8766 
 8767 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8768   predicate(UseAVX > 0);
 8769   match(Set dst (MulAddVS2VI src1 src2));
 8770   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8771   ins_encode %{
 8772     int vlen_enc = vector_length_encoding(this);
 8773     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8774   %}
 8775   ins_pipe( pipe_slow );
 8776 %}
 8777 
 8778 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8779 
 8780 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8781   predicate(VM_Version::supports_avx512_vnni());
 8782   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8783   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8784   ins_encode %{
 8785     assert(UseAVX > 2, "required");
 8786     int vlen_enc = vector_length_encoding(this);
 8787     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8788   %}
 8789   ins_pipe( pipe_slow );
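  // The low cost biases instruction selection toward this fused VNNI form
  // over the separate multiply-add and add rules.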
 8790   ins_cost(10);
 8791 %}
 8792 
 8793 // --------------------------------- PopCount --------------------------------------
 8794 
 8795 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8796   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8797   match(Set dst (PopCountVI src));
 8798   match(Set dst (PopCountVL src));
 8799   format %{ "vector_popcount_integral $dst, $src" %}
 8800   ins_encode %{
 8802     int vlen_enc = vector_length_encoding(this, $src);
 8803     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8804     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8805   %}
 8806   ins_pipe( pipe_slow );
 8807 %}
 8808 
 8809 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8810   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8811   match(Set dst (PopCountVI src mask));
 8812   match(Set dst (PopCountVL src mask));
 8813   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8814   ins_encode %{
 8815     int vlen_enc = vector_length_encoding(this, $src);
 8816     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8817     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8818     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8819   %}
 8820   ins_pipe( pipe_slow );
 8821 %}
 8822 
 8823 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8824   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8825   match(Set dst (PopCountVI src));
 8826   match(Set dst (PopCountVL src));
 8827   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8828   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8829   ins_encode %{
 8831     int vlen_enc = vector_length_encoding(this, $src);
 8832     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8833     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8834                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8835   %}
 8836   ins_pipe( pipe_slow );
 8837 %}
 8838 
 8839 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8840 
 8841 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8842   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8843                                               Matcher::vector_length_in_bytes(n->in(1))));
 8844   match(Set dst (CountTrailingZerosV src));
 8845   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8846   ins_cost(400);
 8847   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 8848   ins_encode %{
 8849     int vlen_enc = vector_length_encoding(this, $src);
 8850     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8851     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8852                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8853   %}
 8854   ins_pipe( pipe_slow );
 8855 %}
 8856 
 8857 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8858   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8859             VM_Version::supports_avx512cd() &&
 8860             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8861   match(Set dst (CountTrailingZerosV src));
 8862   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8863   ins_cost(400);
 8864   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8865   ins_encode %{
 8866     int vlen_enc = vector_length_encoding(this, $src);
 8867     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8868     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8869                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8870   %}
 8871   ins_pipe( pipe_slow );
 8872 %}
 8873 
 8874 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8875   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8876   match(Set dst (CountTrailingZerosV src));
 8877   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8878   ins_cost(400);
 8879   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8880   ins_encode %{
 8881     int vlen_enc = vector_length_encoding(this, $src);
 8882     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8883     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8884                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8885                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8886   %}
 8887   ins_pipe( pipe_slow );
 8888 %}
 8889 
 8890 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8891   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8892   match(Set dst (CountTrailingZerosV src));
 8893   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8894   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8895   ins_encode %{
 8896     int vlen_enc = vector_length_encoding(this, $src);
 8897     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8898     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8899                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8900   %}
 8901   ins_pipe( pipe_slow );
 8902 %}
 8903 
 8905 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8906 
 8907 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8908   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8909   effect(TEMP dst);
 8910   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8911   ins_encode %{
 8912     int vector_len = vector_length_encoding(this);
 8913     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8914   %}
 8915   ins_pipe( pipe_slow );
 8916 %}
 8917 
 8918 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8919   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8920   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8921   effect(TEMP dst);
 8922   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8923   ins_encode %{
 8924     int vector_len = vector_length_encoding(this);
 8925     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8926   %}
 8927   ins_pipe( pipe_slow );
 8928 %}
 8929 
 8930 // --------------------------------- Rotation Operations ----------------------------------
 8931 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8932   match(Set dst (RotateLeftV src shift));
 8933   match(Set dst (RotateRightV src shift));
 8934   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8935   ins_encode %{
 8936     int opcode      = this->ideal_Opcode();
 8937     int vector_len  = vector_length_encoding(this);
 8938     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8939     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8940   %}
 8941   ins_pipe( pipe_slow );
 8942 %}
 8943 
instruct vprotate_reg(vec dst, vec src, vec shift) %{
 8945   match(Set dst (RotateLeftV src shift));
 8946   match(Set dst (RotateRightV src shift));
 8947   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8948   ins_encode %{
 8949     int opcode      = this->ideal_Opcode();
 8950     int vector_len  = vector_length_encoding(this);
 8951     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8952     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8953   %}
 8954   ins_pipe( pipe_slow );
 8955 %}
 8956 
 8957 // ---------------------------------- Masked Operations ------------------------------------
 8958 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8959   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8960   match(Set dst (LoadVectorMasked mem mask));
 8961   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8962   ins_encode %{
 8963     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8964     int vlen_enc = vector_length_encoding(this);
 8965     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8966   %}
 8967   ins_pipe( pipe_slow );
 8968 %}
 8969 
 8971 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8972   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8973   match(Set dst (LoadVectorMasked mem mask));
 8974   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8975   ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8977     int vector_len = vector_length_encoding(this);
 8978     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 8979   %}
 8980   ins_pipe( pipe_slow );
 8981 %}
 8982 
 8983 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 8984   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8985   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8986   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8987   ins_encode %{
 8988     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8989     int vlen_enc = vector_length_encoding(src_node);
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 8991     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8992   %}
 8993   ins_pipe( pipe_slow );
 8994 %}
 8995 
 8996 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 8997   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8998   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8999   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9000   ins_encode %{
 9001     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 9003     int vlen_enc = vector_length_encoding(src_node);
 9004     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9005   %}
 9006   ins_pipe( pipe_slow );
 9007 %}
 9008 
 9009 #ifdef _LP64
 9010 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9011   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9012   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9013   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9014   ins_encode %{
 9015     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9016     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9017 
 9018     Label DONE;
 9019     int vlen_enc = vector_length_encoding(this, $src1);
 9020     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9021 
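    // Sketch of the algorithm (the mask is expected to be a prefix mask, as
    // produced by VectorMaskGen): lanes outside the mask count as matching.
    //   ktmp2 = ~mask,  ktmp1 = (src1 == src2) & mask
    // kortest sets CF iff ktmp2 | ktmp1 is all ones, i.e. every masked lane
    // compared equal, in which case dst keeps -1; otherwise tzcnt over ~ktmp1
    // returns the index of the first mismatching lane.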
 9022     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9023     __ mov64($dst$$Register, -1L);
 9024     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9025     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9026     __ jccb(Assembler::carrySet, DONE);
 9027     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9028     __ notq($dst$$Register);
 9029     __ tzcntq($dst$$Register, $dst$$Register);
 9030     __ bind(DONE);
 9031   %}
 9032   ins_pipe( pipe_slow );
 9033 %}
 9034 
 9036 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
 9037   match(Set dst (VectorMaskGen len));
 9038   effect(TEMP temp);
 9039   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9040   ins_encode %{
 9041     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9042   %}
 9043   ins_pipe( pipe_slow );
 9044 %}
 9045 
 9046 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9047   match(Set dst (VectorMaskGen len));
 9048   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9049   effect(TEMP temp);
 9050   ins_encode %{
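    // Materialize $len consecutive one bits; e.g. len == 5 yields 0x1F.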
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9052     __ kmovql($dst$$KRegister, $temp$$Register);
 9053   %}
 9054   ins_pipe( pipe_slow );
 9055 %}
 9056 
 9057 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9058   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9059   match(Set dst (VectorMaskToLong mask));
 9060   effect(TEMP dst, KILL cr);
 9061   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9062   ins_encode %{
 9063     int opcode = this->ideal_Opcode();
 9064     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9065     int mask_len = Matcher::vector_length(this, $mask);
 9066     int mask_size = mask_len * type2aelembytes(mbt);
 9067     int vlen_enc = vector_length_encoding(this, $mask);
 9068     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9069                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9070   %}
 9071   ins_pipe( pipe_slow );
 9072 %}
 9073 
 9074 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9075   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9076   match(Set dst (VectorMaskToLong mask));
 9077   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9078   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9079   ins_encode %{
 9080     int opcode = this->ideal_Opcode();
 9081     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9082     int mask_len = Matcher::vector_length(this, $mask);
 9083     int vlen_enc = vector_length_encoding(this, $mask);
 9084     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9085                              $dst$$Register, mask_len, mbt, vlen_enc);
 9086   %}
 9087   ins_pipe( pipe_slow );
 9088 %}
 9089 
 9090 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9091   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9092   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9093   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9094   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9095   ins_encode %{
 9096     int opcode = this->ideal_Opcode();
 9097     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9098     int mask_len = Matcher::vector_length(this, $mask);
 9099     int vlen_enc = vector_length_encoding(this, $mask);
 9100     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9101                              $dst$$Register, mask_len, mbt, vlen_enc);
 9102   %}
 9103   ins_pipe( pipe_slow );
 9104 %}
 9105 
 9106 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9107   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9108   match(Set dst (VectorMaskTrueCount mask));
 9109   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9110   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9111   ins_encode %{
 9112     int opcode = this->ideal_Opcode();
 9113     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9114     int mask_len = Matcher::vector_length(this, $mask);
 9115     int mask_size = mask_len * type2aelembytes(mbt);
 9116     int vlen_enc = vector_length_encoding(this, $mask);
 9117     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9118                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9119   %}
 9120   ins_pipe( pipe_slow );
 9121 %}
 9122 
 9123 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9124   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9125   match(Set dst (VectorMaskTrueCount mask));
 9126   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9127   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9128   ins_encode %{
 9129     int opcode = this->ideal_Opcode();
 9130     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9131     int mask_len = Matcher::vector_length(this, $mask);
 9132     int vlen_enc = vector_length_encoding(this, $mask);
 9133     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9134                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9135   %}
 9136   ins_pipe( pipe_slow );
 9137 %}
 9138 
 9139 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9140   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9141   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9142   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9143   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9144   ins_encode %{
 9145     int opcode = this->ideal_Opcode();
 9146     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9147     int mask_len = Matcher::vector_length(this, $mask);
 9148     int vlen_enc = vector_length_encoding(this, $mask);
 9149     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9150                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9151   %}
 9152   ins_pipe( pipe_slow );
 9153 %}
 9154 
 9155 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9156   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9157   match(Set dst (VectorMaskFirstTrue mask));
 9158   match(Set dst (VectorMaskLastTrue mask));
 9159   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9160   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9161   ins_encode %{
 9162     int opcode = this->ideal_Opcode();
 9163     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9164     int mask_len = Matcher::vector_length(this, $mask);
 9165     int mask_size = mask_len * type2aelembytes(mbt);
 9166     int vlen_enc = vector_length_encoding(this, $mask);
 9167     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9168                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9169   %}
 9170   ins_pipe( pipe_slow );
 9171 %}
 9172 
 9173 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9174   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9175   match(Set dst (VectorMaskFirstTrue mask));
 9176   match(Set dst (VectorMaskLastTrue mask));
 9177   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9178   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9179   ins_encode %{
 9180     int opcode = this->ideal_Opcode();
 9181     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9182     int mask_len = Matcher::vector_length(this, $mask);
 9183     int vlen_enc = vector_length_encoding(this, $mask);
 9184     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9185                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9186   %}
 9187   ins_pipe( pipe_slow );
 9188 %}
 9189 
 9190 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9191   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9192   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9193   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9194   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9195   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9196   ins_encode %{
 9197     int opcode = this->ideal_Opcode();
 9198     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9199     int mask_len = Matcher::vector_length(this, $mask);
 9200     int vlen_enc = vector_length_encoding(this, $mask);
 9201     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9202                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9203   %}
 9204   ins_pipe( pipe_slow );
 9205 %}
 9206 
 9207 // --------------------------------- Compress/Expand Operations ---------------------------
 9208 
 9209 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9210   match(Set dst (CompressV src mask));
 9211   match(Set dst (ExpandV src mask));
 9212   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9213   ins_encode %{
 9214     int opcode = this->ideal_Opcode();
 9215     int vector_len = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
 9217     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9218   %}
 9219   ins_pipe( pipe_slow );
 9220 %}
 9221 
 9222 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9223   match(Set dst (CompressM mask));
 9224   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9225   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9226   ins_encode %{
    assert(this->in(1)->bottom_type()->isa_vectmask(), "expected vector mask");
 9228     int mask_len = Matcher::vector_length(this);
 9229     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9230   %}
 9231   ins_pipe( pipe_slow );
 9232 %}
 9233 
 9234 #endif // _LP64
 9235 
 9236 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9237 
 9238 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9239   predicate(!VM_Version::supports_gfni());
 9240   match(Set dst (ReverseV src));
 9241   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9242   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9243   ins_encode %{
 9244     int vec_enc = vector_length_encoding(this);
 9245     BasicType bt = Matcher::vector_element_basic_type(this);
 9246     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9247                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9248   %}
 9249   ins_pipe( pipe_slow );
 9250 %}
 9251 
 9252 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9253   predicate(VM_Version::supports_gfni());
 9254   match(Set dst (ReverseV src));
 9255   effect(TEMP dst, TEMP xtmp);
 9256   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9257   ins_encode %{
 9258     int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
 9260     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9261     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9262                                $xtmp$$XMMRegister);
 9263   %}
 9264   ins_pipe( pipe_slow );
 9265 %}
 9266 
 9267 instruct vreverse_byte_reg(vec dst, vec src) %{
 9268   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9269   match(Set dst (ReverseBytesV src));
 9270   effect(TEMP dst);
 9271   format %{ "vector_reverse_byte $dst, $src" %}
 9272   ins_encode %{
 9273     int vec_enc = vector_length_encoding(this);
 9274     BasicType bt = Matcher::vector_element_basic_type(this);
 9275     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9276   %}
 9277   ins_pipe( pipe_slow );
 9278 %}
 9279 
 9280 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9281   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9282   match(Set dst (ReverseBytesV src));
 9283   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9284   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9285   ins_encode %{
 9286     int vec_enc = vector_length_encoding(this);
 9287     BasicType bt = Matcher::vector_element_basic_type(this);
 9288     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9289                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9290   %}
 9291   ins_pipe( pipe_slow );
 9292 %}
 9293 
 9294 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9295 
 9296 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9297   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9298                                               Matcher::vector_length_in_bytes(n->in(1))));
 9299   match(Set dst (CountLeadingZerosV src));
 9300   format %{ "vector_count_leading_zeros $dst, $src" %}
 9301   ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                       xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9306   %}
 9307   ins_pipe( pipe_slow );
 9308 %}
 9309 
 9310 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9311   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9312                                               Matcher::vector_length_in_bytes(n->in(1))));
 9313   match(Set dst (CountLeadingZerosV src mask));
 9314   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9315   ins_encode %{
 9316     int vlen_enc = vector_length_encoding(this, $src);
 9317     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9318     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9319     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9320                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9321   %}
 9322   ins_pipe( pipe_slow );
 9323 %}
 9324 
 9325 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9326   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9327             VM_Version::supports_avx512cd() &&
 9328             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9329   match(Set dst (CountLeadingZerosV src));
 9330   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9331   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9332   ins_encode %{
 9333     int vlen_enc = vector_length_encoding(this, $src);
 9334     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9335     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9336                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9337   %}
 9338   ins_pipe( pipe_slow );
 9339 %}
 9340 
 9341 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9342   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9343   match(Set dst (CountLeadingZerosV src));
 9344   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9345   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9346   ins_encode %{
 9347     int vlen_enc = vector_length_encoding(this, $src);
 9348     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9349     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9350                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9351                                        $rtmp$$Register, true, vlen_enc);
 9352   %}
 9353   ins_pipe( pipe_slow );
 9354 %}
 9355 
 9356 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9357   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9358             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9359   match(Set dst (CountLeadingZerosV src));
 9360   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9361   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9362   ins_encode %{
 9363     int vlen_enc = vector_length_encoding(this, $src);
 9364     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9365     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9366                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9367   %}
 9368   ins_pipe( pipe_slow );
 9369 %}
 9370 
 9371 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9372   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9373             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9374   match(Set dst (CountLeadingZerosV src));
 9375   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9376   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9377   ins_encode %{
 9378     int vlen_enc = vector_length_encoding(this, $src);
 9379     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9380     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9381                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9382   %}
 9383   ins_pipe( pipe_slow );
 9384 %}
 9385 
 9386 // ---------------------------------- Vector Masked Operations ------------------------------------
 9387 
 9388 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9389   match(Set dst (AddVB (Binary dst src2) mask));
 9390   match(Set dst (AddVS (Binary dst src2) mask));
 9391   match(Set dst (AddVI (Binary dst src2) mask));
 9392   match(Set dst (AddVL (Binary dst src2) mask));
 9393   match(Set dst (AddVF (Binary dst src2) mask));
 9394   match(Set dst (AddVD (Binary dst src2) mask));
 9395   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9396   ins_encode %{
 9397     int vlen_enc = vector_length_encoding(this);
 9398     BasicType bt = Matcher::vector_element_basic_type(this);
 9399     int opc = this->ideal_Opcode();
 9400     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9401                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9402   %}
 9403   ins_pipe( pipe_slow );
 9404 %}
 9405 
 9406 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9407   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9408   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9409   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9410   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9411   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9412   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9413   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9414   ins_encode %{
 9415     int vlen_enc = vector_length_encoding(this);
 9416     BasicType bt = Matcher::vector_element_basic_type(this);
 9417     int opc = this->ideal_Opcode();
 9418     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9419                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9420   %}
 9421   ins_pipe( pipe_slow );
 9422 %}
 9423 
 9424 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9425   match(Set dst (XorV (Binary dst src2) mask));
 9426   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9427   ins_encode %{
 9428     int vlen_enc = vector_length_encoding(this);
 9429     BasicType bt = Matcher::vector_element_basic_type(this);
 9430     int opc = this->ideal_Opcode();
 9431     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9432                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9433   %}
 9434   ins_pipe( pipe_slow );
 9435 %}
 9436 
 9437 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9438   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9439   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9440   ins_encode %{
 9441     int vlen_enc = vector_length_encoding(this);
 9442     BasicType bt = Matcher::vector_element_basic_type(this);
 9443     int opc = this->ideal_Opcode();
 9444     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9445                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9446   %}
 9447   ins_pipe( pipe_slow );
 9448 %}
 9449 
 9450 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9451   match(Set dst (OrV (Binary dst src2) mask));
 9452   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9453   ins_encode %{
 9454     int vlen_enc = vector_length_encoding(this);
 9455     BasicType bt = Matcher::vector_element_basic_type(this);
 9456     int opc = this->ideal_Opcode();
 9457     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9458                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9459   %}
 9460   ins_pipe( pipe_slow );
 9461 %}
 9462 
 9463 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9464   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9465   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9466   ins_encode %{
 9467     int vlen_enc = vector_length_encoding(this);
 9468     BasicType bt = Matcher::vector_element_basic_type(this);
 9469     int opc = this->ideal_Opcode();
 9470     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9471                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9472   %}
 9473   ins_pipe( pipe_slow );
 9474 %}
 9475 
 9476 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9477   match(Set dst (AndV (Binary dst src2) mask));
 9478   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9479   ins_encode %{
 9480     int vlen_enc = vector_length_encoding(this);
 9481     BasicType bt = Matcher::vector_element_basic_type(this);
 9482     int opc = this->ideal_Opcode();
 9483     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9484                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9485   %}
 9486   ins_pipe( pipe_slow );
 9487 %}
 9488 
 9489 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9490   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9491   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9492   ins_encode %{
 9493     int vlen_enc = vector_length_encoding(this);
 9494     BasicType bt = Matcher::vector_element_basic_type(this);
 9495     int opc = this->ideal_Opcode();
 9496     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9497                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9498   %}
 9499   ins_pipe( pipe_slow );
 9500 %}
 9501 
 9502 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9503   match(Set dst (SubVB (Binary dst src2) mask));
 9504   match(Set dst (SubVS (Binary dst src2) mask));
 9505   match(Set dst (SubVI (Binary dst src2) mask));
 9506   match(Set dst (SubVL (Binary dst src2) mask));
 9507   match(Set dst (SubVF (Binary dst src2) mask));
 9508   match(Set dst (SubVD (Binary dst src2) mask));
 9509   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9510   ins_encode %{
 9511     int vlen_enc = vector_length_encoding(this);
 9512     BasicType bt = Matcher::vector_element_basic_type(this);
 9513     int opc = this->ideal_Opcode();
 9514     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9515                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9516   %}
 9517   ins_pipe( pipe_slow );
 9518 %}
 9519 
 9520 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9521   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9522   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9523   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9524   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9525   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9526   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9527   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9528   ins_encode %{
 9529     int vlen_enc = vector_length_encoding(this);
 9530     BasicType bt = Matcher::vector_element_basic_type(this);
 9531     int opc = this->ideal_Opcode();
 9532     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9533                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9534   %}
 9535   ins_pipe( pipe_slow );
 9536 %}
 9537 
 9538 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9539   match(Set dst (MulVS (Binary dst src2) mask));
 9540   match(Set dst (MulVI (Binary dst src2) mask));
 9541   match(Set dst (MulVL (Binary dst src2) mask));
 9542   match(Set dst (MulVF (Binary dst src2) mask));
 9543   match(Set dst (MulVD (Binary dst src2) mask));
 9544   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9545   ins_encode %{
 9546     int vlen_enc = vector_length_encoding(this);
 9547     BasicType bt = Matcher::vector_element_basic_type(this);
 9548     int opc = this->ideal_Opcode();
 9549     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9550                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9551   %}
 9552   ins_pipe( pipe_slow );
 9553 %}
 9554 
 9555 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9556   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9557   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9558   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9559   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9560   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9561   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9562   ins_encode %{
 9563     int vlen_enc = vector_length_encoding(this);
 9564     BasicType bt = Matcher::vector_element_basic_type(this);
 9565     int opc = this->ideal_Opcode();
 9566     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9567                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9568   %}
 9569   ins_pipe( pipe_slow );
 9570 %}
 9571 
 9572 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9573   match(Set dst (SqrtVF dst mask));
 9574   match(Set dst (SqrtVD dst mask));
 9575   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9576   ins_encode %{
 9577     int vlen_enc = vector_length_encoding(this);
 9578     BasicType bt = Matcher::vector_element_basic_type(this);
 9579     int opc = this->ideal_Opcode();
 9580     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9581                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9582   %}
 9583   ins_pipe( pipe_slow );
 9584 %}
 9585 
 9586 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9587   match(Set dst (DivVF (Binary dst src2) mask));
 9588   match(Set dst (DivVD (Binary dst src2) mask));
 9589   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9590   ins_encode %{
 9591     int vlen_enc = vector_length_encoding(this);
 9592     BasicType bt = Matcher::vector_element_basic_type(this);
 9593     int opc = this->ideal_Opcode();
 9594     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9595                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9596   %}
 9597   ins_pipe( pipe_slow );
 9598 %}
 9599 
 9600 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9601   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9602   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9603   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9604   ins_encode %{
 9605     int vlen_enc = vector_length_encoding(this);
 9606     BasicType bt = Matcher::vector_element_basic_type(this);
 9607     int opc = this->ideal_Opcode();
 9608     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9609                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9610   %}
 9611   ins_pipe( pipe_slow );
 9612 %}
 9613 
 9615 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9616   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9617   match(Set dst (RotateRightV (Binary dst shift) mask));
 9618   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9619   ins_encode %{
 9620     int vlen_enc = vector_length_encoding(this);
 9621     BasicType bt = Matcher::vector_element_basic_type(this);
 9622     int opc = this->ideal_Opcode();
 9623     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9624                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9625   %}
 9626   ins_pipe( pipe_slow );
 9627 %}
 9628 
 9629 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9630   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9631   match(Set dst (RotateRightV (Binary dst src2) mask));
 9632   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9633   ins_encode %{
 9634     int vlen_enc = vector_length_encoding(this);
 9635     BasicType bt = Matcher::vector_element_basic_type(this);
 9636     int opc = this->ideal_Opcode();
 9637     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9638                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9639   %}
 9640   ins_pipe( pipe_slow );
 9641 %}
 9642 
 9643 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9644   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9645   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9646   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9647   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9648   ins_encode %{
 9649     int vlen_enc = vector_length_encoding(this);
 9650     BasicType bt = Matcher::vector_element_basic_type(this);
 9651     int opc = this->ideal_Opcode();
 9652     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9653                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9654   %}
 9655   ins_pipe( pipe_slow );
 9656 %}
 9657 
 9658 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9659   predicate(!n->as_ShiftV()->is_var_shift());
 9660   match(Set dst (LShiftVS (Binary dst src2) mask));
 9661   match(Set dst (LShiftVI (Binary dst src2) mask));
 9662   match(Set dst (LShiftVL (Binary dst src2) mask));
 9663   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9664   ins_encode %{
 9665     int vlen_enc = vector_length_encoding(this);
 9666     BasicType bt = Matcher::vector_element_basic_type(this);
 9667     int opc = this->ideal_Opcode();
 9668     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9669                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9670   %}
 9671   ins_pipe( pipe_slow );
 9672 %}
 9673 
 9674 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9675   predicate(n->as_ShiftV()->is_var_shift());
 9676   match(Set dst (LShiftVS (Binary dst src2) mask));
 9677   match(Set dst (LShiftVI (Binary dst src2) mask));
 9678   match(Set dst (LShiftVL (Binary dst src2) mask));
 9679   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9680   ins_encode %{
 9681     int vlen_enc = vector_length_encoding(this);
 9682     BasicType bt = Matcher::vector_element_basic_type(this);
 9683     int opc = this->ideal_Opcode();
 9684     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9685                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9686   %}
 9687   ins_pipe( pipe_slow );
 9688 %}
 9689 
 9690 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9691   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9692   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9693   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9694   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9695   ins_encode %{
 9696     int vlen_enc = vector_length_encoding(this);
 9697     BasicType bt = Matcher::vector_element_basic_type(this);
 9698     int opc = this->ideal_Opcode();
 9699     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9700                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9701   %}
 9702   ins_pipe( pipe_slow );
 9703 %}
 9704 
 9705 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9706   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9707   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9708   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9709   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9710   ins_encode %{
 9711     int vlen_enc = vector_length_encoding(this);
 9712     BasicType bt = Matcher::vector_element_basic_type(this);
 9713     int opc = this->ideal_Opcode();
 9714     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9715                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9716   %}
 9717   ins_pipe( pipe_slow );
 9718 %}
 9719 
 9720 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9721   predicate(!n->as_ShiftV()->is_var_shift());
 9722   match(Set dst (RShiftVS (Binary dst src2) mask));
 9723   match(Set dst (RShiftVI (Binary dst src2) mask));
 9724   match(Set dst (RShiftVL (Binary dst src2) mask));
 9725   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9726   ins_encode %{
 9727     int vlen_enc = vector_length_encoding(this);
 9728     BasicType bt = Matcher::vector_element_basic_type(this);
 9729     int opc = this->ideal_Opcode();
 9730     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9731                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9732   %}
 9733   ins_pipe( pipe_slow );
 9734 %}
 9735 
 9736 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9737   predicate(n->as_ShiftV()->is_var_shift());
 9738   match(Set dst (RShiftVS (Binary dst src2) mask));
 9739   match(Set dst (RShiftVI (Binary dst src2) mask));
 9740   match(Set dst (RShiftVL (Binary dst src2) mask));
 9741   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9742   ins_encode %{
 9743     int vlen_enc = vector_length_encoding(this);
 9744     BasicType bt = Matcher::vector_element_basic_type(this);
 9745     int opc = this->ideal_Opcode();
 9746     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9747                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9748   %}
 9749   ins_pipe( pipe_slow );
 9750 %}
 9751 
 9752 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9753   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9754   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9755   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9756   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9757   ins_encode %{
 9758     int vlen_enc = vector_length_encoding(this);
 9759     BasicType bt = Matcher::vector_element_basic_type(this);
 9760     int opc = this->ideal_Opcode();
 9761     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9762                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9763   %}
 9764   ins_pipe( pipe_slow );
 9765 %}
 9766 
 9767 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9768   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9769   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9770   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9771   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9772   ins_encode %{
 9773     int vlen_enc = vector_length_encoding(this);
 9774     BasicType bt = Matcher::vector_element_basic_type(this);
 9775     int opc = this->ideal_Opcode();
 9776     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9777                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9778   %}
 9779   ins_pipe( pipe_slow );
 9780 %}
 9781 
 9782 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9783   predicate(!n->as_ShiftV()->is_var_shift());
 9784   match(Set dst (URShiftVS (Binary dst src2) mask));
 9785   match(Set dst (URShiftVI (Binary dst src2) mask));
 9786   match(Set dst (URShiftVL (Binary dst src2) mask));
 9787   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9788   ins_encode %{
 9789     int vlen_enc = vector_length_encoding(this);
 9790     BasicType bt = Matcher::vector_element_basic_type(this);
 9791     int opc = this->ideal_Opcode();
 9792     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9793                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9794   %}
 9795   ins_pipe( pipe_slow );
 9796 %}
 9797 
 9798 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9799   predicate(n->as_ShiftV()->is_var_shift());
 9800   match(Set dst (URShiftVS (Binary dst src2) mask));
 9801   match(Set dst (URShiftVI (Binary dst src2) mask));
 9802   match(Set dst (URShiftVL (Binary dst src2) mask));
 9803   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9804   ins_encode %{
 9805     int vlen_enc = vector_length_encoding(this);
 9806     BasicType bt = Matcher::vector_element_basic_type(this);
 9807     int opc = this->ideal_Opcode();
 9808     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9809                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9810   %}
 9811   ins_pipe( pipe_slow );
 9812 %}
 9813 
 9814 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9815   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9816   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9817   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9818   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9819   ins_encode %{
 9820     int vlen_enc = vector_length_encoding(this);
 9821     BasicType bt = Matcher::vector_element_basic_type(this);
 9822     int opc = this->ideal_Opcode();
 9823     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9824                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9825   %}
 9826   ins_pipe( pipe_slow );
 9827 %}
 9828 
 9829 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9830   match(Set dst (MaxV (Binary dst src2) mask));
 9831   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9832   ins_encode %{
 9833     int vlen_enc = vector_length_encoding(this);
 9834     BasicType bt = Matcher::vector_element_basic_type(this);
 9835     int opc = this->ideal_Opcode();
 9836     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9837                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9838   %}
 9839   ins_pipe( pipe_slow );
 9840 %}
 9841 
 9842 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9843   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9844   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9845   ins_encode %{
 9846     int vlen_enc = vector_length_encoding(this);
 9847     BasicType bt = Matcher::vector_element_basic_type(this);
 9848     int opc = this->ideal_Opcode();
 9849     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9850                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9851   %}
 9852   ins_pipe( pipe_slow );
 9853 %}
 9854 
 9855 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9856   match(Set dst (MinV (Binary dst src2) mask));
 9857   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9858   ins_encode %{
 9859     int vlen_enc = vector_length_encoding(this);
 9860     BasicType bt = Matcher::vector_element_basic_type(this);
 9861     int opc = this->ideal_Opcode();
 9862     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9863                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9864   %}
 9865   ins_pipe( pipe_slow );
 9866 %}
 9867 
 9868 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9869   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9870   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9871   ins_encode %{
 9872     int vlen_enc = vector_length_encoding(this);
 9873     BasicType bt = Matcher::vector_element_basic_type(this);
 9874     int opc = this->ideal_Opcode();
 9875     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9876                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9877   %}
 9878   ins_pipe( pipe_slow );
 9879 %}
 9880 
 9881 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9882   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9883   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9884   ins_encode %{
 9885     int vlen_enc = vector_length_encoding(this);
 9886     BasicType bt = Matcher::vector_element_basic_type(this);
 9887     int opc = this->ideal_Opcode();
 9888     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9889                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9890   %}
 9891   ins_pipe( pipe_slow );
 9892 %}
 9893 
 9894 instruct vabs_masked(vec dst, kReg mask) %{
 9895   match(Set dst (AbsVB dst mask));
 9896   match(Set dst (AbsVS dst mask));
 9897   match(Set dst (AbsVI dst mask));
 9898   match(Set dst (AbsVL dst mask));
 9899   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
 9900   ins_encode %{
 9901     int vlen_enc = vector_length_encoding(this);
 9902     BasicType bt = Matcher::vector_element_basic_type(this);
 9903     int opc = this->ideal_Opcode();
 9904     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9905                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9906   %}
 9907   ins_pipe( pipe_slow );
 9908 %}
 9909 
instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask\t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask\t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

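// Predicated vector comparison producing a mask register. Only lanes
// selected by $mask are compared; all other bits of the result mask are
// zero.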
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Dispatch on the element basic type: evpcmp{b,w,d,q} for integral
    // types (signed or unsigned according to the predicate), and
    // evcmpps/evcmppd for floating point.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

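// MaskAll: broadcast a scalar boolean into every lane of a mask register.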
instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
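// Mask NOT is matched as an XOR with an all-ones mask, i.e.
// XorVMask(src, MaskAll(-1)). For mask lengths below 8 the knot helper uses
// the temporaries so that only the live mask bits are flipped; for lengths
// of 8 and above a single knot of the appropriate width suffices.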
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt\t! using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt\t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

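// VectorLongToMask: convert the low bits of a long into a vector mask. The
// AVX forms below materialize the mask into a byte vector; the EVEX form
// further down moves the value directly into a mask register.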
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

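// With EVEX mask registers available, the conversion is a single kmov from
// the general purpose register into the mask register.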
instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t!" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

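// Logical AND / OR / XOR on mask registers. Without AVX512DQ the byte-wide
// mask instructions (kandb and friends) are unavailable, so mask lengths
// below 16 are widened to 16 and the word-wide forms are used instead.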
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

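// Masked ternary logic (vpternlog). The 8-bit immediate $func is a truth
// table over the three inputs; for example, func == 0x96 computes the
// three-input XOR (dst ^ src2 ^ src3) and func == 0xE8 the majority
// function.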
instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

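// CastVV is a type-system-only node: it generates no code, so all three
// register classes share the same empty, zero-size encoding.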
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

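// IsInfinite via vfpclass: the immediate 0x18 selects the +Inf (bit 3) and
// -Inf (bit 4) classes, so the mask in $ktmp is non-zero iff $src is
// infinite; the result is then moved into a general purpose register.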
instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
