//
// Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

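// As an illustrative reading of these fields (not an additional definition;
// it simply restates the first reg_def that appears below), the line
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// declares a slot that is save-on-call for both the allocator and the C
// calling convention, has ideal register type Op_RegF, uses encoding 0, and
// is backed by the first 32-bit word of xmm0's VMReg.
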
// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM31 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

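// Allocation class for the AVX3 mask (k) registers defined above.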
alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

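// Singleton classes, one per individual mask register.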
reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
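      // Class containing only the low 128 bits of XMM0, for instructions that
      // implicitly use XMM0 as an operand (e.g. variable blends).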
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
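      // Emits the exception and deoptimization handler stubs and reports their sizes.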
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
 1195     // The exception handler starts out as a jump and can be patched to
 1196     // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // Three 5-byte instructions plus one move for the unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
 1210     // The exception handler starts out as a jump and can be patched to
 1211     // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
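      // Map a vector length in bytes to the AVX/EVEX vector length encoding.
      // Scalar and sub-128-bit sizes are encoded as 128-bit operations.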
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
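      // True if the CPU has a direct vector popcount instruction for this element type
      // (AVX512_BITALG for subword types, AVX512_VPOPCNTDQ for int/long).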
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
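      // True if the BoolTest condition code carries the unsigned-compare flag.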
 1249 static inline bool is_unsigned_booltest_pred(int bt) {
 1250   return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
 1251 }
 1252 
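      // True if count-leading-zeros on int/long elements can use the AVX512CD
      // vplzcnt instructions for the given vector length.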
 1253 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1254   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1255            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1256 }
 1257 
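      // Platform-dependent node flags. Flag_intel_jcc_erratum marks branch nodes
      // that may need padding to avoid the Intel JCC erratum.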
 1258 class Node::PD {
 1259 public:
 1260   enum NodeFlags {
 1261     Flag_intel_jcc_erratum = Node::_last_flag << 1,
 1262     _last_flag             = Flag_intel_jcc_erratum
 1263   };
 1264 };
 1265 
 1266 %} // end source_hpp
 1267 
 1268 source %{
 1269 
 1270 #include "opto/addnode.hpp"
 1271 #include "c2_intelJccErratum_x86.hpp"
 1272 
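      // Reserve extra code buffer space for the padding that may be inserted to
      // work around the Intel JCC erratum.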
 1273 void PhaseOutput::pd_perform_mach_node_analysis() {
 1274   if (VM_Version::has_intel_jcc_erratum()) {
 1275     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1276     _buf_sizes._code += extra_padding;
 1277   }
 1278 }
 1279 
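      // Branches affected by the Intel JCC erratum may need extra alignment;
      // all other nodes need none.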
 1280 int MachNode::pd_alignment_required() const {
 1281   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1282     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1283     return IntelJccErratum::largest_jcc_size() + 1;
 1284   } else {
 1285     return 1;
 1286   }
 1287 }
 1288 
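      // Padding (in bytes) to emit before nodes tagged with the JCC erratum flag
      // (see IntelJccErratum::compute_padding).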
 1289 int MachNode::compute_padding(int current_offset) const {
 1290   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1291     Compile* C = Compile::current();
 1292     PhaseOutput* output = C->output();
 1293     Block* block = output->block();
 1294     int index = output->index();
 1295     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1296   } else {
 1297     return 0;
 1298   }
 1299 }
 1300 
 1301 // Emit exception handler code.
 1302 // Stuff framesize into a register and call a VM stub routine.
 1303 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1304 
 1305   // Note that the code buffer's insts_mark is always relative to insts.
 1306   // That's why we must use the macroassembler to generate a handler.
 1307   C2_MacroAssembler _masm(&cbuf);
 1308   address base = __ start_a_stub(size_exception_handler());
 1309   if (base == NULL) {
 1310     ciEnv::current()->record_failure("CodeCache is full");
 1311     return 0;  // CodeBuffer::expand failed
 1312   }
 1313   int offset = __ offset();
 1314   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1315   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1316   __ end_a_stub();
 1317   return offset;
 1318 }
 1319 
 1320 // Emit deopt handler code.
 1321 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1322 
 1323   // Note that the code buffer's insts_mark is always relative to insts.
 1324   // That's why we must use the macroassembler to generate a handler.
 1325   C2_MacroAssembler _masm(&cbuf);
 1326   address base = __ start_a_stub(size_deopt_handler());
 1327   if (base == NULL) {
 1328     ciEnv::current()->record_failure("CodeCache is full");
 1329     return 0;  // CodeBuffer::expand failed
 1330   }
 1331   int offset = __ offset();
 1332 
 1333 #ifdef _LP64
 1334   address the_pc = (address) __ pc();
 1335   Label next;
 1336   // Push "the_pc" on the stack without destroying any registers,
 1337   // as they may all be live.
 1338 
 1339   // push address of "next"
 1340   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1341   __ bind(next);
 1342   // adjust it so it matches "the_pc"
 1343   __ subptr(Address(rsp, 0), __ offset() - offset);
 1344 #else
 1345   InternalAddress here(__ pc());
 1346   __ pushptr(here.addr(), noreg);
 1347 #endif
 1348 
 1349   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1350   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1351   __ end_a_stub();
 1352   return offset;
 1353 }
 1354 
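      // Map a Java basic type to the corresponding assembler operand width (B/W/D/Q).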
 1355 Assembler::Width widthForType(BasicType bt) {
 1356   if (bt == T_BYTE) {
 1357     return Assembler::B;
 1358   } else if (bt == T_SHORT) {
 1359     return Assembler::W;
 1360   } else if (bt == T_INT) {
 1361     return Assembler::D;
 1362   } else {
 1363     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1364     return Assembler::Q;
 1365   }
 1366 }
 1367 
 1368 //=============================================================================
 1369 
 1370   // Float masks come from different places depending on platform.
 1371 #ifdef _LP64
 1372   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1373   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1374   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1375   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1376 #else
 1377   static address float_signmask()  { return (address)float_signmask_pool; }
 1378   static address float_signflip()  { return (address)float_signflip_pool; }
 1379   static address double_signmask() { return (address)double_signmask_pool; }
 1380   static address double_signflip() { return (address)double_signflip_pool; }
 1381 #endif
 1382   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1383   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1384   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1385   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1386   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1387   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1388   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1389   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1390   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1391   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1392   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1393   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1394   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1395   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1396   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1397 
 1398 //=============================================================================
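      // Reject ideal opcodes whose match rules cannot be used on the current CPU
      // (missing ISA extensions, 32-bit-only limitations, disabled flags, etc.).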
 1399 const bool Matcher::match_rule_supported(int opcode) {
 1400   if (!has_match_rule(opcode)) {
 1401     return false; // no match rule present
 1402   }
 1403   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1404   switch (opcode) {
 1405     case Op_AbsVL:
 1406     case Op_StoreVectorScatter:
 1407       if (UseAVX < 3) {
 1408         return false;
 1409       }
 1410       break;
 1411     case Op_PopCountI:
 1412     case Op_PopCountL:
 1413       if (!UsePopCountInstruction) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountVI:
 1418       if (UseAVX < 2) {
 1419         return false;
 1420       }
 1421       break;
 1422     case Op_PopCountVL:
 1423       if (UseAVX < 2) {
 1424         return false;
 1425       }
 1426       break;
 1427     case Op_MulVI:
 1428       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1429         return false;
 1430       }
 1431       break;
 1432     case Op_MulVL:
 1433       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1434         return false;
 1435       }
 1436       break;
 1437     case Op_MulReductionVL:
 1438       if (VM_Version::supports_avx512dq() == false) {
 1439         return false;
 1440       }
 1441       break;
 1442     case Op_AddReductionVL:
 1443       if (UseSSE < 2) { // requires at least SSE2
 1444         return false;
 1445       }
 1446       break;
 1447     case Op_AbsVB:
 1448     case Op_AbsVS:
 1449     case Op_AbsVI:
 1450     case Op_AddReductionVI:
 1451     case Op_AndReductionV:
 1452     case Op_OrReductionV:
 1453     case Op_XorReductionV:
 1454       if (UseSSE < 3) { // requires at least SSSE3
 1455         return false;
 1456       }
 1457       break;
 1458     case Op_VectorLoadShuffle:
 1459     case Op_VectorRearrange:
 1460     case Op_MulReductionVI:
 1461       if (UseSSE < 4) { // requires at least SSE4
 1462         return false;
 1463       }
 1464       break;
 1465     case Op_IsInfiniteF:
 1466     case Op_IsInfiniteD:
 1467       if (!VM_Version::supports_avx512dq()) {
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_SqrtVD:
 1472     case Op_SqrtVF:
 1473     case Op_VectorMaskCmp:
 1474     case Op_VectorCastB2X:
 1475     case Op_VectorCastS2X:
 1476     case Op_VectorCastI2X:
 1477     case Op_VectorCastL2X:
 1478     case Op_VectorCastF2X:
 1479     case Op_VectorCastD2X:
 1480     case Op_VectorUCastB2X:
 1481     case Op_VectorUCastS2X:
 1482     case Op_VectorUCastI2X:
 1483     case Op_VectorMaskCast:
 1484       if (UseAVX < 1) { // enabled for AVX only
 1485         return false;
 1486       }
 1487       break;
 1488     case Op_PopulateIndex:
 1489       if (!is_LP64 || (UseAVX < 2)) {
 1490         return false;
 1491       }
 1492       break;
 1493     case Op_RoundVF:
 1494       if (UseAVX < 2) { // enabled for AVX2 only
 1495         return false;
 1496       }
 1497       break;
 1498     case Op_RoundVD:
 1499       if (UseAVX < 3) {
 1500         return false;  // enabled for AVX3 only
 1501       }
 1502       break;
 1503     case Op_CompareAndSwapL:
 1504 #ifdef _LP64
 1505     case Op_CompareAndSwapP:
 1506 #endif
 1507       if (!VM_Version::supports_cx8()) {
 1508         return false;
 1509       }
 1510       break;
 1511     case Op_CMoveVF:
 1512     case Op_CMoveVD:
 1513       if (UseAVX < 1) { // enabled for AVX only
 1514         return false;
 1515       }
 1516       break;
 1517     case Op_StrIndexOf:
 1518       if (!UseSSE42Intrinsics) {
 1519         return false;
 1520       }
 1521       break;
 1522     case Op_StrIndexOfChar:
 1523       if (!UseSSE42Intrinsics) {
 1524         return false;
 1525       }
 1526       break;
 1527     case Op_OnSpinWait:
 1528       if (VM_Version::supports_on_spin_wait() == false) {
 1529         return false;
 1530       }
 1531       break;
 1532     case Op_MulVB:
 1533     case Op_LShiftVB:
 1534     case Op_RShiftVB:
 1535     case Op_URShiftVB:
 1536     case Op_VectorInsert:
 1537     case Op_VectorLoadMask:
 1538     case Op_VectorStoreMask:
 1539     case Op_VectorBlend:
 1540       if (UseSSE < 4) {
 1541         return false;
 1542       }
 1543       break;
 1544 #ifdef _LP64
 1545     case Op_MaxD:
 1546     case Op_MaxF:
 1547     case Op_MinD:
 1548     case Op_MinF:
 1549       if (UseAVX < 1) { // enabled for AVX only
 1550         return false;
 1551       }
 1552       break;
 1553 #endif
 1554     case Op_CacheWB:
 1555     case Op_CacheWBPreSync:
 1556     case Op_CacheWBPostSync:
 1557       if (!VM_Version::supports_data_cache_line_flush()) {
 1558         return false;
 1559       }
 1560       break;
 1561     case Op_ExtractB:
 1562     case Op_ExtractL:
 1563     case Op_ExtractI:
 1564     case Op_RoundDoubleMode:
 1565       if (UseSSE < 4) {
 1566         return false;
 1567       }
 1568       break;
 1569     case Op_RoundDoubleModeV:
 1570       if (VM_Version::supports_avx() == false) {
 1571         return false; // 128bit vroundpd is not available
 1572       }
 1573       break;
 1574     case Op_LoadVectorGather:
 1575       if (UseAVX < 2) {
 1576         return false;
 1577       }
 1578       break;
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
 1593       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
 1613       if (UseAVX < 3 || !is_LP64)  {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_CompressV:
 1664     case Op_ExpandV:
 1665       if (!VM_Version::supports_avx512vl()) {
 1666         return false;
 1667       }
 1668       break;
 1669     case Op_SqrtF:
 1670       if (UseSSE < 1) {
 1671         return false;
 1672       }
 1673       break;
 1674     case Op_SqrtD:
 1675 #ifdef _LP64
 1676       if (UseSSE < 2) {
 1677         return false;
 1678       }
 1679 #else
 1680       // x86_32.ad has a special match rule for SqrtD.
 1681       // Together with common x86 rules, this handles all UseSSE cases.
 1682 #endif
 1683       break;
 1684     case Op_ConvF2HF:
 1685     case Op_ConvHF2F:
 1686       if (!VM_Version::supports_float16()) {
 1687         return false;
 1688       }
 1689       break;
 1690     case Op_VectorCastF2HF:
 1691     case Op_VectorCastHF2F:
 1692       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1693         return false;
 1694       }
 1695       break;
 1696   }
 1697   return true;  // Match rules are supported by default.
 1698 }
 1699 
 1700 //------------------------------------------------------------------------
 1701 
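      // True if a direct vector popcount instruction is available for this element type.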
 1702 static inline bool is_pop_count_instr_target(BasicType bt) {
 1703   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1704          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1705 }
 1706 
 1707 const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1708   return match_rule_supported_vector(opcode, vlen, bt);
 1709 }
 1710 
 1711 // Identify extra cases in which we might want to provide match rules for vector nodes
 1712 // and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1713 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1714   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1715   if (!match_rule_supported(opcode)) {
 1716     return false;
 1717   }
 1718   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1719   //   * SSE2 supports 128bit vectors for all types;
 1720   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1721   //   * AVX2 supports 256bit vectors for all types;
 1722   //   * AVX512F supports 512bit vectors only for INT, LONG, FLOAT, and DOUBLE types;
 1723   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1724   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1725   // And MaxVectorSize is taken into account as well.
 1726   if (!vector_size_supported(bt, vlen)) {
 1727     return false;
 1728   }
 1729   // Special cases which require vector length follow:
 1730   //   * implementation limitations
 1731   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1732   //   * 128bit vroundpd instruction is present only in AVX1
 1733   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1734   switch (opcode) {
 1735     case Op_AbsVF:
 1736     case Op_NegVF:
 1737       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1738         return false; // 512bit vandps and vxorps are not available
 1739       }
 1740       break;
 1741     case Op_AbsVD:
 1742     case Op_NegVD:
 1743       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1744         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1745       }
 1746       break;
 1747     case Op_CMoveVF:
 1748       if (vlen != 8) {
 1749         return false; // implementation limitation (only vcmov8F_reg is present)
 1750       }
 1751       break;
 1752     case Op_RotateRightV:
 1753     case Op_RotateLeftV:
 1754       if (bt != T_INT && bt != T_LONG) {
 1755         return false;
 1756       } // fallthrough
 1757     case Op_MacroLogicV:
 1758       if (!VM_Version::supports_evex() ||
 1759           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1760         return false;
 1761       }
 1762       break;
 1763     case Op_ClearArray:
 1764     case Op_VectorMaskGen:
 1765     case Op_VectorCmpMasked:
 1766       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1767         return false;
 1768       }
 1769       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1770         return false;
 1771       }
 1772       break;
 1773     case Op_LoadVectorMasked:
 1774     case Op_StoreVectorMasked:
 1775       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_CMoveVD:
 1780       if (vlen != 4) {
 1781         return false; // implementation limitation (only vcmov4D_reg is present)
 1782       }
 1783       break;
 1784     case Op_MaxV:
 1785     case Op_MinV:
 1786       if (UseSSE < 4 && is_integral_type(bt)) {
 1787         return false;
 1788       }
 1789       if (bt == T_FLOAT || bt == T_DOUBLE) {
 1790         // Float/Double intrinsics are enabled for the AVX family currently.
 1791         if (UseAVX == 0) {
 1792           return false;
 1793         }
 1794         if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512-bit Float/Double intrinsics need AVX512DQ
 1795           return false;
 1796         }
 1797       }
 1798       break;
 1799     case Op_CallLeafVector:
 1800       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1801         return false;
 1802       }
 1803       break;
 1804     case Op_AddReductionVI:
 1805       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1806         return false;
 1807       }
 1808       // fallthrough
 1809     case Op_AndReductionV:
 1810     case Op_OrReductionV:
 1811     case Op_XorReductionV:
 1812       if (is_subword_type(bt) && (UseSSE < 4)) {
 1813         return false;
 1814       }
 1815 #ifndef _LP64
 1816       if (bt == T_BYTE || bt == T_LONG) {
 1817         return false;
 1818       }
 1819 #endif
 1820       break;
 1821 #ifndef _LP64
 1822     case Op_VectorInsert:
 1823       if (bt == T_LONG || bt == T_DOUBLE) {
 1824         return false;
 1825       }
 1826       break;
 1827 #endif
 1828     case Op_MinReductionV:
 1829     case Op_MaxReductionV:
 1830       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1831         return false;
 1832       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1833         return false;
 1834       }
 1835       // Float/Double intrinsics enabled for AVX family.
 1836       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1837         return false;
 1838       }
 1839       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1840         return false;
 1841       }
 1842 #ifndef _LP64
 1843       if (bt == T_BYTE || bt == T_LONG) {
 1844         return false;
 1845       }
 1846 #endif
 1847       break;
 1848     case Op_VectorTest:
 1849       if (UseSSE < 4) {
 1850         return false; // Implementation limitation
 1851       } else if (size_in_bits < 32) {
 1852         return false; // Implementation limitation
 1853       }
 1854       break;
 1855     case Op_VectorLoadShuffle:
 1856     case Op_VectorRearrange:
 1857       if (vlen == 2) {
 1858         return false; // Implementation limitation due to how shuffle is loaded
 1859       } else if (size_in_bits == 256 && UseAVX < 2) {
 1860         return false; // Implementation limitation
 1861       }
 1862       break;
 1863     case Op_VectorLoadMask:
 1864     case Op_VectorMaskCast:
 1865       if (size_in_bits == 256 && UseAVX < 2) {
 1866         return false; // Implementation limitation
 1867       }
 1868       // fallthrough
 1869     case Op_VectorStoreMask:
 1870       if (vlen == 2) {
 1871         return false; // Implementation limitation
 1872       }
 1873       break;
 1874     case Op_PopulateIndex:
 1875       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1876         return false;
 1877       }
 1878       break;
 1879     case Op_VectorCastB2X:
 1880     case Op_VectorCastS2X:
 1881     case Op_VectorCastI2X:
 1882       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1883         return false;
 1884       }
 1885       break;
 1886     case Op_VectorCastL2X:
 1887       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1888         return false;
 1889       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_VectorCastF2X: {
 1894         // As per JLS section 5.1.3 narrowing conversion to sub-word types
 1895         // happen after intermediate conversion to integer and special handling
 1896         // code needs AVX2 vpcmpeqd instruction for 256 bit vectors.
 1897         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1898         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1899           return false;
 1900         }
 1901       }
 1902       // fallthrough
 1903     case Op_VectorCastD2X:
 1904       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1905         return false;
 1906       }
 1907       break;
 1908     case Op_VectorCastF2HF:
 1909     case Op_VectorCastHF2F:
 1910       if (!VM_Version::supports_f16c() &&
 1911          ((!VM_Version::supports_evex() ||
 1912          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1913         return false;
 1914       }
 1915       break;
 1916     case Op_RoundVD:
 1917       if (!VM_Version::supports_avx512dq()) {
 1918         return false;
 1919       }
 1920       break;
 1921     case Op_MulReductionVI:
 1922       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1923         return false;
 1924       }
 1925       break;
 1926     case Op_LoadVectorGatherMasked:
 1927     case Op_StoreVectorScatterMasked:
 1928     case Op_StoreVectorScatter:
 1929       if (is_subword_type(bt)) {
 1930         return false;
 1931       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1932         return false;
 1933       }
 1934       // fallthrough
 1935     case Op_LoadVectorGather:
 1936       if (size_in_bits == 64) {
 1937         return false;
 1938       }
 1939       break;
 1940     case Op_MaskAll:
 1941       if (!VM_Version::supports_evex()) {
 1942         return false;
 1943       }
 1944       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1945         return false;
 1946       }
 1947       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1948         return false;
 1949       }
 1950       break;
 1951     case Op_VectorMaskCmp:
 1952       if (vlen < 2 || size_in_bits < 32) {
 1953         return false;
 1954       }
 1955       break;
 1956     case Op_CompressM:
 1957       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1958         return false;
 1959       }
 1960       break;
 1961     case Op_CompressV:
 1962     case Op_ExpandV:
 1963       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1964         return false;
 1965       }
 1966       if (size_in_bits < 128) {
 1967         return false;
 1968       }
 1969       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_VectorLongToMask:
 1974       if (UseAVX < 1 || !is_LP64) {
 1975         return false;
 1976       }
 1977       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1978         return false;
 1979       }
 1980       break;
 1981     case Op_SignumVD:
 1982     case Op_SignumVF:
 1983       if (UseAVX < 1) {
 1984         return false;
 1985       }
 1986       break;
 1987     case Op_PopCountVI:
 1988     case Op_PopCountVL: {
 1989         if (!is_pop_count_instr_target(bt) &&
 1990             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1991           return false;
 1992         }
 1993       }
 1994       break;
 1995     case Op_ReverseV:
 1996     case Op_ReverseBytesV:
 1997       if (UseAVX < 2) {
 1998         return false;
 1999       }
 2000       break;
 2001     case Op_CountTrailingZerosV:
 2002     case Op_CountLeadingZerosV:
 2003       if (UseAVX < 2) {
 2004         return false;
 2005       }
 2006       break;
 2007   }
 2008   return true;  // Match rules are supported by default.
 2009 }
 2010 
 2011 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 2012   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
 2013   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
 2014   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
 2015   // This routine therefore checks strictly for the existence of masked operation patterns:
 2016   // it returns false by default for all opcodes other than the ones whose masked
 2017   // instruction patterns are defined in this file.
 2018   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2019     return false;
 2020   }
 2021 
 2022   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2023   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2024   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2025     return false;
 2026   }
 2027   switch(opcode) {
 2028     // Unary masked operations
 2029     case Op_AbsVB:
 2030     case Op_AbsVS:
 2031       if (!VM_Version::supports_avx512bw()) {
 2032         return false;  // Implementation limitation
 2033       }
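            // fallthrough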
 2034     case Op_AbsVI:
 2035     case Op_AbsVL:
 2036       return true;
 2037 
 2038     // Ternary masked operations
 2039     case Op_FmaVF:
 2040     case Op_FmaVD:
 2041       return true;
 2042 
 2043     case Op_MacroLogicV:
 2044       if (bt != T_INT && bt != T_LONG) {
 2045         return false;
 2046       }
 2047       return true;
 2048 
 2049     // Binary masked operations
 2050     case Op_AddVB:
 2051     case Op_AddVS:
 2052     case Op_SubVB:
 2053     case Op_SubVS:
 2054     case Op_MulVS:
 2055     case Op_LShiftVS:
 2056     case Op_RShiftVS:
 2057     case Op_URShiftVS:
 2058       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2059       if (!VM_Version::supports_avx512bw()) {
 2060         return false;  // Implementation limitation
 2061       }
 2062       return true;
 2063 
 2064     case Op_MulVL:
 2065       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2066       if (!VM_Version::supports_avx512dq()) {
 2067         return false;  // Implementation limitation
 2068       }
 2069       return true;
 2070 
 2071     case Op_AndV:
 2072     case Op_OrV:
 2073     case Op_XorV:
 2074     case Op_RotateRightV:
 2075     case Op_RotateLeftV:
 2076       if (bt != T_INT && bt != T_LONG) {
 2077         return false; // Implementation limitation
 2078       }
 2079       return true;
 2080 
 2081     case Op_VectorLoadMask:
 2082       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2083       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2084         return false;
 2085       }
 2086       return true;
 2087 
 2088     case Op_AddVI:
 2089     case Op_AddVL:
 2090     case Op_AddVF:
 2091     case Op_AddVD:
 2092     case Op_SubVI:
 2093     case Op_SubVL:
 2094     case Op_SubVF:
 2095     case Op_SubVD:
 2096     case Op_MulVI:
 2097     case Op_MulVF:
 2098     case Op_MulVD:
 2099     case Op_DivVF:
 2100     case Op_DivVD:
 2101     case Op_SqrtVF:
 2102     case Op_SqrtVD:
 2103     case Op_LShiftVI:
 2104     case Op_LShiftVL:
 2105     case Op_RShiftVI:
 2106     case Op_RShiftVL:
 2107     case Op_URShiftVI:
 2108     case Op_URShiftVL:
 2109     case Op_LoadVectorMasked:
 2110     case Op_StoreVectorMasked:
 2111     case Op_LoadVectorGatherMasked:
 2112     case Op_StoreVectorScatterMasked:
 2113       return true;
 2114 
 2115     case Op_MaxV:
 2116     case Op_MinV:
 2117       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2118         return false; // Implementation limitation
 2119       }
 2120       if (is_floating_point_type(bt)) {
 2121         return false; // Implementation limitation
 2122       }
 2123       return true;
 2124 
 2125     case Op_VectorMaskCmp:
 2126       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2127         return false; // Implementation limitation
 2128       }
 2129       return true;
 2130 
 2131     case Op_VectorRearrange:
 2132       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2133         return false; // Implementation limitation
 2134       }
 2135       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2136         return false; // Implementation limitation
 2137       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2138         return false; // Implementation limitation
 2139       }
 2140       return true;
 2141 
 2142     // Binary Logical operations
 2143     case Op_AndVMask:
 2144     case Op_OrVMask:
 2145     case Op_XorVMask:
 2146       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2147         return false; // Implementation limitation
 2148       }
 2149       return true;
 2150 
 2151     case Op_PopCountVI:
 2152     case Op_PopCountVL:
 2153       if (!is_pop_count_instr_target(bt)) {
 2154         return false;
 2155       }
 2156       return true;
 2157 
 2158     case Op_MaskAll:
 2159       return true;
 2160 
 2161     case Op_CountLeadingZerosV:
 2162       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2163         return true;
 2164       }
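            // fallthrough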
 2165     default:
 2166       return false;
 2167   }
 2168 }
 2169 
 2170 const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2171   return false;
 2172 }
 2173 
 2174 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2175 const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
 2176   switch (elem_bt) {
 2177     case T_BYTE:  return false;
 2178     case T_SHORT: return !VM_Version::supports_avx512bw();
 2179     case T_INT:   return !VM_Version::supports_avx();
 2180     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2181     default:
 2182       ShouldNotReachHere();
 2183       return false;
 2184   }
 2185 }
 2186 
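      // Specialize a generic vector operand (VEC/LEGVEC) to the concrete operand class
      // that matches its ideal register size.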
 2187 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2188   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2189   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2190   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2191       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2192     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2193     return new legVecZOper();
 2194   }
 2195   if (legacy) {
 2196     switch (ideal_reg) {
 2197       case Op_VecS: return new legVecSOper();
 2198       case Op_VecD: return new legVecDOper();
 2199       case Op_VecX: return new legVecXOper();
 2200       case Op_VecY: return new legVecYOper();
 2201       case Op_VecZ: return new legVecZOper();
 2202     }
 2203   } else {
 2204     switch (ideal_reg) {
 2205       case Op_VecS: return new vecSOper();
 2206       case Op_VecD: return new vecDOper();
 2207       case Op_VecX: return new vecXOper();
 2208       case Op_VecY: return new vecYOper();
 2209       case Op_VecZ: return new vecZOper();
 2210     }
 2211   }
 2212   ShouldNotReachHere();
 2213   return NULL;
 2214 }
 2215 
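      // True for mach nodes that are plain register-to-register moves between
      // vector/float register classes.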
 2216 bool Matcher::is_reg2reg_move(MachNode* m) {
 2217   switch (m->rule()) {
 2218     case MoveVec2Leg_rule:
 2219     case MoveLeg2Vec_rule:
 2220     case MoveF2VL_rule:
 2221     case MoveF2LEG_rule:
 2222     case MoveVL2F_rule:
 2223     case MoveLEG2F_rule:
 2224     case MoveD2VL_rule:
 2225     case MoveD2LEG_rule:
 2226     case MoveVL2D_rule:
 2227     case MoveLEG2D_rule:
 2228       return true;
 2229     default:
 2230       return false;
 2231   }
 2232 }
 2233 
 2234 bool Matcher::is_generic_vector(MachOper* opnd) {
 2235   switch (opnd->opcode()) {
 2236     case VEC:
 2237     case LEGVEC:
 2238       return true;
 2239     default:
 2240       return false;
 2241   }
 2242 }
 2243 
 2244 //------------------------------------------------------------------------
 2245 
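      // Predicate (vector mask) registers are modeled by the VECTMASK register class
      // (AVX-512 opmask registers).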
 2246 const RegMask* Matcher::predicate_reg_mask(void) {
 2247   return &_VECTMASK_REG_mask;
 2248 }
 2249 
 2250 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2251   return new TypeVectMask(elemTy, length);
 2252 }
 2253 
 2254 // Max vector size in bytes. 0 if not supported.
 2255 const int Matcher::vector_width_in_bytes(BasicType bt) {
 2256   assert(is_java_primitive(bt), "only primitive type vectors");
 2257   if (UseSSE < 2) return 0;
 2258   // SSE2 supports 128bit vectors for all types.
 2259   // AVX2 supports 256bit vectors for all types.
 2260   // AVX-512 (EVEX) supports 512bit vectors for all types (subword types additionally need AVX512BW).
 2261   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2262   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2263   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2264     size = (UseAVX > 2) ? 64 : 32;
 2265   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2266     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2267   // Use flag to limit vector size.
 2268   size = MIN2(size,(int)MaxVectorSize);
 2269   // Minimum 2 values in vector (or 4 for bytes).
 2270   switch (bt) {
 2271   case T_DOUBLE:
 2272   case T_LONG:
 2273     if (size < 16) return 0;
 2274     break;
 2275   case T_FLOAT:
 2276   case T_INT:
 2277     if (size < 8) return 0;
 2278     break;
 2279   case T_BOOLEAN:
 2280     if (size < 4) return 0;
 2281     break;
 2282   case T_CHAR:
 2283     if (size < 4) return 0;
 2284     break;
 2285   case T_BYTE:
 2286     if (size < 4) return 0;
 2287     break;
 2288   case T_SHORT:
 2289     if (size < 4) return 0;
 2290     break;
 2291   default:
 2292     ShouldNotReachHere();
 2293   }
 2294   return size;
 2295 }
 2296 
 2297 // Limits on vector size (number of elements) loaded into vector.
 2298 const int Matcher::max_vector_size(const BasicType bt) {
 2299   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2300 }
 2301 const int Matcher::min_vector_size(const BasicType bt) {
 2302   int max_size = max_vector_size(bt);
 2303   // Min size which can be loaded into a vector is 4 bytes (4 byte-elements or 2 wider elements).
 2304   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2305   // Allow single-element double vectors to support calls into SVML (Double64 species)
 2306   if (bt == T_DOUBLE) {
 2307     size = 1;
 2308   }
 2309   return MIN2(size,max_size);
 2310 }
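      // For example (illustrative): min_vector_size(T_BYTE) == 4, min_vector_size(T_INT) == 2 and
      // min_vector_size(T_DOUBLE) == 1, each capped by max_vector_size(bt).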
 2311 
 2312 const int Matcher::superword_max_vector_size(const BasicType bt) {
 2313   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2314   // by default on Cascade Lake
 2315   if (VM_Version::is_default_intel_cascade_lake()) {
 2316     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2317   }
 2318   return Matcher::max_vector_size(bt);
 2319 }
 2320 
 2321 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2322   return -1;
 2323 }
 2324 
 2325 // Vector ideal reg corresponding to specified size in bytes
 2326 const uint Matcher::vector_ideal_reg(int size) {
 2327   assert(MaxVectorSize >= size, "");
 2328   switch(size) {
 2329     case  4: return Op_VecS;
 2330     case  8: return Op_VecD;
 2331     case 16: return Op_VecX;
 2332     case 32: return Op_VecY;
 2333     case 64: return Op_VecZ;
 2334   }
 2335   ShouldNotReachHere();
 2336   return 0;
 2337 }
 2338 
 2339 // Check for shift by small constant as well
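      // For example (illustrative): cloning lets an address expression such as
      //   (AddP base (LShiftL (ConvI2L index) 3))
      // be folded into a [base + index*8 + disp] addressing mode at each memory use instead of
      // materializing the scaled index in a register.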
 2340 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2341   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2342       shift->in(2)->get_int() <= 3 &&
 2343       // Are there other uses besides address expressions?
 2344       !matcher->is_visited(shift)) {
 2345     address_visited.set(shift->_idx); // Flag as address_visited
 2346     mstack.push(shift->in(2), Matcher::Visit);
 2347     Node *conv = shift->in(1);
 2348 #ifdef _LP64
 2349     // Allow the Matcher to match the rule that bypasses the
 2350     // ConvI2L operation for an array index on LP64
 2351     // if the index value is positive.
 2352     if (conv->Opcode() == Op_ConvI2L &&
 2353         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2354         // Are there other uses besides address expressions?
 2355         !matcher->is_visited(conv)) {
 2356       address_visited.set(conv->_idx); // Flag as address_visited
 2357       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2358     } else
 2359 #endif
 2360       mstack.push(conv, Matcher::Pre_Visit);
 2361     return true;
 2362   }
 2363   return false;
 2364 }
 2365 
 2366 // This function identifies sub-graphs in which a 'load' node is
 2367 // an input to two different nodes and the pattern can be matched
 2368 // with BMI instructions like blsi, blsr, etc.
 2369 // Example: b = -a[i] & a[i] can be matched to 'blsi r32, m32'.
 2370 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2371 // refers to the same node.
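      // Together with is_bmi_pattern() below, the shapes recognized are (x denotes the shared load):
      //   blsi:   x & -x       i.e. (AndI/L (SubI/L 0 load) load)
      //   blsr:   x & (x - 1)  i.e. (AndI/L (AddI/L load -1) load)
      //   blsmsk: x ^ (x - 1)  i.e. (XorI/L (AddI/L load -1) load)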
 2372 //
 2373 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2374 // This is a temporary solution until we make DAGs expressible in ADL.
 2375 template<typename ConType>
 2376 class FusedPatternMatcher {
 2377   Node* _op1_node;
 2378   Node* _mop_node;
 2379   int _con_op;
 2380 
 2381   static int match_next(Node* n, int next_op, int next_op_idx) {
 2382     if (n->in(1) == NULL || n->in(2) == NULL) {
 2383       return -1;
 2384     }
 2385 
 2386     if (next_op_idx == -1) { // n is commutative, try rotations
 2387       if (n->in(1)->Opcode() == next_op) {
 2388         return 1;
 2389       } else if (n->in(2)->Opcode() == next_op) {
 2390         return 2;
 2391       }
 2392     } else {
 2393       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2394       if (n->in(next_op_idx)->Opcode() == next_op) {
 2395         return next_op_idx;
 2396       }
 2397     }
 2398     return -1;
 2399   }
 2400 
 2401  public:
 2402   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2403     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2404 
 2405   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2406              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2407              typename ConType::NativeType con_value) {
 2408     if (_op1_node->Opcode() != op1) {
 2409       return false;
 2410     }
 2411     if (_mop_node->outcnt() > 2) {
 2412       return false;
 2413     }
 2414     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2415     if (op1_op2_idx == -1) {
 2416       return false;
 2417     }
 2418     // Memory operation must be the other edge
 2419     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2420 
 2421     // Check that the mop node is really what we want
 2422     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2423       Node* op2_node = _op1_node->in(op1_op2_idx);
 2424       if (op2_node->outcnt() > 1) {
 2425         return false;
 2426       }
 2427       assert(op2_node->Opcode() == op2, "Should be");
 2428       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2429       if (op2_con_idx == -1) {
 2430         return false;
 2431       }
 2432       // Memory operation must be the other edge
 2433       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2434       // Check that the memory operation is the same node
 2435       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2436         // Now check the constant
 2437         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2438         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2439           return true;
 2440         }
 2441       }
 2442     }
 2443     return false;
 2444   }
 2445 };
 2446 
 2447 static bool is_bmi_pattern(Node* n, Node* m) {
 2448   assert(UseBMI1Instructions, "sanity");
 2449   if (n != NULL && m != NULL) {
 2450     if (m->Opcode() == Op_LoadI) {
 2451       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2452       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2453              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2454              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2455     } else if (m->Opcode() == Op_LoadL) {
 2456       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2457       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2458              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2459              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2460     }
 2461   }
 2462   return false;
 2463 }
 2464 
 2465 // Should the matcher clone input 'm' of node 'n'?
 2466 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2467   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2468   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2469     mstack.push(m, Visit);
 2470     return true;
 2471   }
 2472   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2473     mstack.push(m, Visit);           // m = ShiftCntV
 2474     return true;
 2475   }
 2476   return false;
 2477 }
 2478 
 2479 // Should the Matcher clone shifts on addressing modes, expecting them
 2480 // to be subsumed into complex addressing expressions, or compute them
 2481 // into registers?
 2482 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2483   Node *off = m->in(AddPNode::Offset);
 2484   if (off->is_Con()) {
 2485     address_visited.test_set(m->_idx); // Flag as address_visited
 2486     Node *adr = m->in(AddPNode::Address);
 2487 
 2488     // Intel can handle 2 adds in addressing mode
 2489     // AtomicAdd is not an addressing expression.
 2490     // Cheap to find it by looking for screwy base.
 2491     if (adr->is_AddP() &&
 2492         !adr->in(AddPNode::Base)->is_top() &&
 2493         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2494         // Are there other uses besides address expressions?
 2495         !is_visited(adr)) {
 2496       address_visited.set(adr->_idx); // Flag as address_visited
 2497       Node *shift = adr->in(AddPNode::Offset);
 2498       if (!clone_shift(shift, this, mstack, address_visited)) {
 2499         mstack.push(shift, Pre_Visit);
 2500       }
 2501       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2502       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2503     } else {
 2504       mstack.push(adr, Pre_Visit);
 2505     }
 2506 
 2507     // Clone X+offset as it also folds into most addressing expressions
 2508     mstack.push(off, Visit);
 2509     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2510     return true;
 2511   } else if (clone_shift(off, this, mstack, address_visited)) {
 2512     address_visited.test_set(m->_idx); // Flag as address_visited
 2513     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2514     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2515     return true;
 2516   }
 2517   return false;
 2518 }
 2519 
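      // Maps a BoolTest condition to the Assembler::ComparisonPredicate immediate used by the
      // AVX-512 integer compares (evpcmp*). Signed and unsigned variants map to the same predicate
      // because signedness is selected by the instruction form (e.g. vpcmpd vs. vpcmpud), not by
      // the predicate encoding.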
 2520 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2521   switch (bt) {
 2522     case BoolTest::eq:
 2523       return Assembler::eq;
 2524     case BoolTest::ne:
 2525       return Assembler::neq;
 2526     case BoolTest::le:
 2527     case BoolTest::ule:
 2528       return Assembler::le;
 2529     case BoolTest::ge:
 2530     case BoolTest::uge:
 2531       return Assembler::nlt;
 2532     case BoolTest::lt:
 2533     case BoolTest::ult:
 2534       return Assembler::lt;
 2535     case BoolTest::gt:
 2536     case BoolTest::ugt:
 2537       return Assembler::nle;
 2538     default : ShouldNotReachHere(); return Assembler::_false;
 2539   }
 2540 }
 2541 
 2542 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2543   switch (bt) {
 2544   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2545   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2546   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2547   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2548   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2549   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2550   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2551   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2552   }
 2553 }
 2554 
 2555 // Helper methods for MachSpillCopyNode::implementation().
 2556 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2557                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2558   assert(ireg == Op_VecS || // 32bit vector
 2559          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2560          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 2561          "no non-adjacent vector moves" );
 2562   if (cbuf) {
 2563     C2_MacroAssembler _masm(cbuf);
 2564     switch (ireg) {
 2565     case Op_VecS: // copy whole register
 2566     case Op_VecD:
 2567     case Op_VecX:
 2568 #ifndef _LP64
 2569       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2570 #else
 2571       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2572         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2573       } else {
 2574         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2575       }
 2576 #endif
 2577       break;
 2578     case Op_VecY:
 2579 #ifndef _LP64
 2580       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2581 #else
 2582       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2583         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2584       } else {
 2585         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2586       }
 2587 #endif
 2588       break;
 2589     case Op_VecZ:
 2590       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2591       break;
 2592     default:
 2593       ShouldNotReachHere();
 2594     }
 2595 #ifndef PRODUCT
 2596   } else {
 2597     switch (ireg) {
 2598     case Op_VecS:
 2599     case Op_VecD:
 2600     case Op_VecX:
 2601       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2602       break;
 2603     case Op_VecY:
 2604     case Op_VecZ:
 2605       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2606       break;
 2607     default:
 2608       ShouldNotReachHere();
 2609     }
 2610 #endif
 2611   }
 2612 }
 2613 
 2614 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2615                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2616   if (cbuf) {
 2617     C2_MacroAssembler _masm(cbuf);
 2618     if (is_load) {
 2619       switch (ireg) {
 2620       case Op_VecS:
 2621         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2622         break;
 2623       case Op_VecD:
 2624         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2625         break;
 2626       case Op_VecX:
 2627 #ifndef _LP64
 2628         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2629 #else
 2630         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2631           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2632         } else {
 2633           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2634           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2635         }
 2636 #endif
 2637         break;
 2638       case Op_VecY:
 2639 #ifndef _LP64
 2640         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2641 #else
 2642         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2643           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2644         } else {
 2645           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2646           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2647         }
 2648 #endif
 2649         break;
 2650       case Op_VecZ:
 2651         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2652         break;
 2653       default:
 2654         ShouldNotReachHere();
 2655       }
 2656     } else { // store
 2657       switch (ireg) {
 2658       case Op_VecS:
 2659         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2660         break;
 2661       case Op_VecD:
 2662         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2663         break;
 2664       case Op_VecX:
 2665 #ifndef _LP64
 2666         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2667 #else
 2668         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2669           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2670         }
 2671         else {
 2672           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2673         }
 2674 #endif
 2675         break;
 2676       case Op_VecY:
 2677 #ifndef _LP64
 2678         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2679 #else
 2680         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2681           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2682         }
 2683         else {
 2684           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2685         }
 2686 #endif
 2687         break;
 2688       case Op_VecZ:
 2689         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2690         break;
 2691       default:
 2692         ShouldNotReachHere();
 2693       }
 2694     }
 2695 #ifndef PRODUCT
 2696   } else {
 2697     if (is_load) {
 2698       switch (ireg) {
 2699       case Op_VecS:
 2700         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2701         break;
 2702       case Op_VecD:
 2703         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2704         break;
 2705        case Op_VecX:
 2706         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2707         break;
 2708       case Op_VecY:
 2709       case Op_VecZ:
 2710         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2711         break;
 2712       default:
 2713         ShouldNotReachHere();
 2714       }
 2715     } else { // store
 2716       switch (ireg) {
 2717       case Op_VecS:
 2718         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2719         break;
 2720       case Op_VecD:
 2721         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2722         break;
 2723        case Op_VecX:
 2724         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2725         break;
 2726       case Op_VecY:
 2727       case Op_VecZ:
 2728         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2729         break;
 2730       default:
 2731         ShouldNotReachHere();
 2732       }
 2733     }
 2734 #endif
 2735   }
 2736 }
 2737 
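      // Builds a GrowableArray holding 'len' copies of the scalar constant 'con', stored in the
      // jvalue field corresponding to 'bt'; used when materializing replicated vector constants.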
 2738 template <class T>
 2739 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2740   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2741   jvalue ele;
 2742   switch (bt) {
 2743     case T_BYTE:   ele.b = con; break;
 2744     case T_SHORT:  ele.s = con; break;
 2745     case T_INT:    ele.i = con; break;
 2746     case T_LONG:   ele.j = con; break;
 2747     case T_FLOAT:  ele.f = con; break;
 2748     case T_DOUBLE: ele.d = con; break;
 2749     default: ShouldNotReachHere();
 2750   }
 2751   for (int i = 0; i < len; i++) {
 2752     val->append(ele);
 2753   }
 2754   return val;
 2755 }
 2756 
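      // Returns a 64-bit pattern with the sign (most significant) bit of every 'bt'-sized lane set,
      // e.g. 0x8000800080008000 for T_SHORT.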
 2757 static inline jlong high_bit_set(BasicType bt) {
 2758   switch (bt) {
 2759     case T_BYTE:  return 0x8080808080808080;
 2760     case T_SHORT: return 0x8000800080008000;
 2761     case T_INT:   return 0x8000000080000000;
 2762     case T_LONG:  return 0x8000000000000000;
 2763     default:
 2764       ShouldNotReachHere();
 2765       return 0;
 2766   }
 2767 }
 2768 
 2769 #ifndef PRODUCT
 2770   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2771     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2772   }
 2773 #endif
 2774 
 2775   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2776     C2_MacroAssembler _masm(&cbuf);
 2777     __ nop(_count);
 2778   }
 2779 
 2780   uint MachNopNode::size(PhaseRegAlloc*) const {
 2781     return _count;
 2782   }
 2783 
 2784 #ifndef PRODUCT
 2785   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2786     st->print("# breakpoint");
 2787   }
 2788 #endif
 2789 
 2790   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2791     C2_MacroAssembler _masm(&cbuf);
 2792     __ int3();
 2793   }
 2794 
 2795   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2796     return MachNode::size(ra_);
 2797   }
 2798 
 2799 %}
 2800 
 2801 encode %{
 2802 
 2803   enc_class call_epilog %{
 2804     C2_MacroAssembler _masm(&cbuf);
 2805     if (VerifyStackAtCalls) {
 2806       // Check that stack depth is unchanged: find majik cookie on stack
 2807       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2808       Label L;
 2809       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2810       __ jccb(Assembler::equal, L);
 2811       // Die if stack mismatch
 2812       __ int3();
 2813       __ bind(L);
 2814     }
 2815     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2816       C2_MacroAssembler _masm(&cbuf);
 2817       if (!_method->signature()->returns_null_free_inline_type()) {
 2818         // The last return value is not set by the callee but used to pass IsInit information to compiled code.
 2819         // Search for the corresponding projection, get the register and emit code that initializes it.
 2820         uint con = (tf()->range_cc()->cnt() - 1);
 2821         for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2822           ProjNode* proj = fast_out(i)->as_Proj();
 2823           if (proj->_con == con) {
 2824             // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2825             OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2826             VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2827             Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2828             __ testq(rax, rax);
 2829             __ set_byte_if_not_zero(toReg);
 2830             __ movzbl(toReg, toReg);
 2831             if (reg->is_stack()) {
 2832               int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2833               __ movq(Address(rsp, st_off), toReg);
 2834             }
 2835             break;
 2836           }
 2837         }
 2838       }
 2839       if (return_value_is_used()) {
 2840         // An inline type is returned as fields in multiple registers.
 2841         // Rax either contains an oop if the inline type is buffered or a pointer
 2842         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2843         // if the lowest bit is set to allow C2 to use the oop after null checking.
 2844         // rax &= (rax & 1) - 1
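              // If the lowest bit of rax is set, (rax & 1) - 1 == 0 and rax is cleared;
              // otherwise (rax & 1) - 1 == -1 (all bits set) and rax is left unchanged.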
 2845         __ movptr(rscratch1, rax);
 2846         __ andptr(rscratch1, 0x1);
 2847         __ subptr(rscratch1, 0x1);
 2848         __ andptr(rax, rscratch1);
 2849       }
 2850     }
 2851   %}
 2852 
 2853 %}
 2854 
 2855 // Operands for bound floating point register arguments
 2856 operand rxmm0() %{
 2857   constraint(ALLOC_IN_RC(xmm0_reg));
 2858   match(VecX);
 2859   format %{ %}
 2860   interface(REG_INTER);
 2861 %}
 2862 
 2863 //----------OPERANDS-----------------------------------------------------------
 2864 // Operand definitions must precede instruction definitions for correct parsing
 2865 // in the ADLC because operands constitute user defined types which are used in
 2866 // instruction definitions.
 2867 
 2868 // Vectors
 2869 
 2870 // Dummy generic vector class. Should be used for all vector operands.
 2871 // Replaced with vec[SDXYZ] during post-selection pass.
 2872 operand vec() %{
 2873   constraint(ALLOC_IN_RC(dynamic));
 2874   match(VecX);
 2875   match(VecY);
 2876   match(VecZ);
 2877   match(VecS);
 2878   match(VecD);
 2879 
 2880   format %{ %}
 2881   interface(REG_INTER);
 2882 %}
 2883 
 2884 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2885 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2886 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2887 // runtime code generation via reg_class_dynamic.
 2888 operand legVec() %{
 2889   constraint(ALLOC_IN_RC(dynamic));
 2890   match(VecX);
 2891   match(VecY);
 2892   match(VecZ);
 2893   match(VecS);
 2894   match(VecD);
 2895 
 2896   format %{ %}
 2897   interface(REG_INTER);
 2898 %}
 2899 
 2900 // Replaces vec during post-selection cleanup. See above.
 2901 operand vecS() %{
 2902   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2903   match(VecS);
 2904 
 2905   format %{ %}
 2906   interface(REG_INTER);
 2907 %}
 2908 
 2909 // Replaces legVec during post-selection cleanup. See above.
 2910 operand legVecS() %{
 2911   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2912   match(VecS);
 2913 
 2914   format %{ %}
 2915   interface(REG_INTER);
 2916 %}
 2917 
 2918 // Replaces vec during post-selection cleanup. See above.
 2919 operand vecD() %{
 2920   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2921   match(VecD);
 2922 
 2923   format %{ %}
 2924   interface(REG_INTER);
 2925 %}
 2926 
 2927 // Replaces legVec during post-selection cleanup. See above.
 2928 operand legVecD() %{
 2929   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2930   match(VecD);
 2931 
 2932   format %{ %}
 2933   interface(REG_INTER);
 2934 %}
 2935 
 2936 // Replaces vec during post-selection cleanup. See above.
 2937 operand vecX() %{
 2938   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2939   match(VecX);
 2940 
 2941   format %{ %}
 2942   interface(REG_INTER);
 2943 %}
 2944 
 2945 // Replaces legVec during post-selection cleanup. See above.
 2946 operand legVecX() %{
 2947   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2948   match(VecX);
 2949 
 2950   format %{ %}
 2951   interface(REG_INTER);
 2952 %}
 2953 
 2954 // Replaces vec during post-selection cleanup. See above.
 2955 operand vecY() %{
 2956   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2957   match(VecY);
 2958 
 2959   format %{ %}
 2960   interface(REG_INTER);
 2961 %}
 2962 
 2963 // Replaces legVec during post-selection cleanup. See above.
 2964 operand legVecY() %{
 2965   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2966   match(VecY);
 2967 
 2968   format %{ %}
 2969   interface(REG_INTER);
 2970 %}
 2971 
 2972 // Replaces vec during post-selection cleanup. See above.
 2973 operand vecZ() %{
 2974   constraint(ALLOC_IN_RC(vectorz_reg));
 2975   match(VecZ);
 2976 
 2977   format %{ %}
 2978   interface(REG_INTER);
 2979 %}
 2980 
 2981 // Replaces legVec during post-selection cleanup. See above.
 2982 operand legVecZ() %{
 2983   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2984   match(VecZ);
 2985 
 2986   format %{ %}
 2987   interface(REG_INTER);
 2988 %}
 2989 
 2990 // Comparison Code for FP conditional move
 2991 operand cmpOp_vcmppd() %{
 2992   match(Bool);
 2993 
 2994   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
 2995             n->as_Bool()->_test._test != BoolTest::no_overflow);
 2996   format %{ "" %}
 2997   interface(COND_INTER) %{
 2998     equal        (0x0, "eq");
 2999     less         (0x1, "lt");
 3000     less_equal   (0x2, "le");
 3001     not_equal    (0xC, "ne");
 3002     greater_equal(0xD, "ge");
 3003     greater      (0xE, "gt");
 3004     // TODO: cannot compile (adlc breaks) without the next two lines; the reported error is:
 3005     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
 3006     // equal' for overflow.
 3007     overflow     (0x20, "o");  // not really supported by the instruction
 3008     no_overflow  (0x21, "no"); // not really supported by the instruction
 3009   %}
 3010 %}
 3011 
 3012 
 3013 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 3014 
 3015 // ============================================================================
 3016 
 3017 instruct ShouldNotReachHere() %{
 3018   match(Halt);
 3019   format %{ "stop\t# ShouldNotReachHere" %}
 3020   ins_encode %{
 3021     if (is_reachable()) {
 3022       __ stop(_halt_reason);
 3023     }
 3024   %}
 3025   ins_pipe(pipe_slow);
 3026 %}
 3027 
 3028 // ============================================================================
 3029 
 3030 instruct addF_reg(regF dst, regF src) %{
 3031   predicate((UseSSE>=1) && (UseAVX == 0));
 3032   match(Set dst (AddF dst src));
 3033 
 3034   format %{ "addss   $dst, $src" %}
 3035   ins_cost(150);
 3036   ins_encode %{
 3037     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3038   %}
 3039   ins_pipe(pipe_slow);
 3040 %}
 3041 
 3042 instruct addF_mem(regF dst, memory src) %{
 3043   predicate((UseSSE>=1) && (UseAVX == 0));
 3044   match(Set dst (AddF dst (LoadF src)));
 3045 
 3046   format %{ "addss   $dst, $src" %}
 3047   ins_cost(150);
 3048   ins_encode %{
 3049     __ addss($dst$$XMMRegister, $src$$Address);
 3050   %}
 3051   ins_pipe(pipe_slow);
 3052 %}
 3053 
 3054 instruct addF_imm(regF dst, immF con) %{
 3055   predicate((UseSSE>=1) && (UseAVX == 0));
 3056   match(Set dst (AddF dst con));
 3057   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3058   ins_cost(150);
 3059   ins_encode %{
 3060     __ addss($dst$$XMMRegister, $constantaddress($con));
 3061   %}
 3062   ins_pipe(pipe_slow);
 3063 %}
 3064 
 3065 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3066   predicate(UseAVX > 0);
 3067   match(Set dst (AddF src1 src2));
 3068 
 3069   format %{ "vaddss  $dst, $src1, $src2" %}
 3070   ins_cost(150);
 3071   ins_encode %{
 3072     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3073   %}
 3074   ins_pipe(pipe_slow);
 3075 %}
 3076 
 3077 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3078   predicate(UseAVX > 0);
 3079   match(Set dst (AddF src1 (LoadF src2)));
 3080 
 3081   format %{ "vaddss  $dst, $src1, $src2" %}
 3082   ins_cost(150);
 3083   ins_encode %{
 3084     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3085   %}
 3086   ins_pipe(pipe_slow);
 3087 %}
 3088 
 3089 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3090   predicate(UseAVX > 0);
 3091   match(Set dst (AddF src con));
 3092 
 3093   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3094   ins_cost(150);
 3095   ins_encode %{
 3096     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3097   %}
 3098   ins_pipe(pipe_slow);
 3099 %}
 3100 
 3101 instruct addD_reg(regD dst, regD src) %{
 3102   predicate((UseSSE>=2) && (UseAVX == 0));
 3103   match(Set dst (AddD dst src));
 3104 
 3105   format %{ "addsd   $dst, $src" %}
 3106   ins_cost(150);
 3107   ins_encode %{
 3108     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3109   %}
 3110   ins_pipe(pipe_slow);
 3111 %}
 3112 
 3113 instruct addD_mem(regD dst, memory src) %{
 3114   predicate((UseSSE>=2) && (UseAVX == 0));
 3115   match(Set dst (AddD dst (LoadD src)));
 3116 
 3117   format %{ "addsd   $dst, $src" %}
 3118   ins_cost(150);
 3119   ins_encode %{
 3120     __ addsd($dst$$XMMRegister, $src$$Address);
 3121   %}
 3122   ins_pipe(pipe_slow);
 3123 %}
 3124 
 3125 instruct addD_imm(regD dst, immD con) %{
 3126   predicate((UseSSE>=2) && (UseAVX == 0));
 3127   match(Set dst (AddD dst con));
 3128   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3129   ins_cost(150);
 3130   ins_encode %{
 3131     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3132   %}
 3133   ins_pipe(pipe_slow);
 3134 %}
 3135 
 3136 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3137   predicate(UseAVX > 0);
 3138   match(Set dst (AddD src1 src2));
 3139 
 3140   format %{ "vaddsd  $dst, $src1, $src2" %}
 3141   ins_cost(150);
 3142   ins_encode %{
 3143     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3144   %}
 3145   ins_pipe(pipe_slow);
 3146 %}
 3147 
 3148 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3149   predicate(UseAVX > 0);
 3150   match(Set dst (AddD src1 (LoadD src2)));
 3151 
 3152   format %{ "vaddsd  $dst, $src1, $src2" %}
 3153   ins_cost(150);
 3154   ins_encode %{
 3155     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3156   %}
 3157   ins_pipe(pipe_slow);
 3158 %}
 3159 
 3160 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3161   predicate(UseAVX > 0);
 3162   match(Set dst (AddD src con));
 3163 
 3164   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3165   ins_cost(150);
 3166   ins_encode %{
 3167     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3168   %}
 3169   ins_pipe(pipe_slow);
 3170 %}
 3171 
 3172 instruct subF_reg(regF dst, regF src) %{
 3173   predicate((UseSSE>=1) && (UseAVX == 0));
 3174   match(Set dst (SubF dst src));
 3175 
 3176   format %{ "subss   $dst, $src" %}
 3177   ins_cost(150);
 3178   ins_encode %{
 3179     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3180   %}
 3181   ins_pipe(pipe_slow);
 3182 %}
 3183 
 3184 instruct subF_mem(regF dst, memory src) %{
 3185   predicate((UseSSE>=1) && (UseAVX == 0));
 3186   match(Set dst (SubF dst (LoadF src)));
 3187 
 3188   format %{ "subss   $dst, $src" %}
 3189   ins_cost(150);
 3190   ins_encode %{
 3191     __ subss($dst$$XMMRegister, $src$$Address);
 3192   %}
 3193   ins_pipe(pipe_slow);
 3194 %}
 3195 
 3196 instruct subF_imm(regF dst, immF con) %{
 3197   predicate((UseSSE>=1) && (UseAVX == 0));
 3198   match(Set dst (SubF dst con));
 3199   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3200   ins_cost(150);
 3201   ins_encode %{
 3202     __ subss($dst$$XMMRegister, $constantaddress($con));
 3203   %}
 3204   ins_pipe(pipe_slow);
 3205 %}
 3206 
 3207 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3208   predicate(UseAVX > 0);
 3209   match(Set dst (SubF src1 src2));
 3210 
 3211   format %{ "vsubss  $dst, $src1, $src2" %}
 3212   ins_cost(150);
 3213   ins_encode %{
 3214     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3215   %}
 3216   ins_pipe(pipe_slow);
 3217 %}
 3218 
 3219 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3220   predicate(UseAVX > 0);
 3221   match(Set dst (SubF src1 (LoadF src2)));
 3222 
 3223   format %{ "vsubss  $dst, $src1, $src2" %}
 3224   ins_cost(150);
 3225   ins_encode %{
 3226     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3227   %}
 3228   ins_pipe(pipe_slow);
 3229 %}
 3230 
 3231 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3232   predicate(UseAVX > 0);
 3233   match(Set dst (SubF src con));
 3234 
 3235   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3236   ins_cost(150);
 3237   ins_encode %{
 3238     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3239   %}
 3240   ins_pipe(pipe_slow);
 3241 %}
 3242 
 3243 instruct subD_reg(regD dst, regD src) %{
 3244   predicate((UseSSE>=2) && (UseAVX == 0));
 3245   match(Set dst (SubD dst src));
 3246 
 3247   format %{ "subsd   $dst, $src" %}
 3248   ins_cost(150);
 3249   ins_encode %{
 3250     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3251   %}
 3252   ins_pipe(pipe_slow);
 3253 %}
 3254 
 3255 instruct subD_mem(regD dst, memory src) %{
 3256   predicate((UseSSE>=2) && (UseAVX == 0));
 3257   match(Set dst (SubD dst (LoadD src)));
 3258 
 3259   format %{ "subsd   $dst, $src" %}
 3260   ins_cost(150);
 3261   ins_encode %{
 3262     __ subsd($dst$$XMMRegister, $src$$Address);
 3263   %}
 3264   ins_pipe(pipe_slow);
 3265 %}
 3266 
 3267 instruct subD_imm(regD dst, immD con) %{
 3268   predicate((UseSSE>=2) && (UseAVX == 0));
 3269   match(Set dst (SubD dst con));
 3270   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3271   ins_cost(150);
 3272   ins_encode %{
 3273     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3274   %}
 3275   ins_pipe(pipe_slow);
 3276 %}
 3277 
 3278 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3279   predicate(UseAVX > 0);
 3280   match(Set dst (SubD src1 src2));
 3281 
 3282   format %{ "vsubsd  $dst, $src1, $src2" %}
 3283   ins_cost(150);
 3284   ins_encode %{
 3285     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3286   %}
 3287   ins_pipe(pipe_slow);
 3288 %}
 3289 
 3290 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3291   predicate(UseAVX > 0);
 3292   match(Set dst (SubD src1 (LoadD src2)));
 3293 
 3294   format %{ "vsubsd  $dst, $src1, $src2" %}
 3295   ins_cost(150);
 3296   ins_encode %{
 3297     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3298   %}
 3299   ins_pipe(pipe_slow);
 3300 %}
 3301 
 3302 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3303   predicate(UseAVX > 0);
 3304   match(Set dst (SubD src con));
 3305 
 3306   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3307   ins_cost(150);
 3308   ins_encode %{
 3309     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3310   %}
 3311   ins_pipe(pipe_slow);
 3312 %}
 3313 
 3314 instruct mulF_reg(regF dst, regF src) %{
 3315   predicate((UseSSE>=1) && (UseAVX == 0));
 3316   match(Set dst (MulF dst src));
 3317 
 3318   format %{ "mulss   $dst, $src" %}
 3319   ins_cost(150);
 3320   ins_encode %{
 3321     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3322   %}
 3323   ins_pipe(pipe_slow);
 3324 %}
 3325 
 3326 instruct mulF_mem(regF dst, memory src) %{
 3327   predicate((UseSSE>=1) && (UseAVX == 0));
 3328   match(Set dst (MulF dst (LoadF src)));
 3329 
 3330   format %{ "mulss   $dst, $src" %}
 3331   ins_cost(150);
 3332   ins_encode %{
 3333     __ mulss($dst$$XMMRegister, $src$$Address);
 3334   %}
 3335   ins_pipe(pipe_slow);
 3336 %}
 3337 
 3338 instruct mulF_imm(regF dst, immF con) %{
 3339   predicate((UseSSE>=1) && (UseAVX == 0));
 3340   match(Set dst (MulF dst con));
 3341   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3342   ins_cost(150);
 3343   ins_encode %{
 3344     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3345   %}
 3346   ins_pipe(pipe_slow);
 3347 %}
 3348 
 3349 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3350   predicate(UseAVX > 0);
 3351   match(Set dst (MulF src1 src2));
 3352 
 3353   format %{ "vmulss  $dst, $src1, $src2" %}
 3354   ins_cost(150);
 3355   ins_encode %{
 3356     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3357   %}
 3358   ins_pipe(pipe_slow);
 3359 %}
 3360 
 3361 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3362   predicate(UseAVX > 0);
 3363   match(Set dst (MulF src1 (LoadF src2)));
 3364 
 3365   format %{ "vmulss  $dst, $src1, $src2" %}
 3366   ins_cost(150);
 3367   ins_encode %{
 3368     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3369   %}
 3370   ins_pipe(pipe_slow);
 3371 %}
 3372 
 3373 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3374   predicate(UseAVX > 0);
 3375   match(Set dst (MulF src con));
 3376 
 3377   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3378   ins_cost(150);
 3379   ins_encode %{
 3380     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3381   %}
 3382   ins_pipe(pipe_slow);
 3383 %}
 3384 
 3385 instruct mulD_reg(regD dst, regD src) %{
 3386   predicate((UseSSE>=2) && (UseAVX == 0));
 3387   match(Set dst (MulD dst src));
 3388 
 3389   format %{ "mulsd   $dst, $src" %}
 3390   ins_cost(150);
 3391   ins_encode %{
 3392     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3393   %}
 3394   ins_pipe(pipe_slow);
 3395 %}
 3396 
 3397 instruct mulD_mem(regD dst, memory src) %{
 3398   predicate((UseSSE>=2) && (UseAVX == 0));
 3399   match(Set dst (MulD dst (LoadD src)));
 3400 
 3401   format %{ "mulsd   $dst, $src" %}
 3402   ins_cost(150);
 3403   ins_encode %{
 3404     __ mulsd($dst$$XMMRegister, $src$$Address);
 3405   %}
 3406   ins_pipe(pipe_slow);
 3407 %}
 3408 
 3409 instruct mulD_imm(regD dst, immD con) %{
 3410   predicate((UseSSE>=2) && (UseAVX == 0));
 3411   match(Set dst (MulD dst con));
 3412   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3413   ins_cost(150);
 3414   ins_encode %{
 3415     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3416   %}
 3417   ins_pipe(pipe_slow);
 3418 %}
 3419 
 3420 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3421   predicate(UseAVX > 0);
 3422   match(Set dst (MulD src1 src2));
 3423 
 3424   format %{ "vmulsd  $dst, $src1, $src2" %}
 3425   ins_cost(150);
 3426   ins_encode %{
 3427     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3428   %}
 3429   ins_pipe(pipe_slow);
 3430 %}
 3431 
 3432 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3433   predicate(UseAVX > 0);
 3434   match(Set dst (MulD src1 (LoadD src2)));
 3435 
 3436   format %{ "vmulsd  $dst, $src1, $src2" %}
 3437   ins_cost(150);
 3438   ins_encode %{
 3439     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3440   %}
 3441   ins_pipe(pipe_slow);
 3442 %}
 3443 
 3444 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3445   predicate(UseAVX > 0);
 3446   match(Set dst (MulD src con));
 3447 
 3448   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3449   ins_cost(150);
 3450   ins_encode %{
 3451     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3452   %}
 3453   ins_pipe(pipe_slow);
 3454 %}
 3455 
 3456 instruct divF_reg(regF dst, regF src) %{
 3457   predicate((UseSSE>=1) && (UseAVX == 0));
 3458   match(Set dst (DivF dst src));
 3459 
 3460   format %{ "divss   $dst, $src" %}
 3461   ins_cost(150);
 3462   ins_encode %{
 3463     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3464   %}
 3465   ins_pipe(pipe_slow);
 3466 %}
 3467 
 3468 instruct divF_mem(regF dst, memory src) %{
 3469   predicate((UseSSE>=1) && (UseAVX == 0));
 3470   match(Set dst (DivF dst (LoadF src)));
 3471 
 3472   format %{ "divss   $dst, $src" %}
 3473   ins_cost(150);
 3474   ins_encode %{
 3475     __ divss($dst$$XMMRegister, $src$$Address);
 3476   %}
 3477   ins_pipe(pipe_slow);
 3478 %}
 3479 
 3480 instruct divF_imm(regF dst, immF con) %{
 3481   predicate((UseSSE>=1) && (UseAVX == 0));
 3482   match(Set dst (DivF dst con));
 3483   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3484   ins_cost(150);
 3485   ins_encode %{
 3486     __ divss($dst$$XMMRegister, $constantaddress($con));
 3487   %}
 3488   ins_pipe(pipe_slow);
 3489 %}
 3490 
 3491 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3492   predicate(UseAVX > 0);
 3493   match(Set dst (DivF src1 src2));
 3494 
 3495   format %{ "vdivss  $dst, $src1, $src2" %}
 3496   ins_cost(150);
 3497   ins_encode %{
 3498     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3499   %}
 3500   ins_pipe(pipe_slow);
 3501 %}
 3502 
 3503 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3504   predicate(UseAVX > 0);
 3505   match(Set dst (DivF src1 (LoadF src2)));
 3506 
 3507   format %{ "vdivss  $dst, $src1, $src2" %}
 3508   ins_cost(150);
 3509   ins_encode %{
 3510     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3511   %}
 3512   ins_pipe(pipe_slow);
 3513 %}
 3514 
 3515 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3516   predicate(UseAVX > 0);
 3517   match(Set dst (DivF src con));
 3518 
 3519   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3520   ins_cost(150);
 3521   ins_encode %{
 3522     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3523   %}
 3524   ins_pipe(pipe_slow);
 3525 %}
 3526 
 3527 instruct divD_reg(regD dst, regD src) %{
 3528   predicate((UseSSE>=2) && (UseAVX == 0));
 3529   match(Set dst (DivD dst src));
 3530 
 3531   format %{ "divsd   $dst, $src" %}
 3532   ins_cost(150);
 3533   ins_encode %{
 3534     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3535   %}
 3536   ins_pipe(pipe_slow);
 3537 %}
 3538 
 3539 instruct divD_mem(regD dst, memory src) %{
 3540   predicate((UseSSE>=2) && (UseAVX == 0));
 3541   match(Set dst (DivD dst (LoadD src)));
 3542 
 3543   format %{ "divsd   $dst, $src" %}
 3544   ins_cost(150);
 3545   ins_encode %{
 3546     __ divsd($dst$$XMMRegister, $src$$Address);
 3547   %}
 3548   ins_pipe(pipe_slow);
 3549 %}
 3550 
 3551 instruct divD_imm(regD dst, immD con) %{
 3552   predicate((UseSSE>=2) && (UseAVX == 0));
 3553   match(Set dst (DivD dst con));
 3554   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3555   ins_cost(150);
 3556   ins_encode %{
 3557     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3558   %}
 3559   ins_pipe(pipe_slow);
 3560 %}
 3561 
 3562 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3563   predicate(UseAVX > 0);
 3564   match(Set dst (DivD src1 src2));
 3565 
 3566   format %{ "vdivsd  $dst, $src1, $src2" %}
 3567   ins_cost(150);
 3568   ins_encode %{
 3569     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3570   %}
 3571   ins_pipe(pipe_slow);
 3572 %}
 3573 
 3574 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3575   predicate(UseAVX > 0);
 3576   match(Set dst (DivD src1 (LoadD src2)));
 3577 
 3578   format %{ "vdivsd  $dst, $src1, $src2" %}
 3579   ins_cost(150);
 3580   ins_encode %{
 3581     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3582   %}
 3583   ins_pipe(pipe_slow);
 3584 %}
 3585 
 3586 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3587   predicate(UseAVX > 0);
 3588   match(Set dst (DivD src con));
 3589 
 3590   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3591   ins_cost(150);
 3592   ins_encode %{
 3593     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3594   %}
 3595   ins_pipe(pipe_slow);
 3596 %}
 3597 
 3598 instruct absF_reg(regF dst) %{
 3599   predicate((UseSSE>=1) && (UseAVX == 0));
 3600   match(Set dst (AbsF dst));
 3601   ins_cost(150);
 3602   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3603   ins_encode %{
 3604     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3605   %}
 3606   ins_pipe(pipe_slow);
 3607 %}
 3608 
 3609 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3610   predicate(UseAVX > 0);
 3611   match(Set dst (AbsF src));
 3612   ins_cost(150);
 3613   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3614   ins_encode %{
 3615     int vlen_enc = Assembler::AVX_128bit;
 3616     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3617               ExternalAddress(float_signmask()), vlen_enc);
 3618   %}
 3619   ins_pipe(pipe_slow);
 3620 %}
 3621 
 3622 instruct absD_reg(regD dst) %{
 3623   predicate((UseSSE>=2) && (UseAVX == 0));
 3624   match(Set dst (AbsD dst));
 3625   ins_cost(150);
 3626   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3627             "# abs double by sign masking" %}
 3628   ins_encode %{
 3629     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3630   %}
 3631   ins_pipe(pipe_slow);
 3632 %}
 3633 
 3634 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3635   predicate(UseAVX > 0);
 3636   match(Set dst (AbsD src));
 3637   ins_cost(150);
 3638   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3639             "# abs double by sign masking" %}
 3640   ins_encode %{
 3641     int vlen_enc = Assembler::AVX_128bit;
 3642     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3643               ExternalAddress(double_signmask()), vlen_enc);
 3644   %}
 3645   ins_pipe(pipe_slow);
 3646 %}
 3647 
 3648 instruct negF_reg(regF dst) %{
 3649   predicate((UseSSE>=1) && (UseAVX == 0));
 3650   match(Set dst (NegF dst));
 3651   ins_cost(150);
 3652   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3653   ins_encode %{
 3654     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3655   %}
 3656   ins_pipe(pipe_slow);
 3657 %}
 3658 
 3659 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3660   predicate(UseAVX > 0);
 3661   match(Set dst (NegF src));
 3662   ins_cost(150);
 3663   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3664   ins_encode %{
 3665     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3666                  ExternalAddress(float_signflip()));
 3667   %}
 3668   ins_pipe(pipe_slow);
 3669 %}
 3670 
 3671 instruct negD_reg(regD dst) %{
 3672   predicate((UseSSE>=2) && (UseAVX == 0));
 3673   match(Set dst (NegD dst));
 3674   ins_cost(150);
 3675   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3676             "# neg double by sign flipping" %}
 3677   ins_encode %{
 3678     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3679   %}
 3680   ins_pipe(pipe_slow);
 3681 %}
 3682 
 3683 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3684   predicate(UseAVX > 0);
 3685   match(Set dst (NegD src));
 3686   ins_cost(150);
 3687   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3688             "# neg double by sign flipping" %}
 3689   ins_encode %{
 3690     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3691                  ExternalAddress(double_signflip()));
 3692   %}
 3693   ins_pipe(pipe_slow);
 3694 %}
 3695 
 3696 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3697 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3698 instruct sqrtF_reg(regF dst) %{
 3699   predicate(UseSSE>=1);
 3700   match(Set dst (SqrtF dst));
 3701   format %{ "sqrtss  $dst, $dst" %}
 3702   ins_encode %{
 3703     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3704   %}
 3705   ins_pipe(pipe_slow);
 3706 %}
 3707 
 3708 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3709 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3710 instruct sqrtD_reg(regD dst) %{
 3711   predicate(UseSSE>=2);
 3712   match(Set dst (SqrtD dst));
 3713   format %{ "sqrtsd  $dst, $dst" %}
 3714   ins_encode %{
 3715     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3716   %}
 3717   ins_pipe(pipe_slow);
 3718 %}
 3719 
 3720 instruct convF2HF_reg_reg(rRegI dst, regF src, regF tmp) %{
 3721   effect(TEMP tmp);
 3722   match(Set dst (ConvF2HF src));
 3723   ins_cost(125);
 3724   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3725   ins_encode %{
 3726     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3732   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3733   effect(TEMP ktmp, TEMP rtmp);
 3734   match(Set mem (StoreC mem (ConvF2HF src)));
 3735   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3736   ins_encode %{
 3737     __ movl($rtmp$$Register, 0x1);
 3738     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3739     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3740   %}
 3741   ins_pipe( pipe_slow );
 3742 %}
 3743 
 3744 instruct vconvF2HF(vec dst, vec src) %{
 3745   match(Set dst (VectorCastF2HF src));
 3746   format %{ "vector_conv_F2HF $dst $src" %}
 3747   ins_encode %{
 3748     int vlen_enc = vector_length_encoding(this, $src);
 3749     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3750   %}
 3751   ins_pipe( pipe_slow );
 3752 %}
 3753 
 3754 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3755   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3756   format %{ "vcvtps2ph $mem,$src" %}
 3757   ins_encode %{
 3758     int vlen_enc = vector_length_encoding(this, $src);
 3759     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3760   %}
 3761   ins_pipe( pipe_slow );
 3762 %}
 3763 
 3764 instruct convHF2F_reg_reg(regF dst, rRegI src) %{
 3765   match(Set dst (ConvHF2F src));
 3766   format %{ "vcvtph2ps $dst,$src" %}
 3767   ins_encode %{
 3768     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3769   %}
 3770   ins_pipe( pipe_slow );
 3771 %}
 3772 
 3773 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3774   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3775   format %{ "vcvtph2ps $dst,$mem" %}
 3776   ins_encode %{
 3777     int vlen_enc = vector_length_encoding(this);
 3778     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3779   %}
 3780   ins_pipe( pipe_slow );
 3781 %}
 3782 
 3783 instruct vconvHF2F(vec dst, vec src) %{
 3784   match(Set dst (VectorCastHF2F src));
 3785   ins_cost(125);
 3786   format %{ "vector_conv_HF2F $dst,$src" %}
 3787   ins_encode %{
 3788     int vlen_enc = vector_length_encoding(this);
 3789     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3790   %}
 3791   ins_pipe( pipe_slow );
 3792 %}
 3793 
 3794 // ---------------------------------------- VectorReinterpret ------------------------------------
 3795 instruct reinterpret_mask(kReg dst) %{
 3796   predicate(n->bottom_type()->isa_vectmask() &&
 3797             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3798   match(Set dst (VectorReinterpret dst));
 3799   ins_cost(125);
 3800   format %{ "vector_reinterpret $dst\t!" %}
 3801   ins_encode %{
 3802     // empty
 3803   %}
 3804   ins_pipe( pipe_slow );
 3805 %}
 3806 
 3807 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3808   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3809             n->bottom_type()->isa_vectmask() &&
 3810             n->in(1)->bottom_type()->isa_vectmask() &&
 3811             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3812             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3813   match(Set dst (VectorReinterpret src));
 3814   effect(TEMP xtmp);
 3815   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3816   ins_encode %{
 3817      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3818      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3819      assert(src_sz == dst_sz , "src and dst size mismatch");
 3820      int vlen_enc = vector_length_encoding(src_sz);
 3821      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3822      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3823   %}
 3824   ins_pipe( pipe_slow );
 3825 %}
 3826 
 3827 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3828   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3829             n->bottom_type()->isa_vectmask() &&
 3830             n->in(1)->bottom_type()->isa_vectmask() &&
 3831             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3832              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3833             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3834   match(Set dst (VectorReinterpret src));
 3835   effect(TEMP xtmp);
 3836   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3837   ins_encode %{
 3838      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3839      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3840      assert(src_sz == dst_sz , "src and dst size mismatch");
 3841      int vlen_enc = vector_length_encoding(src_sz);
 3842      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3843      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3844   %}
 3845   ins_pipe( pipe_slow );
 3846 %}
 3847 
 3848 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3849   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3850             n->bottom_type()->isa_vectmask() &&
 3851             n->in(1)->bottom_type()->isa_vectmask() &&
 3852             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3853              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3854             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst size == src size
 3855   match(Set dst (VectorReinterpret src));
 3856   effect(TEMP xtmp);
 3857   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3858   ins_encode %{
 3859      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3860      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3861      assert(src_sz == dst_sz , "src and dst size mismatch");
 3862      int vlen_enc = vector_length_encoding(src_sz);
 3863      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3864      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3865   %}
 3866   ins_pipe( pipe_slow );
 3867 %}
 3868 
 3869 instruct reinterpret(vec dst) %{
 3870   predicate(!n->bottom_type()->isa_vectmask() &&
 3871             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3872   match(Set dst (VectorReinterpret dst));
 3873   ins_cost(125);
 3874   format %{ "vector_reinterpret $dst\t!" %}
 3875   ins_encode %{
 3876     // empty
 3877   %}
 3878   ins_pipe( pipe_slow );
 3879 %}
 3880 
 3881 instruct reinterpret_expand(vec dst, vec src) %{
 3882   predicate(UseAVX == 0 &&
 3883             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3884   match(Set dst (VectorReinterpret src));
 3885   ins_cost(125);
 3886   effect(TEMP dst);
 3887   format %{ "vector_reinterpret_expand $dst,$src" %}
 3888   ins_encode %{
 3889     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3890     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3891 
 3892     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
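          // AND the source with a 32-bit or 64-bit all-ones mask so the bytes beyond the
          // original vector read as zero in the widened destination.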
 3893     if (src_vlen_in_bytes == 4) {
 3894       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3895     } else {
 3896       assert(src_vlen_in_bytes == 8, "");
 3897       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3898     }
 3899     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3900   %}
 3901   ins_pipe( pipe_slow );
 3902 %}
 3903 
 3904 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3905   predicate(UseAVX > 0 &&
 3906             !n->bottom_type()->isa_vectmask() &&
 3907             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3908             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3909   match(Set dst (VectorReinterpret src));
 3910   ins_cost(125);
 3911   format %{ "vector_reinterpret_expand $dst,$src" %}
 3912   ins_encode %{
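          // AND with a mask that keeps only the low 4 bytes, zeroing the rest of the wider destination.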
 3913     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3914   %}
 3915   ins_pipe( pipe_slow );
 3916 %}
 3917 
 3918 
 3919 instruct vreinterpret_expand(legVec dst, vec src) %{
 3920   predicate(UseAVX > 0 &&
 3921             !n->bottom_type()->isa_vectmask() &&
 3922             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3923             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3924   match(Set dst (VectorReinterpret src));
 3925   ins_cost(125);
 3926   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3927   ins_encode %{
 3928     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3929       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3930       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3931       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3932       default: ShouldNotReachHere();
 3933     }
 3934   %}
 3935   ins_pipe( pipe_slow );
 3936 %}
 3937 
 3938 instruct reinterpret_shrink(vec dst, legVec src) %{
 3939   predicate(!n->bottom_type()->isa_vectmask() &&
 3940             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3941   match(Set dst (VectorReinterpret src));
 3942   ins_cost(125);
 3943   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3944   ins_encode %{
 3945     switch (Matcher::vector_length_in_bytes(this)) {
 3946       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3947       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3948       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3949       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3950       default: ShouldNotReachHere();
 3951     }
 3952   %}
 3953   ins_pipe( pipe_slow );
 3954 %}
 3955 
 3956 // ----------------------------------------------------------------------------------------------------
 3957 
 3958 #ifdef _LP64
 3959 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3960   match(Set dst (RoundDoubleMode src rmode));
 3961   format %{ "roundsd $dst,$src" %}
 3962   ins_cost(150);
 3963   ins_encode %{
 3964     assert(UseSSE >= 4, "required");
 3965     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3966   %}
 3967   ins_pipe(pipe_slow);
 3968 %}
 3969 
 3970 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3971   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3972   format %{ "roundsd $dst,$src" %}
 3973   ins_cost(150);
 3974   ins_encode %{
 3975     assert(UseSSE >= 4, "required");
 3976     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3977   %}
 3978   ins_pipe(pipe_slow);
 3979 %}
 3980 
 3981 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3982   match(Set dst (RoundDoubleMode con rmode));
 3983   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3984   ins_cost(150);
 3985   ins_encode %{
 3986     assert(UseSSE >= 4, "required");
 3987     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3988   %}
 3989   ins_pipe(pipe_slow);
 3990 %}
 3991 
 3992 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3993   predicate(Matcher::vector_length(n) < 8);
 3994   match(Set dst (RoundDoubleModeV src rmode));
 3995   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3996   ins_encode %{
 3997     assert(UseAVX > 0, "required");
 3998     int vlen_enc = vector_length_encoding(this);
 3999     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 4000   %}
 4001   ins_pipe( pipe_slow );
 4002 %}
 4003 
 4004 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 4005   predicate(Matcher::vector_length(n) == 8);
 4006   match(Set dst (RoundDoubleModeV src rmode));
 4007   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 4008   ins_encode %{
 4009     assert(UseAVX > 2, "required");
 4010     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 4011   %}
 4012   ins_pipe( pipe_slow );
 4013 %}
 4014 
 4015 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 4016   predicate(Matcher::vector_length(n) < 8);
 4017   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4018   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 4019   ins_encode %{
 4020     assert(UseAVX > 0, "required");
 4021     int vlen_enc = vector_length_encoding(this);
 4022     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 4023   %}
 4024   ins_pipe( pipe_slow );
 4025 %}
 4026 
 4027 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 4028   predicate(Matcher::vector_length(n) == 8);
 4029   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4030   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 4031   ins_encode %{
 4032     assert(UseAVX > 2, "required");
 4033     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 4034   %}
 4035   ins_pipe( pipe_slow );
 4036 %}
 4037 #endif // _LP64
 4038 
 4039 instruct onspinwait() %{
 4040   match(OnSpinWait);
 4041   ins_cost(200);
 4042 
 4043   format %{
 4044     $$template
 4045     $$emit$$"pause\t! membar_onspinwait"
 4046   %}
 4047   ins_encode %{
 4048     __ pause();
 4049   %}
 4050   ins_pipe(pipe_slow);
 4051 %}
 4052 
 4053 // a * b + c
 4054 instruct fmaD_reg(regD a, regD b, regD c) %{
 4055   predicate(UseFMA);
 4056   match(Set c (FmaD  c (Binary a b)));
 4057   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4058   ins_cost(150);
 4059   ins_encode %{
 4060     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4061   %}
 4062   ins_pipe( pipe_slow );
 4063 %}
 4064 
 4065 // a * b + c
 4066 instruct fmaF_reg(regF a, regF b, regF c) %{
 4067   predicate(UseFMA);
 4068   match(Set c (FmaF  c (Binary a b)));
 4069   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4070   ins_cost(150);
 4071   ins_encode %{
 4072     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4073   %}
 4074   ins_pipe( pipe_slow );
 4075 %}
 4076 
 4077 // ====================VECTOR INSTRUCTIONS=====================================
 4078 
 4079 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4080 instruct MoveVec2Leg(legVec dst, vec src) %{
 4081   match(Set dst src);
 4082   format %{ "" %}
 4083   ins_encode %{
 4084     ShouldNotReachHere();
 4085   %}
 4086   ins_pipe( fpu_reg_reg );
 4087 %}
 4088 
 4089 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4090   match(Set dst src);
 4091   format %{ "" %}
 4092   ins_encode %{
 4093     ShouldNotReachHere();
 4094   %}
 4095   ins_pipe( fpu_reg_reg );
 4096 %}
 4097 
 4098 // ============================================================================
 4099 
 4100 // Load vectors using the generic operand pattern
 4101 instruct loadV(vec dst, memory mem) %{
 4102   match(Set dst (LoadVector mem));
 4103   ins_cost(125);
 4104   format %{ "load_vector $dst,$mem" %}
 4105   ins_encode %{
 4106     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4107   %}
 4108   ins_pipe( pipe_slow );
 4109 %}
 4110 
 4111 // Store vectors using the generic operand pattern.
 4112 instruct storeV(memory mem, vec src) %{
 4113   match(Set mem (StoreVector mem src));
 4114   ins_cost(145);
 4115   format %{ "store_vector $mem,$src" %}
 4116   ins_encode %{
 4117     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4118       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4119       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4120       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4121       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4122       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4123       default: ShouldNotReachHere();
 4124     }
 4125   %}
 4126   ins_pipe( pipe_slow );
 4127 %}
 4128 
 4129 // ---------------------------------------- Gather ------------------------------------
 4130 
 4131 // Gather INT, LONG, FLOAT, DOUBLE
 4132 
 4133 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4134   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4135   match(Set dst (LoadVectorGather mem idx));
 4136   effect(TEMP dst, TEMP tmp, TEMP mask);
 4137   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4138   ins_encode %{
 4139     assert(UseAVX >= 2, "sanity");
 4140 
 4141     int vlen_enc = vector_length_encoding(this);
 4142     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4143 
 4144     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4145     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4146 
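          // AVX2 gathers are predicated by a vector mask; use an all-ones mask so every lane is loaded.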
 4147     if (vlen_enc == Assembler::AVX_128bit) {
 4148       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4149     } else {
 4150       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4151     }
 4152     __ lea($tmp$$Register, $mem$$Address);
 4153     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4154   %}
 4155   ins_pipe( pipe_slow );
 4156 %}
 4157 
 4158 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4159   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4160   match(Set dst (LoadVectorGather mem idx));
 4161   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4162   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4163   ins_encode %{
 4164     assert(UseAVX > 2, "sanity");
 4165 
 4166     int vlen_enc = vector_length_encoding(this);
 4167     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4168 
 4169     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4170 
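          // EVEX gathers are predicated by an opmask; set every bit so all lanes are loaded.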
 4171     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4172     __ lea($tmp$$Register, $mem$$Address);
 4173     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4174   %}
 4175   ins_pipe( pipe_slow );
 4176 %}
 4177 
 4178 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4179   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4180   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4181   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4182   ins_encode %{
 4183     assert(UseAVX > 2, "sanity");
 4184     int vlen_enc = vector_length_encoding(this);
 4185     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4186     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4187     // Note: Since the gather instruction partially updates the opmask register used
 4188     // for predication, the mask operand is first copied to a temporary register.
 4189     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4190     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4191     __ lea($tmp$$Register, $mem$$Address);
 4192     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4193   %}
 4194   ins_pipe( pipe_slow );
 4195 %}
 4196 // ====================Scatter=======================================
 4197 
 4198 // Scatter INT, LONG, FLOAT, DOUBLE
 4199 
 4200 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4201   predicate(UseAVX > 2);
 4202   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4203   effect(TEMP tmp, TEMP ktmp);
 4204   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4205   ins_encode %{
 4206     int vlen_enc = vector_length_encoding(this, $src);
 4207     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4208 
 4209     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4210     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4211 
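          // EVEX scatters are predicated by an opmask; set every bit so all lanes are stored.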
 4212     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4213     __ lea($tmp$$Register, $mem$$Address);
 4214     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4215   %}
 4216   ins_pipe( pipe_slow );
 4217 %}
 4218 
 4219 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4220   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4221   effect(TEMP tmp, TEMP ktmp);
 4222   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4223   ins_encode %{
 4224     int vlen_enc = vector_length_encoding(this, $src);
 4225     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4226     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4227     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4228     // Note: Since the scatter instruction partially updates the opmask register used
 4229     // for predication, the mask operand is first copied to a temporary register.
 4230     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4231     __ lea($tmp$$Register, $mem$$Address);
 4232     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4233   %}
 4234   ins_pipe( pipe_slow );
 4235 %}
 4236 
 4237 // ====================REPLICATE=======================================
 4238 
 4239 // Replicate a byte scalar into a vector
 4240 instruct vReplB_reg(vec dst, rRegI src) %{
 4241   predicate(UseAVX >= 2);
 4242   match(Set dst (ReplicateB src));
 4243   format %{ "replicateB $dst,$src" %}
 4244   ins_encode %{
 4245     uint vlen = Matcher::vector_length(this);
 4246     int vlen_enc = vector_length_encoding(this);
 4247     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4248       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4249       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4250     } else {
 4251       __ movdl($dst$$XMMRegister, $src$$Register);
 4252       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4253     }
 4254   %}
 4255   ins_pipe( pipe_slow );
 4256 %}
 4257 
 4258 instruct ReplB_reg(vec dst, rRegI src) %{
 4259   predicate(UseAVX < 2);
 4260   match(Set dst (ReplicateB src));
 4261   format %{ "replicateB $dst,$src" %}
 4262   ins_encode %{
 4263     uint vlen = Matcher::vector_length(this);
 4264     __ movdl($dst$$XMMRegister, $src$$Register);
 4265     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4266     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4267     if (vlen >= 16) {
 4268       assert(vlen == 16, "");
 4269       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4270     }
 4271   %}
 4272   ins_pipe( pipe_slow );
 4273 %}
 4274 
 4275 instruct ReplB_mem(vec dst, memory mem) %{
 4276   predicate(UseAVX >= 2);
 4277   match(Set dst (ReplicateB (LoadB mem)));
 4278   format %{ "replicateB $dst,$mem" %}
 4279   ins_encode %{
 4280     int vlen_enc = vector_length_encoding(this);
 4281     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4282   %}
 4283   ins_pipe( pipe_slow );
 4284 %}
 4285 
 4286 // ====================ReplicateS=======================================
 4287 
 4288 instruct vReplS_reg(vec dst, rRegI src) %{
 4289   predicate(UseAVX >= 2);
 4290   match(Set dst (ReplicateS src));
 4291   format %{ "replicateS $dst,$src" %}
 4292   ins_encode %{
 4293     uint vlen = Matcher::vector_length(this);
 4294     int vlen_enc = vector_length_encoding(this);
 4295     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
 4296       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4297       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4298     } else {
 4299       __ movdl($dst$$XMMRegister, $src$$Register);
 4300       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4301     }
 4302   %}
 4303   ins_pipe( pipe_slow );
 4304 %}
 4305 
 4306 instruct ReplS_reg(vec dst, rRegI src) %{
 4307   predicate(UseAVX < 2);
 4308   match(Set dst (ReplicateS src));
 4309   format %{ "replicateS $dst,$src" %}
 4310   ins_encode %{
 4311     uint vlen = Matcher::vector_length(this);
 4312     int vlen_enc = vector_length_encoding(this);
 4313     __ movdl($dst$$XMMRegister, $src$$Register);
 4314     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4315     if (vlen >= 8) {
 4316       assert(vlen == 8, "");
 4317       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4318     }
 4319   %}
 4320   ins_pipe( pipe_slow );
 4321 %}
 4322 
 4323 instruct ReplS_mem(vec dst, memory mem) %{
 4324   predicate(UseAVX >= 2);
 4325   match(Set dst (ReplicateS (LoadS mem)));
 4326   format %{ "replicateS $dst,$mem" %}
 4327   ins_encode %{
 4328     int vlen_enc = vector_length_encoding(this);
 4329     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4330   %}
 4331   ins_pipe( pipe_slow );
 4332 %}
 4333 
 4334 // ====================ReplicateI=======================================
 4335 
 4336 instruct ReplI_reg(vec dst, rRegI src) %{
 4337   match(Set dst (ReplicateI src));
 4338   format %{ "replicateI $dst,$src" %}
 4339   ins_encode %{
 4340     uint vlen = Matcher::vector_length(this);
 4341     int vlen_enc = vector_length_encoding(this);
 4342     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4343       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4344     } else if (VM_Version::supports_avx2()) {
 4345       __ movdl($dst$$XMMRegister, $src$$Register);
 4346       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4347     } else {
 4348       __ movdl($dst$$XMMRegister, $src$$Register);
 4349       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4350     }
 4351   %}
 4352   ins_pipe( pipe_slow );
 4353 %}
 4354 
 4355 instruct ReplI_mem(vec dst, memory mem) %{
 4356   match(Set dst (ReplicateI (LoadI mem)));
 4357   format %{ "replicateI $dst,$mem" %}
 4358   ins_encode %{
 4359     int vlen_enc = vector_length_encoding(this);
 4360     if (VM_Version::supports_avx2()) {
 4361       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4362     } else if (VM_Version::supports_avx()) {
 4363       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4364     } else {
 4365       __ movdl($dst$$XMMRegister, $mem$$Address);
 4366       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4367     }
 4368   %}
 4369   ins_pipe( pipe_slow );
 4370 %}
 4371 
 4372 instruct ReplI_imm(vec dst, immI con) %{
 4373   match(Set dst (ReplicateB con));
 4374   match(Set dst (ReplicateS con));
 4375   match(Set dst (ReplicateI con));
 4376   format %{ "replicateI $dst,$con" %}
 4377   ins_encode %{
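          // Emit the replicated immediate into the constant table, then load it into the vector register.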
 4378     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4379         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4380             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4381                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4382     BasicType bt = Matcher::vector_element_basic_type(this);
 4383     int vlen = Matcher::vector_length_in_bytes(this);
 4384     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4385   %}
 4386   ins_pipe( pipe_slow );
 4387 %}
 4388 
 4389 // Replicate scalar zero into a vector
 4390 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4391   match(Set dst (ReplicateB zero));
 4392   match(Set dst (ReplicateS zero));
 4393   match(Set dst (ReplicateI zero));
 4394   format %{ "replicateI $dst,$zero" %}
 4395   ins_encode %{
 4396     int vlen_enc = vector_length_encoding(this);
 4397     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4398       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4399     } else {
 4400       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4401     }
 4402   %}
 4403   ins_pipe( fpu_reg_reg );
 4404 %}
 4405 
 4406 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4407   predicate(UseSSE >= 2);
 4408   match(Set dst (ReplicateB con));
 4409   match(Set dst (ReplicateS con));
 4410   match(Set dst (ReplicateI con));
 4411   format %{ "vallones $dst" %}
 4412   ins_encode %{
 4413     int vector_len = vector_length_encoding(this);
 4414     __ vallones($dst$$XMMRegister, vector_len);
 4415   %}
 4416   ins_pipe( pipe_slow );
 4417 %}
 4418 
 4419 // ====================ReplicateL=======================================
 4420 
 4421 #ifdef _LP64
 4422 // Replicate a long (8-byte) scalar into a vector
 4423 instruct ReplL_reg(vec dst, rRegL src) %{
 4424   match(Set dst (ReplicateL src));
 4425   format %{ "replicateL $dst,$src" %}
 4426   ins_encode %{
 4427     int vlen = Matcher::vector_length(this);
 4428     int vlen_enc = vector_length_encoding(this);
 4429     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4430       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4431     } else if (VM_Version::supports_avx2()) {
 4432       __ movdq($dst$$XMMRegister, $src$$Register);
 4433       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4434     } else {
 4435       __ movdq($dst$$XMMRegister, $src$$Register);
 4436       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4437     }
 4438   %}
 4439   ins_pipe( pipe_slow );
 4440 %}
 4441 #else // _LP64
 4442 // Replicate a long (8-byte) scalar into a vector
 4443 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4444   predicate(Matcher::vector_length(n) <= 4);
 4445   match(Set dst (ReplicateL src));
 4446   effect(TEMP dst, USE src, TEMP tmp);
 4447   format %{ "replicateL $dst,$src" %}
 4448   ins_encode %{
 4449     uint vlen = Matcher::vector_length(this);
 4450     if (vlen == 2) {
 4451       __ movdl($dst$$XMMRegister, $src$$Register);
 4452       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4453       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4454       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4455     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4456       int vlen_enc = Assembler::AVX_256bit;
 4457       __ movdl($dst$$XMMRegister, $src$$Register);
 4458       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4459       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4460       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4461     } else {
 4462       __ movdl($dst$$XMMRegister, $src$$Register);
 4463       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4464       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4465       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4466       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4467     }
 4468   %}
 4469   ins_pipe( pipe_slow );
 4470 %}
 4471 
 4472 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4473   predicate(Matcher::vector_length(n) == 8);
 4474   match(Set dst (ReplicateL src));
 4475   effect(TEMP dst, USE src, TEMP tmp);
 4476   format %{ "replicateL $dst,$src" %}
 4477   ins_encode %{
 4478     if (VM_Version::supports_avx512vl()) {
 4479       __ movdl($dst$$XMMRegister, $src$$Register);
 4480       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4481       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4482       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4483       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4484       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4485     } else {
 4486       int vlen_enc = Assembler::AVX_512bit;
 4487       __ movdl($dst$$XMMRegister, $src$$Register);
 4488       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4489       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4490       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4491     }
 4492   %}
 4493   ins_pipe( pipe_slow );
 4494 %}
 4495 #endif // _LP64
 4496 
 4497 instruct ReplL_mem(vec dst, memory mem) %{
 4498   match(Set dst (ReplicateL (LoadL mem)));
 4499   format %{ "replicateL $dst,$mem" %}
 4500   ins_encode %{
 4501     int vlen_enc = vector_length_encoding(this);
 4502     if (VM_Version::supports_avx2()) {
 4503       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4504     } else if (VM_Version::supports_sse3()) {
 4505       __ movddup($dst$$XMMRegister, $mem$$Address);
 4506     } else {
 4507       __ movq($dst$$XMMRegister, $mem$$Address);
 4508       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4509     }
 4510   %}
 4511   ins_pipe( pipe_slow );
 4512 %}
 4513 
 4514 // Replicate a long (8-byte) scalar immediate into a vector by loading it from the constant table.
 4515 instruct ReplL_imm(vec dst, immL con) %{
 4516   match(Set dst (ReplicateL con));
 4517   format %{ "replicateL $dst,$con" %}
 4518   ins_encode %{
 4519     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4520     int vlen = Matcher::vector_length_in_bytes(this);
 4521     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4522   %}
 4523   ins_pipe( pipe_slow );
 4524 %}
 4525 
 4526 instruct ReplL_zero(vec dst, immL0 zero) %{
 4527   match(Set dst (ReplicateL zero));
 4528   format %{ "replicateL $dst,$zero" %}
 4529   ins_encode %{
 4530     int vlen_enc = vector_length_encoding(this);
 4531     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4532       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4533     } else {
 4534       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4535     }
 4536   %}
 4537   ins_pipe( fpu_reg_reg );
 4538 %}
 4539 
 4540 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4541   predicate(UseSSE >= 2);
 4542   match(Set dst (ReplicateL con));
 4543   format %{ "vallones $dst" %}
 4544   ins_encode %{
 4545     int vector_len = vector_length_encoding(this);
 4546     __ vallones($dst$$XMMRegister, vector_len);
 4547   %}
 4548   ins_pipe( pipe_slow );
 4549 %}
 4550 
 4551 // ====================ReplicateF=======================================
 4552 
 4553 instruct vReplF_reg(vec dst, vlRegF src) %{
 4554   predicate(UseAVX > 0);
 4555   match(Set dst (ReplicateF src));
 4556   format %{ "replicateF $dst,$src" %}
 4557   ins_encode %{
 4558     uint vlen = Matcher::vector_length(this);
 4559     int vlen_enc = vector_length_encoding(this);
 4560     if (vlen <= 4) {
 4561       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4562     } else if (VM_Version::supports_avx2()) {
 4563       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4564     } else {
 4565       assert(vlen == 8, "sanity");
 4566       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4567       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4568     }
 4569   %}
 4570   ins_pipe( pipe_slow );
 4571 %}
 4572 
 4573 instruct ReplF_reg(vec dst, vlRegF src) %{
 4574   predicate(UseAVX == 0);
 4575   match(Set dst (ReplicateF src));
 4576   format %{ "replicateF $dst,$src" %}
 4577   ins_encode %{
 4578     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4579   %}
 4580   ins_pipe( pipe_slow );
 4581 %}
 4582 
 4583 instruct ReplF_mem(vec dst, memory mem) %{
 4584   predicate(UseAVX > 0);
 4585   match(Set dst (ReplicateF (LoadF mem)));
 4586   format %{ "replicateF $dst,$mem" %}
 4587   ins_encode %{
 4588     int vlen_enc = vector_length_encoding(this);
 4589     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4590   %}
 4591   ins_pipe( pipe_slow );
 4592 %}
 4593 
 4594 // Replicate a float scalar immediate into a vector by loading it from the constant table.
 4595 instruct ReplF_imm(vec dst, immF con) %{
 4596   match(Set dst (ReplicateF con));
 4597   format %{ "replicateF $dst,$con" %}
 4598   ins_encode %{
 4599     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4600         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4601     int vlen = Matcher::vector_length_in_bytes(this);
 4602     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4603   %}
 4604   ins_pipe( pipe_slow );
 4605 %}
 4606 
 4607 instruct ReplF_zero(vec dst, immF0 zero) %{
 4608   match(Set dst (ReplicateF zero));
 4609   format %{ "replicateF $dst,$zero" %}
 4610   ins_encode %{
 4611     int vlen_enc = vector_length_encoding(this);
 4612     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4613       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4614     } else {
 4615       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4616     }
 4617   %}
 4618   ins_pipe( fpu_reg_reg );
 4619 %}
 4620 
 4621 // ====================ReplicateD=======================================
 4622 
 4623 // Replicate a double (8-byte) scalar into a vector
 4624 instruct vReplD_reg(vec dst, vlRegD src) %{
 4625   predicate(UseSSE >= 3);
 4626   match(Set dst (ReplicateD src));
 4627   format %{ "replicateD $dst,$src" %}
 4628   ins_encode %{
 4629     uint vlen = Matcher::vector_length(this);
 4630     int vlen_enc = vector_length_encoding(this);
 4631     if (vlen <= 2) {
 4632       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4633     } else if (VM_Version::supports_avx2()) {
 4634       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4635     } else {
 4636       assert(vlen == 4, "sanity");
 4637       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4638       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4639     }
 4640   %}
 4641   ins_pipe( pipe_slow );
 4642 %}
 4643 
 4644 instruct ReplD_reg(vec dst, vlRegD src) %{
 4645   predicate(UseSSE < 3);
 4646   match(Set dst (ReplicateD src));
 4647   format %{ "replicateD $dst,$src" %}
 4648   ins_encode %{
 4649     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4650   %}
 4651   ins_pipe( pipe_slow );
 4652 %}
 4653 
 4654 instruct ReplD_mem(vec dst, memory mem) %{
 4655   predicate(UseSSE >= 3);
 4656   match(Set dst (ReplicateD (LoadD mem)));
 4657   format %{ "replicateD $dst,$mem" %}
 4658   ins_encode %{
 4659     if (Matcher::vector_length(this) >= 4) {
 4660       int vlen_enc = vector_length_encoding(this);
 4661       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4662     } else {
 4663       __ movddup($dst$$XMMRegister, $mem$$Address);
 4664     }
 4665   %}
 4666   ins_pipe( pipe_slow );
 4667 %}
 4668 
 4669 // Replicate a double (8-byte) scalar immediate into a vector by loading it from the constant table.
 4670 instruct ReplD_imm(vec dst, immD con) %{
 4671   match(Set dst (ReplicateD con));
 4672   format %{ "replicateD $dst,$con" %}
 4673   ins_encode %{
 4674     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4675     int vlen = Matcher::vector_length_in_bytes(this);
 4676     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4677   %}
 4678   ins_pipe( pipe_slow );
 4679 %}
 4680 
 4681 instruct ReplD_zero(vec dst, immD0 zero) %{
 4682   match(Set dst (ReplicateD zero));
 4683   format %{ "replicateD $dst,$zero" %}
 4684   ins_encode %{
 4685     int vlen_enc = vector_length_encoding(this);
 4686     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4687       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4688     } else {
 4689       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4690     }
 4691   %}
 4692   ins_pipe( fpu_reg_reg );
 4693 %}
 4694 
 4695 // ====================VECTOR INSERT=======================================
 4696 
 4697 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4698   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4699   match(Set dst (VectorInsert (Binary dst val) idx));
 4700   format %{ "vector_insert $dst,$val,$idx" %}
 4701   ins_encode %{
 4702     assert(UseSSE >= 4, "required");
 4703     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4704 
 4705     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4706 
 4707     assert(is_integral_type(elem_bt), "");
 4708     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4709 
 4710     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4711   %}
 4712   ins_pipe( pipe_slow );
 4713 %}
 4714 
 4715 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4716   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4717   match(Set dst (VectorInsert (Binary src val) idx));
 4718   effect(TEMP vtmp);
 4719   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4720   ins_encode %{
 4721     int vlen_enc = Assembler::AVX_256bit;
 4722     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4723     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4724     int log2epr = log2(elem_per_lane);
 4725 
 4726     assert(is_integral_type(elem_bt), "sanity");
 4727     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4728 
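          // Split the element index into a 128-bit lane index (y_idx) and the position within that lane (x_idx).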
 4729     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4730     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4731     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4732     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4733     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4734   %}
 4735   ins_pipe( pipe_slow );
 4736 %}
 4737 
 4738 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4739   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4740   match(Set dst (VectorInsert (Binary src val) idx));
 4741   effect(TEMP vtmp);
 4742   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4743   ins_encode %{
 4744     assert(UseAVX > 2, "sanity");
 4745 
 4746     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4747     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4748     int log2epr = log2(elem_per_lane);
 4749 
 4750     assert(is_integral_type(elem_bt), "");
 4751     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4752 
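          // y_idx selects one of the four 128-bit lanes; x_idx is the element position within that lane.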
 4753     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4754     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4755     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4756     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4757     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4758   %}
 4759   ins_pipe( pipe_slow );
 4760 %}
 4761 
 4762 #ifdef _LP64
 4763 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4764   predicate(Matcher::vector_length(n) == 2);
 4765   match(Set dst (VectorInsert (Binary dst val) idx));
 4766   format %{ "vector_insert $dst,$val,$idx" %}
 4767   ins_encode %{
 4768     assert(UseSSE >= 4, "required");
 4769     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4770     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4771 
 4772     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4773   %}
 4774   ins_pipe( pipe_slow );
 4775 %}
 4776 
 4777 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4778   predicate(Matcher::vector_length(n) == 4);
 4779   match(Set dst (VectorInsert (Binary src val) idx));
 4780   effect(TEMP vtmp);
 4781   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4782   ins_encode %{
 4783     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4784     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4785 
 4786     uint x_idx = $idx$$constant & right_n_bits(1);
 4787     uint y_idx = ($idx$$constant >> 1) & 1;
 4788     int vlen_enc = Assembler::AVX_256bit;
 4789     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4790     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4791     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4792   %}
 4793   ins_pipe( pipe_slow );
 4794 %}
 4795 
 4796 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4797   predicate(Matcher::vector_length(n) == 8);
 4798   match(Set dst (VectorInsert (Binary src val) idx));
 4799   effect(TEMP vtmp);
 4800   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4801   ins_encode %{
 4802     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4803     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4804 
 4805     uint x_idx = $idx$$constant & right_n_bits(1);
 4806     uint y_idx = ($idx$$constant >> 1) & 3;
 4807     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4808     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4809     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4810   %}
 4811   ins_pipe( pipe_slow );
 4812 %}
 4813 #endif
 4814 
 4815 instruct insertF(vec dst, regF val, immU8 idx) %{
 4816   predicate(Matcher::vector_length(n) < 8);
 4817   match(Set dst (VectorInsert (Binary dst val) idx));
 4818   format %{ "vector_insert $dst,$val,$idx" %}
 4819   ins_encode %{
 4820     assert(UseSSE >= 4, "sanity");
 4821 
 4822     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4823     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4824 
 4825     uint x_idx = $idx$$constant & right_n_bits(2);
 4826     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4827   %}
 4828   ins_pipe( pipe_slow );
 4829 %}
 4830 
 4831 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4832   predicate(Matcher::vector_length(n) >= 8);
 4833   match(Set dst (VectorInsert (Binary src val) idx));
 4834   effect(TEMP vtmp);
 4835   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4836   ins_encode %{
 4837     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4838     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4839 
 4840     int vlen = Matcher::vector_length(this);
 4841     uint x_idx = $idx$$constant & right_n_bits(2);
 4842     if (vlen == 8) {
 4843       uint y_idx = ($idx$$constant >> 2) & 1;
 4844       int vlen_enc = Assembler::AVX_256bit;
 4845       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4846       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4847       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4848     } else {
 4849       assert(vlen == 16, "sanity");
 4850       uint y_idx = ($idx$$constant >> 2) & 3;
 4851       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4852       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4853       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4854     }
 4855   %}
 4856   ins_pipe( pipe_slow );
 4857 %}
 4858 
 4859 #ifdef _LP64
 4860 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4861   predicate(Matcher::vector_length(n) == 2);
 4862   match(Set dst (VectorInsert (Binary dst val) idx));
 4863   effect(TEMP tmp);
 4864   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4865   ins_encode %{
 4866     assert(UseSSE >= 4, "sanity");
 4867     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4868     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4869 
 4870     __ movq($tmp$$Register, $val$$XMMRegister);
 4871     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4872   %}
 4873   ins_pipe( pipe_slow );
 4874 %}
 4875 
 4876 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4877   predicate(Matcher::vector_length(n) == 4);
 4878   match(Set dst (VectorInsert (Binary src val) idx));
 4879   effect(TEMP vtmp, TEMP tmp);
 4880   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4881   ins_encode %{
 4882     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4883     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4884 
 4885     uint x_idx = $idx$$constant & right_n_bits(1);
 4886     uint y_idx = ($idx$$constant >> 1) & 1;
 4887     int vlen_enc = Assembler::AVX_256bit;
 4888     __ movq($tmp$$Register, $val$$XMMRegister);
 4889     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4890     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4891     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4892   %}
 4893   ins_pipe( pipe_slow );
 4894 %}
 4895 
 4896 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4897   predicate(Matcher::vector_length(n) == 8);
 4898   match(Set dst (VectorInsert (Binary src val) idx));
 4899   effect(TEMP tmp, TEMP vtmp);
 4900   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4901   ins_encode %{
 4902     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4903     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4904 
 4905     uint x_idx = $idx$$constant & right_n_bits(1);
 4906     uint y_idx = ($idx$$constant >> 1) & 3;
 4907     __ movq($tmp$$Register, $val$$XMMRegister);
 4908     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4909     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4910     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4911   %}
 4912   ins_pipe( pipe_slow );
 4913 %}
 4914 #endif
 4915 
 4916 // ====================REDUCTION ARITHMETIC=======================================
 4917 
 4918 // =======================Int Reduction==========================================
 4919 
 4920 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4921   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4922   match(Set dst (AddReductionVI src1 src2));
 4923   match(Set dst (MulReductionVI src1 src2));
 4924   match(Set dst (AndReductionV  src1 src2));
 4925   match(Set dst ( OrReductionV  src1 src2));
 4926   match(Set dst (XorReductionV  src1 src2));
 4927   match(Set dst (MinReductionV  src1 src2));
 4928   match(Set dst (MaxReductionV  src1 src2));
 4929   effect(TEMP vtmp1, TEMP vtmp2);
 4930   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4931   ins_encode %{
 4932     int opcode = this->ideal_Opcode();
 4933     int vlen = Matcher::vector_length(this, $src2);
 4934     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4935   %}
 4936   ins_pipe( pipe_slow );
 4937 %}
 4938 
 4939 // =======================Long Reduction==========================================
 4940 
 4941 #ifdef _LP64
 4942 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4943   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4944   match(Set dst (AddReductionVL src1 src2));
 4945   match(Set dst (MulReductionVL src1 src2));
 4946   match(Set dst (AndReductionV  src1 src2));
 4947   match(Set dst ( OrReductionV  src1 src2));
 4948   match(Set dst (XorReductionV  src1 src2));
 4949   match(Set dst (MinReductionV  src1 src2));
 4950   match(Set dst (MaxReductionV  src1 src2));
 4951   effect(TEMP vtmp1, TEMP vtmp2);
 4952   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4953   ins_encode %{
 4954     int opcode = this->ideal_Opcode();
 4955     int vlen = Matcher::vector_length(this, $src2);
 4956     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4957   %}
 4958   ins_pipe( pipe_slow );
 4959 %}
 4960 
 4961 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4962   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4963   match(Set dst (AddReductionVL src1 src2));
 4964   match(Set dst (MulReductionVL src1 src2));
 4965   match(Set dst (AndReductionV  src1 src2));
 4966   match(Set dst ( OrReductionV  src1 src2));
 4967   match(Set dst (XorReductionV  src1 src2));
 4968   match(Set dst (MinReductionV  src1 src2));
 4969   match(Set dst (MaxReductionV  src1 src2));
 4970   effect(TEMP vtmp1, TEMP vtmp2);
 4971   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4972   ins_encode %{
 4973     int opcode = this->ideal_Opcode();
 4974     int vlen = Matcher::vector_length(this, $src2);
 4975     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4976   %}
 4977   ins_pipe( pipe_slow );
 4978 %}
 4979 #endif // _LP64
 4980 
 4981 // =======================Float Reduction==========================================
 4982 
 4983 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4984   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4985   match(Set dst (AddReductionVF dst src));
 4986   match(Set dst (MulReductionVF dst src));
 4987   effect(TEMP dst, TEMP vtmp);
 4988   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4989   ins_encode %{
 4990     int opcode = this->ideal_Opcode();
 4991     int vlen = Matcher::vector_length(this, $src);
 4992     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4993   %}
 4994   ins_pipe( pipe_slow );
 4995 %}
 4996 
 4997 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4998   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4999   match(Set dst (AddReductionVF dst src));
 5000   match(Set dst (MulReductionVF dst src));
 5001   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5002   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5003   ins_encode %{
 5004     int opcode = this->ideal_Opcode();
 5005     int vlen = Matcher::vector_length(this, $src);
 5006     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5007   %}
 5008   ins_pipe( pipe_slow );
 5009 %}
 5010 
 5011 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5012   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 5013   match(Set dst (AddReductionVF dst src));
 5014   match(Set dst (MulReductionVF dst src));
 5015   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5016   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5017   ins_encode %{
 5018     int opcode = this->ideal_Opcode();
 5019     int vlen = Matcher::vector_length(this, $src);
 5020     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5021   %}
 5022   ins_pipe( pipe_slow );
 5023 %}
 5024 
 5025 // =======================Double Reduction==========================================
 5026 
 5027 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5028   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 5029   match(Set dst (AddReductionVD dst src));
 5030   match(Set dst (MulReductionVD dst src));
 5031   effect(TEMP dst, TEMP vtmp);
 5032   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5033   ins_encode %{
 5034     int opcode = this->ideal_Opcode();
 5035     int vlen = Matcher::vector_length(this, $src);
 5036     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5037   %}
 5038   ins_pipe( pipe_slow );
 5039 %}
 5040 
 5041 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5042   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 5043   match(Set dst (AddReductionVD dst src));
 5044   match(Set dst (MulReductionVD dst src));
 5045   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5046   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5047   ins_encode %{
 5048     int opcode = this->ideal_Opcode();
 5049     int vlen = Matcher::vector_length(this, $src);
 5050     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5051   %}
 5052   ins_pipe( pipe_slow );
 5053 %}
 5054 
 5055 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5056   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 5057   match(Set dst (AddReductionVD dst src));
 5058   match(Set dst (MulReductionVD dst src));
 5059   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5060   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5061   ins_encode %{
 5062     int opcode = this->ideal_Opcode();
 5063     int vlen = Matcher::vector_length(this, $src);
 5064     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5065   %}
 5066   ins_pipe( pipe_slow );
 5067 %}
 5068 
 5069 // =======================Byte Reduction==========================================
 5070 
 5071 #ifdef _LP64
 5072 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5073   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5074   match(Set dst (AddReductionVI src1 src2));
 5075   match(Set dst (AndReductionV  src1 src2));
 5076   match(Set dst ( OrReductionV  src1 src2));
 5077   match(Set dst (XorReductionV  src1 src2));
 5078   match(Set dst (MinReductionV  src1 src2));
 5079   match(Set dst (MaxReductionV  src1 src2));
 5080   effect(TEMP vtmp1, TEMP vtmp2);
 5081   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5082   ins_encode %{
 5083     int opcode = this->ideal_Opcode();
 5084     int vlen = Matcher::vector_length(this, $src2);
 5085     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5086   %}
 5087   ins_pipe( pipe_slow );
 5088 %}
 5089 
 5090 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5091   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5092   match(Set dst (AddReductionVI src1 src2));
 5093   match(Set dst (AndReductionV  src1 src2));
 5094   match(Set dst ( OrReductionV  src1 src2));
 5095   match(Set dst (XorReductionV  src1 src2));
 5096   match(Set dst (MinReductionV  src1 src2));
 5097   match(Set dst (MaxReductionV  src1 src2));
 5098   effect(TEMP vtmp1, TEMP vtmp2);
 5099   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5100   ins_encode %{
 5101     int opcode = this->ideal_Opcode();
 5102     int vlen = Matcher::vector_length(this, $src2);
 5103     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5104   %}
 5105   ins_pipe( pipe_slow );
 5106 %}
 5107 #endif
 5108 
 5109 // =======================Short Reduction==========================================
 5110 
 5111 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5112   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5113   match(Set dst (AddReductionVI src1 src2));
 5114   match(Set dst (MulReductionVI src1 src2));
 5115   match(Set dst (AndReductionV  src1 src2));
 5116   match(Set dst ( OrReductionV  src1 src2));
 5117   match(Set dst (XorReductionV  src1 src2));
 5118   match(Set dst (MinReductionV  src1 src2));
 5119   match(Set dst (MaxReductionV  src1 src2));
 5120   effect(TEMP vtmp1, TEMP vtmp2);
 5121   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5122   ins_encode %{
 5123     int opcode = this->ideal_Opcode();
 5124     int vlen = Matcher::vector_length(this, $src2);
 5125     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5126   %}
 5127   ins_pipe( pipe_slow );
 5128 %}
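
      // Note (informal sketch of the node semantics, for reference): a reduction
      // such as (AddReductionVI src1 src2) folds the scalar accumulator src1 into
      // every lane of the vector src2, i.e. roughly
      //   dst = src1 OP src2[0] OP src2[1] OP ... OP src2[vlen-1]
      // which is what the reduceB/reduceS macro-assembler calls above compute.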
 5129 
 5130 // =======================Mul Reduction==========================================
 5131 
 5132 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5133   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5134             Matcher::vector_length(n->in(2)) <= 32); // src2
 5135   match(Set dst (MulReductionVI src1 src2));
 5136   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5137   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5138   ins_encode %{
 5139     int opcode = this->ideal_Opcode();
 5140     int vlen = Matcher::vector_length(this, $src2);
 5141     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5142   %}
 5143   ins_pipe( pipe_slow );
 5144 %}
 5145 
 5146 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5147   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5148             Matcher::vector_length(n->in(2)) == 64); // src2
 5149   match(Set dst (MulReductionVI src1 src2));
 5150   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5151   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5152   ins_encode %{
 5153     int opcode = this->ideal_Opcode();
 5154     int vlen = Matcher::vector_length(this, $src2);
 5155     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5156   %}
 5157   ins_pipe( pipe_slow );
 5158 %}
 5159 
 5160 //--------------------Min/Max Float Reduction --------------------
 5161 // Float Min/Max Reduction
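      // Two flavors follow (informal summary): the patterns taking an immF/immD
      // src1 require the scalar input to already be the identity value (+Inf for
      // a Min reduction, -Inf for a Max reduction, see the bottom_type() checks
      // in their predicates), so only the vector lanes need reducing and
      // reduceFloatMinMax/reduceDoubleMinMax are called with their third argument
      // false; the "_av" patterns fold $dst in as well and pass true instead.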
 5162 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5163                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5164   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5165             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5166              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5167             Matcher::vector_length(n->in(2)) == 2);
 5168   match(Set dst (MinReductionV src1 src2));
 5169   match(Set dst (MaxReductionV src1 src2));
 5170   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5171   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5172   ins_encode %{
 5173     assert(UseAVX > 0, "sanity");
 5174 
 5175     int opcode = this->ideal_Opcode();
 5176     int vlen = Matcher::vector_length(this, $src2);
 5177     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5178                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5179   %}
 5180   ins_pipe( pipe_slow );
 5181 %}
 5182 
 5183 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5184                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5185   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5186             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5187              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5188             Matcher::vector_length(n->in(2)) >= 4);
 5189   match(Set dst (MinReductionV src1 src2));
 5190   match(Set dst (MaxReductionV src1 src2));
 5191   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5192   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5193   ins_encode %{
 5194     assert(UseAVX > 0, "sanity");
 5195 
 5196     int opcode = this->ideal_Opcode();
 5197     int vlen = Matcher::vector_length(this, $src2);
 5198     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5199                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5200   %}
 5201   ins_pipe( pipe_slow );
 5202 %}
 5203 
 5204 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5205                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5206   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5207             Matcher::vector_length(n->in(2)) == 2);
 5208   match(Set dst (MinReductionV dst src));
 5209   match(Set dst (MaxReductionV dst src));
 5210   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5211   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5212   ins_encode %{
 5213     assert(UseAVX > 0, "sanity");
 5214 
 5215     int opcode = this->ideal_Opcode();
 5216     int vlen = Matcher::vector_length(this, $src);
 5217     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5218                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5219   %}
 5220   ins_pipe( pipe_slow );
 5221 %}
 5222 
 5223 
 5224 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5225                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5226   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5227             Matcher::vector_length(n->in(2)) >= 4);
 5228   match(Set dst (MinReductionV dst src));
 5229   match(Set dst (MaxReductionV dst src));
 5230   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5231   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5232   ins_encode %{
 5233     assert(UseAVX > 0, "sanity");
 5234 
 5235     int opcode = this->ideal_Opcode();
 5236     int vlen = Matcher::vector_length(this, $src);
 5237     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5238                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 
 5244 //--------------------Min/Max Double Reduction --------------------
 5245 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5246                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5247                             rFlagsReg cr) %{
 5248   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5249             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5250              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5251             Matcher::vector_length(n->in(2)) == 2);
 5252   match(Set dst (MinReductionV src1 src2));
 5253   match(Set dst (MaxReductionV src1 src2));
 5254   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5255   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5256   ins_encode %{
 5257     assert(UseAVX > 0, "sanity");
 5258 
 5259     int opcode = this->ideal_Opcode();
 5260     int vlen = Matcher::vector_length(this, $src2);
 5261     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5262                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5263   %}
 5264   ins_pipe( pipe_slow );
 5265 %}
 5266 
 5267 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5268                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5269                            rFlagsReg cr) %{
 5270   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5271             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5272              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5273             Matcher::vector_length(n->in(2)) >= 4);
 5274   match(Set dst (MinReductionV src1 src2));
 5275   match(Set dst (MaxReductionV src1 src2));
 5276   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5277   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5278   ins_encode %{
 5279     assert(UseAVX > 0, "sanity");
 5280 
 5281     int opcode = this->ideal_Opcode();
 5282     int vlen = Matcher::vector_length(this, $src2);
 5283     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5284                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5285   %}
 5286   ins_pipe( pipe_slow );
 5287 %}
 5288 
 5289 
 5290 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5291                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5292                                rFlagsReg cr) %{
 5293   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5294             Matcher::vector_length(n->in(2)) == 2);
 5295   match(Set dst (MinReductionV dst src));
 5296   match(Set dst (MaxReductionV dst src));
 5297   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5298   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5299   ins_encode %{
 5300     assert(UseAVX > 0, "sanity");
 5301 
 5302     int opcode = this->ideal_Opcode();
 5303     int vlen = Matcher::vector_length(this, $src);
 5304     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5305                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5306   %}
 5307   ins_pipe( pipe_slow );
 5308 %}
 5309 
 5310 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5311                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5312                               rFlagsReg cr) %{
 5313   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5314             Matcher::vector_length(n->in(2)) >= 4);
 5315   match(Set dst (MinReductionV dst src));
 5316   match(Set dst (MaxReductionV dst src));
 5317   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5318   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5319   ins_encode %{
 5320     assert(UseAVX > 0, "sanity");
 5321 
 5322     int opcode = this->ideal_Opcode();
 5323     int vlen = Matcher::vector_length(this, $src);
 5324     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5325                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5326   %}
 5327   ins_pipe( pipe_slow );
 5328 %}
 5329 
 5330 // ====================VECTOR ARITHMETIC=======================================
 5331 
 5332 // --------------------------------- ADD --------------------------------------
 5333 
 5334 // Bytes vector add
 5335 instruct vaddB(vec dst, vec src) %{
 5336   predicate(UseAVX == 0);
 5337   match(Set dst (AddVB dst src));
 5338   format %{ "paddb   $dst,$src\t! add packedB" %}
 5339   ins_encode %{
 5340     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5341   %}
 5342   ins_pipe( pipe_slow );
 5343 %}
 5344 
 5345 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5346   predicate(UseAVX > 0);
 5347   match(Set dst (AddVB src1 src2));
 5348   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5349   ins_encode %{
 5350     int vlen_enc = vector_length_encoding(this);
 5351     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5352   %}
 5353   ins_pipe( pipe_slow );
 5354 %}
 5355 
 5356 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5357   predicate((UseAVX > 0) &&
 5358             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5359   match(Set dst (AddVB src (LoadVector mem)));
 5360   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5361   ins_encode %{
 5362     int vlen_enc = vector_length_encoding(this);
 5363     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5364   %}
 5365   ins_pipe( pipe_slow );
 5366 %}
 5367 
 5368 // Shorts/Chars vector add
 5369 instruct vaddS(vec dst, vec src) %{
 5370   predicate(UseAVX == 0);
 5371   match(Set dst (AddVS dst src));
 5372   format %{ "paddw   $dst,$src\t! add packedS" %}
 5373   ins_encode %{
 5374     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5375   %}
 5376   ins_pipe( pipe_slow );
 5377 %}
 5378 
 5379 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5380   predicate(UseAVX > 0);
 5381   match(Set dst (AddVS src1 src2));
 5382   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5383   ins_encode %{
 5384     int vlen_enc = vector_length_encoding(this);
 5385     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5386   %}
 5387   ins_pipe( pipe_slow );
 5388 %}
 5389 
 5390 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5391   predicate((UseAVX > 0) &&
 5392             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5393   match(Set dst (AddVS src (LoadVector mem)));
 5394   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5395   ins_encode %{
 5396     int vlen_enc = vector_length_encoding(this);
 5397     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5398   %}
 5399   ins_pipe( pipe_slow );
 5400 %}
 5401 
 5402 // Integers vector add
 5403 instruct vaddI(vec dst, vec src) %{
 5404   predicate(UseAVX == 0);
 5405   match(Set dst (AddVI dst src));
 5406   format %{ "paddd   $dst,$src\t! add packedI" %}
 5407   ins_encode %{
 5408     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5409   %}
 5410   ins_pipe( pipe_slow );
 5411 %}
 5412 
 5413 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5414   predicate(UseAVX > 0);
 5415   match(Set dst (AddVI src1 src2));
 5416   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5417   ins_encode %{
 5418     int vlen_enc = vector_length_encoding(this);
 5419     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5420   %}
 5421   ins_pipe( pipe_slow );
 5422 %}
 5423 
 5424 
 5425 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5426   predicate((UseAVX > 0) &&
 5427             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5428   match(Set dst (AddVI src (LoadVector mem)));
 5429   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5430   ins_encode %{
 5431     int vlen_enc = vector_length_encoding(this);
 5432     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5433   %}
 5434   ins_pipe( pipe_slow );
 5435 %}
 5436 
 5437 // Longs vector add
 5438 instruct vaddL(vec dst, vec src) %{
 5439   predicate(UseAVX == 0);
 5440   match(Set dst (AddVL dst src));
 5441   format %{ "paddq   $dst,$src\t! add packedL" %}
 5442   ins_encode %{
 5443     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5444   %}
 5445   ins_pipe( pipe_slow );
 5446 %}
 5447 
 5448 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5449   predicate(UseAVX > 0);
 5450   match(Set dst (AddVL src1 src2));
 5451   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5452   ins_encode %{
 5453     int vlen_enc = vector_length_encoding(this);
 5454     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5455   %}
 5456   ins_pipe( pipe_slow );
 5457 %}
 5458 
 5459 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5460   predicate((UseAVX > 0) &&
 5461             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5462   match(Set dst (AddVL src (LoadVector mem)));
 5463   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5464   ins_encode %{
 5465     int vlen_enc = vector_length_encoding(this);
 5466     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5467   %}
 5468   ins_pipe( pipe_slow );
 5469 %}
 5470 
 5471 // Floats vector add
 5472 instruct vaddF(vec dst, vec src) %{
 5473   predicate(UseAVX == 0);
 5474   match(Set dst (AddVF dst src));
 5475   format %{ "addps   $dst,$src\t! add packedF" %}
 5476   ins_encode %{
 5477     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5478   %}
 5479   ins_pipe( pipe_slow );
 5480 %}
 5481 
 5482 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5483   predicate(UseAVX > 0);
 5484   match(Set dst (AddVF src1 src2));
 5485   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5486   ins_encode %{
 5487     int vlen_enc = vector_length_encoding(this);
 5488     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5489   %}
 5490   ins_pipe( pipe_slow );
 5491 %}
 5492 
 5493 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5494   predicate((UseAVX > 0) &&
 5495             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5496   match(Set dst (AddVF src (LoadVector mem)));
 5497   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5498   ins_encode %{
 5499     int vlen_enc = vector_length_encoding(this);
 5500     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5501   %}
 5502   ins_pipe( pipe_slow );
 5503 %}
 5504 
 5505 // Doubles vector add
 5506 instruct vaddD(vec dst, vec src) %{
 5507   predicate(UseAVX == 0);
 5508   match(Set dst (AddVD dst src));
 5509   format %{ "addpd   $dst,$src\t! add packedD" %}
 5510   ins_encode %{
 5511     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5512   %}
 5513   ins_pipe( pipe_slow );
 5514 %}
 5515 
 5516 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5517   predicate(UseAVX > 0);
 5518   match(Set dst (AddVD src1 src2));
 5519   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5520   ins_encode %{
 5521     int vlen_enc = vector_length_encoding(this);
 5522     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5523   %}
 5524   ins_pipe( pipe_slow );
 5525 %}
 5526 
 5527 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5528   predicate((UseAVX > 0) &&
 5529             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5530   match(Set dst (AddVD src (LoadVector mem)));
 5531   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5532   ins_encode %{
 5533     int vlen_enc = vector_length_encoding(this);
 5534     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5535   %}
 5536   ins_pipe( pipe_slow );
 5537 %}
 5538 
 5539 // --------------------------------- SUB --------------------------------------
 5540 
 5541 // Bytes vector sub
 5542 instruct vsubB(vec dst, vec src) %{
 5543   predicate(UseAVX == 0);
 5544   match(Set dst (SubVB dst src));
 5545   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5546   ins_encode %{
 5547     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5548   %}
 5549   ins_pipe( pipe_slow );
 5550 %}
 5551 
 5552 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5553   predicate(UseAVX > 0);
 5554   match(Set dst (SubVB src1 src2));
 5555   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5556   ins_encode %{
 5557     int vlen_enc = vector_length_encoding(this);
 5558     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5559   %}
 5560   ins_pipe( pipe_slow );
 5561 %}
 5562 
 5563 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5564   predicate((UseAVX > 0) &&
 5565             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5566   match(Set dst (SubVB src (LoadVector mem)));
 5567   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5568   ins_encode %{
 5569     int vlen_enc = vector_length_encoding(this);
 5570     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5571   %}
 5572   ins_pipe( pipe_slow );
 5573 %}
 5574 
 5575 // Shorts/Chars vector sub
 5576 instruct vsubS(vec dst, vec src) %{
 5577   predicate(UseAVX == 0);
 5578   match(Set dst (SubVS dst src));
 5579   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5580   ins_encode %{
 5581     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5582   %}
 5583   ins_pipe( pipe_slow );
 5584 %}
 5585 
 5586 
 5587 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5588   predicate(UseAVX > 0);
 5589   match(Set dst (SubVS src1 src2));
 5590   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5591   ins_encode %{
 5592     int vlen_enc = vector_length_encoding(this);
 5593     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5594   %}
 5595   ins_pipe( pipe_slow );
 5596 %}
 5597 
 5598 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5599   predicate((UseAVX > 0) &&
 5600             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5601   match(Set dst (SubVS src (LoadVector mem)));
 5602   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5603   ins_encode %{
 5604     int vlen_enc = vector_length_encoding(this);
 5605     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5606   %}
 5607   ins_pipe( pipe_slow );
 5608 %}
 5609 
 5610 // Integers vector sub
 5611 instruct vsubI(vec dst, vec src) %{
 5612   predicate(UseAVX == 0);
 5613   match(Set dst (SubVI dst src));
 5614   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5615   ins_encode %{
 5616     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5617   %}
 5618   ins_pipe( pipe_slow );
 5619 %}
 5620 
 5621 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5622   predicate(UseAVX > 0);
 5623   match(Set dst (SubVI src1 src2));
 5624   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5625   ins_encode %{
 5626     int vlen_enc = vector_length_encoding(this);
 5627     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5628   %}
 5629   ins_pipe( pipe_slow );
 5630 %}
 5631 
 5632 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5633   predicate((UseAVX > 0) &&
 5634             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5635   match(Set dst (SubVI src (LoadVector mem)));
 5636   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5637   ins_encode %{
 5638     int vlen_enc = vector_length_encoding(this);
 5639     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5640   %}
 5641   ins_pipe( pipe_slow );
 5642 %}
 5643 
 5644 // Longs vector sub
 5645 instruct vsubL(vec dst, vec src) %{
 5646   predicate(UseAVX == 0);
 5647   match(Set dst (SubVL dst src));
 5648   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5649   ins_encode %{
 5650     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5651   %}
 5652   ins_pipe( pipe_slow );
 5653 %}
 5654 
 5655 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5656   predicate(UseAVX > 0);
 5657   match(Set dst (SubVL src1 src2));
 5658   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5659   ins_encode %{
 5660     int vlen_enc = vector_length_encoding(this);
 5661     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5662   %}
 5663   ins_pipe( pipe_slow );
 5664 %}
 5665 
 5666 
 5667 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5668   predicate((UseAVX > 0) &&
 5669             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5670   match(Set dst (SubVL src (LoadVector mem)));
 5671   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5672   ins_encode %{
 5673     int vlen_enc = vector_length_encoding(this);
 5674     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5675   %}
 5676   ins_pipe( pipe_slow );
 5677 %}
 5678 
 5679 // Floats vector sub
 5680 instruct vsubF(vec dst, vec src) %{
 5681   predicate(UseAVX == 0);
 5682   match(Set dst (SubVF dst src));
 5683   format %{ "subps   $dst,$src\t! sub packedF" %}
 5684   ins_encode %{
 5685     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5686   %}
 5687   ins_pipe( pipe_slow );
 5688 %}
 5689 
 5690 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5691   predicate(UseAVX > 0);
 5692   match(Set dst (SubVF src1 src2));
 5693   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5694   ins_encode %{
 5695     int vlen_enc = vector_length_encoding(this);
 5696     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5697   %}
 5698   ins_pipe( pipe_slow );
 5699 %}
 5700 
 5701 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5702   predicate((UseAVX > 0) &&
 5703             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5704   match(Set dst (SubVF src (LoadVector mem)));
 5705   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5706   ins_encode %{
 5707     int vlen_enc = vector_length_encoding(this);
 5708     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5709   %}
 5710   ins_pipe( pipe_slow );
 5711 %}
 5712 
 5713 // Doubles vector sub
 5714 instruct vsubD(vec dst, vec src) %{
 5715   predicate(UseAVX == 0);
 5716   match(Set dst (SubVD dst src));
 5717   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5718   ins_encode %{
 5719     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5720   %}
 5721   ins_pipe( pipe_slow );
 5722 %}
 5723 
 5724 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5725   predicate(UseAVX > 0);
 5726   match(Set dst (SubVD src1 src2));
 5727   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5728   ins_encode %{
 5729     int vlen_enc = vector_length_encoding(this);
 5730     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5731   %}
 5732   ins_pipe( pipe_slow );
 5733 %}
 5734 
 5735 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5736   predicate((UseAVX > 0) &&
 5737             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5738   match(Set dst (SubVD src (LoadVector mem)));
 5739   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5740   ins_encode %{
 5741     int vlen_enc = vector_length_encoding(this);
 5742     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5743   %}
 5744   ins_pipe( pipe_slow );
 5745 %}
 5746 
 5747 // --------------------------------- MUL --------------------------------------
 5748 
 5749 // Byte vector mul
 5750 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5751   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5752   match(Set dst (MulVB src1 src2));
 5753   effect(TEMP dst, TEMP xtmp);
 5754   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5755   ins_encode %{
 5756     assert(UseSSE > 3, "required");
 5757     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5758     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5759     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5760     __ psllw($dst$$XMMRegister, 8);
 5761     __ psrlw($dst$$XMMRegister, 8);
 5762     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5763   %}
 5764   ins_pipe( pipe_slow );
 5765 %}
 5766 
 5767 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5768   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5769   match(Set dst (MulVB src1 src2));
 5770   effect(TEMP dst, TEMP xtmp);
 5771   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5772   ins_encode %{
 5773     assert(UseSSE > 3, "required");
 5774     // Odd-index elements
 5775     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5776     __ psrlw($dst$$XMMRegister, 8);
 5777     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5778     __ psrlw($xtmp$$XMMRegister, 8);
 5779     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5780     __ psllw($dst$$XMMRegister, 8);
 5781     // Even-index elements
 5782     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5783     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5784     __ psllw($xtmp$$XMMRegister, 8);
 5785     __ psrlw($xtmp$$XMMRegister, 8);
 5786     // Combine
 5787     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5788   %}
 5789   ins_pipe( pipe_slow );
 5790 %}
 5791 
 5792 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5793   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5794   match(Set dst (MulVB src1 src2));
 5795   effect(TEMP xtmp1, TEMP xtmp2);
 5796   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5797   ins_encode %{
 5798     int vlen_enc = vector_length_encoding(this);
 5799     // Odd-index elements
 5800     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5801     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5802     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5803     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5804     // Even-index elements
 5805     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5806     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5807     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5808     // Combine
 5809     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5810   %}
 5811   ins_pipe( pipe_slow );
 5812 %}
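
      // Informal scalar sketch of the byte multiply implemented by the three
      // patterns above (SSE/AVX have no byte multiply, so bytes are widened to
      // 16-bit words, multiplied with pmullw, and only the low byte of each
      // product is kept):
      //
      //   for (int i = 0; i < vlen; i++) {
      //     dst[i] = (jbyte)(src1[i] * src2[i]);   // low 8 bits of the product
      //   }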
 5813 
 5814 // Shorts/Chars vector mul
 5815 instruct vmulS(vec dst, vec src) %{
 5816   predicate(UseAVX == 0);
 5817   match(Set dst (MulVS dst src));
 5818   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5819   ins_encode %{
 5820     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5821   %}
 5822   ins_pipe( pipe_slow );
 5823 %}
 5824 
 5825 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5826   predicate(UseAVX > 0);
 5827   match(Set dst (MulVS src1 src2));
 5828   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5829   ins_encode %{
 5830     int vlen_enc = vector_length_encoding(this);
 5831     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5832   %}
 5833   ins_pipe( pipe_slow );
 5834 %}
 5835 
 5836 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5837   predicate((UseAVX > 0) &&
 5838             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5839   match(Set dst (MulVS src (LoadVector mem)));
 5840   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5841   ins_encode %{
 5842     int vlen_enc = vector_length_encoding(this);
 5843     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5844   %}
 5845   ins_pipe( pipe_slow );
 5846 %}
 5847 
 5848 // Integers vector mul
 5849 instruct vmulI(vec dst, vec src) %{
 5850   predicate(UseAVX == 0);
 5851   match(Set dst (MulVI dst src));
 5852   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5853   ins_encode %{
 5854     assert(UseSSE > 3, "required");
 5855     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5856   %}
 5857   ins_pipe( pipe_slow );
 5858 %}
 5859 
 5860 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5861   predicate(UseAVX > 0);
 5862   match(Set dst (MulVI src1 src2));
 5863   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5864   ins_encode %{
 5865     int vlen_enc = vector_length_encoding(this);
 5866     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5867   %}
 5868   ins_pipe( pipe_slow );
 5869 %}
 5870 
 5871 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5872   predicate((UseAVX > 0) &&
 5873             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5874   match(Set dst (MulVI src (LoadVector mem)));
 5875   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5876   ins_encode %{
 5877     int vlen_enc = vector_length_encoding(this);
 5878     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5879   %}
 5880   ins_pipe( pipe_slow );
 5881 %}
 5882 
 5883 // Longs vector mul
 5884 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5885   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5886              VM_Version::supports_avx512dq()) ||
 5887             VM_Version::supports_avx512vldq());
 5888   match(Set dst (MulVL src1 src2));
 5889   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5890   ins_encode %{
 5891     assert(UseAVX > 2, "required");
 5892     int vlen_enc = vector_length_encoding(this);
 5893     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5894   %}
 5895   ins_pipe( pipe_slow );
 5896 %}
 5897 
 5898 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5899   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5900              VM_Version::supports_avx512dq()) ||
 5901             (Matcher::vector_length_in_bytes(n) > 8 &&
 5902              VM_Version::supports_avx512vldq()));
 5903   match(Set dst (MulVL src (LoadVector mem)));
 5904   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5905   ins_encode %{
 5906     assert(UseAVX > 2, "required");
 5907     int vlen_enc = vector_length_encoding(this);
 5908     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5909   %}
 5910   ins_pipe( pipe_slow );
 5911 %}
 5912 
 5913 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5914   predicate(UseAVX == 0);
 5915   match(Set dst (MulVL src1 src2));
 5916   effect(TEMP dst, TEMP xtmp);
 5917   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5918   ins_encode %{
 5919     assert(VM_Version::supports_sse4_1(), "required");
 5920     // Get the lo*hi cross products; only their lower 32 bits are of interest
 5921     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5922     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5923     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5924     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5925     __ psllq($dst$$XMMRegister, 32);
 5926     // Get the lo-lo products
 5927     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5928     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5929     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5930   %}
 5931   ins_pipe( pipe_slow );
 5932 %}
 5933 
 5934 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5935   predicate(UseAVX > 0 &&
 5936             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5937               !VM_Version::supports_avx512dq()) ||
 5938              (Matcher::vector_length_in_bytes(n) < 64 &&
 5939               !VM_Version::supports_avx512vldq())));
 5940   match(Set dst (MulVL src1 src2));
 5941   effect(TEMP xtmp1, TEMP xtmp2);
 5942   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5943   ins_encode %{
 5944     int vlen_enc = vector_length_encoding(this);
 5945     // Get the lo*hi cross products; only their lower 32 bits are of interest
 5946     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5947     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5948     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5949     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5950     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5951     // Get the lo-lo products
 5952     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5953     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5954   %}
 5955   ins_pipe( pipe_slow );
 5956 %}
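
      // Informal sketch of the 64x64->64 bit multiply used by vmulL/vmulL_reg
      // above. With a = (a_hi << 32) | a_lo and b = (b_hi << 32) | b_lo:
      //
      //   a * b (mod 2^64) == a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)
      //
      // so the "lo-hi" cross products only contribute their low 32 bits (pmulld);
      // they are summed, shifted left by 32, and added to the full unsigned
      // lo*lo product (pmuludq).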
 5957 
 5958 // Floats vector mul
 5959 instruct vmulF(vec dst, vec src) %{
 5960   predicate(UseAVX == 0);
 5961   match(Set dst (MulVF dst src));
 5962   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5963   ins_encode %{
 5964     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5965   %}
 5966   ins_pipe( pipe_slow );
 5967 %}
 5968 
 5969 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5970   predicate(UseAVX > 0);
 5971   match(Set dst (MulVF src1 src2));
 5972   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5973   ins_encode %{
 5974     int vlen_enc = vector_length_encoding(this);
 5975     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5976   %}
 5977   ins_pipe( pipe_slow );
 5978 %}
 5979 
 5980 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5981   predicate((UseAVX > 0) &&
 5982             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5983   match(Set dst (MulVF src (LoadVector mem)));
 5984   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5985   ins_encode %{
 5986     int vlen_enc = vector_length_encoding(this);
 5987     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5988   %}
 5989   ins_pipe( pipe_slow );
 5990 %}
 5991 
 5992 // Doubles vector mul
 5993 instruct vmulD(vec dst, vec src) %{
 5994   predicate(UseAVX == 0);
 5995   match(Set dst (MulVD dst src));
 5996   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5997   ins_encode %{
 5998     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5999   %}
 6000   ins_pipe( pipe_slow );
 6001 %}
 6002 
 6003 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6004   predicate(UseAVX > 0);
 6005   match(Set dst (MulVD src1 src2));
 6006   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6007   ins_encode %{
 6008     int vlen_enc = vector_length_encoding(this);
 6009     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6010   %}
 6011   ins_pipe( pipe_slow );
 6012 %}
 6013 
 6014 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6015   predicate((UseAVX > 0) &&
 6016             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6017   match(Set dst (MulVD src (LoadVector mem)));
 6018   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6019   ins_encode %{
 6020     int vlen_enc = vector_length_encoding(this);
 6021     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6022   %}
 6023   ins_pipe( pipe_slow );
 6024 %}
 6025 
 6026 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 6027   predicate(Matcher::vector_length(n) == 8);
 6028   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
 6029   effect(TEMP dst, USE src1, USE src2);
 6030   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
 6031             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
 6032          %}
 6033   ins_encode %{
 6034     assert(UseAVX > 0, "required");
 6035 
 6036     int vlen_enc = Assembler::AVX_256bit;
 6037     int cond = (Assembler::Condition)($copnd$$cmpcode);
 6038     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 6039     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6040   %}
 6041   ins_pipe( pipe_slow );
 6042 %}
 6043 
 6044 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 6045   predicate(Matcher::vector_length(n) == 4);
 6046   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
 6047   effect(TEMP dst, USE src1, USE src2);
 6048   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
 6049             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
 6050          %}
 6051   ins_encode %{
 6052     assert(UseAVX > 0, "required");
 6053 
 6054     int vlen_enc = Assembler::AVX_256bit;
 6055     int cond = (Assembler::Condition)($copnd$$cmpcode);
 6056     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 6057     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6058   %}
 6059   ins_pipe( pipe_slow );
 6060 %}
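
      // In both CMoveV patterns above, the comparison result (all-ones in each
      // lane where the condition holds) is left in $dst and then reused as the
      // blend mask; with the usual blendv semantics, lanes whose mask sign bit
      // is set take $src2 and the remaining lanes take $src1.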
 6061 
 6062 // --------------------------------- DIV --------------------------------------
 6063 
 6064 // Floats vector div
 6065 instruct vdivF(vec dst, vec src) %{
 6066   predicate(UseAVX == 0);
 6067   match(Set dst (DivVF dst src));
 6068   format %{ "divps   $dst,$src\t! div packedF" %}
 6069   ins_encode %{
 6070     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6071   %}
 6072   ins_pipe( pipe_slow );
 6073 %}
 6074 
 6075 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6076   predicate(UseAVX > 0);
 6077   match(Set dst (DivVF src1 src2));
 6078   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6079   ins_encode %{
 6080     int vlen_enc = vector_length_encoding(this);
 6081     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6082   %}
 6083   ins_pipe( pipe_slow );
 6084 %}
 6085 
 6086 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6087   predicate((UseAVX > 0) &&
 6088             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6089   match(Set dst (DivVF src (LoadVector mem)));
 6090   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6091   ins_encode %{
 6092     int vlen_enc = vector_length_encoding(this);
 6093     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6094   %}
 6095   ins_pipe( pipe_slow );
 6096 %}
 6097 
 6098 // Doubles vector div
 6099 instruct vdivD(vec dst, vec src) %{
 6100   predicate(UseAVX == 0);
 6101   match(Set dst (DivVD dst src));
 6102   format %{ "divpd   $dst,$src\t! div packedD" %}
 6103   ins_encode %{
 6104     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6105   %}
 6106   ins_pipe( pipe_slow );
 6107 %}
 6108 
 6109 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6110   predicate(UseAVX > 0);
 6111   match(Set dst (DivVD src1 src2));
 6112   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6113   ins_encode %{
 6114     int vlen_enc = vector_length_encoding(this);
 6115     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6116   %}
 6117   ins_pipe( pipe_slow );
 6118 %}
 6119 
 6120 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6121   predicate((UseAVX > 0) &&
 6122             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6123   match(Set dst (DivVD src (LoadVector mem)));
 6124   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6125   ins_encode %{
 6126     int vlen_enc = vector_length_encoding(this);
 6127     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6128   %}
 6129   ins_pipe( pipe_slow );
 6130 %}
 6131 
 6132 // ------------------------------ MinMax ---------------------------------------
 6133 
 6134 // Byte, Short, Int vector Min/Max
 6135 instruct minmax_reg_sse(vec dst, vec src) %{
 6136   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6137             UseAVX == 0);
 6138   match(Set dst (MinV dst src));
 6139   match(Set dst (MaxV dst src));
 6140   format %{ "vector_minmax  $dst,$src\t!  " %}
 6141   ins_encode %{
 6142     assert(UseSSE >= 4, "required");
 6143 
 6144     int opcode = this->ideal_Opcode();
 6145     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6146     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6147   %}
 6148   ins_pipe( pipe_slow );
 6149 %}
 6150 
 6151 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6152   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6153             UseAVX > 0);
 6154   match(Set dst (MinV src1 src2));
 6155   match(Set dst (MaxV src1 src2));
 6156   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6157   ins_encode %{
 6158     int opcode = this->ideal_Opcode();
 6159     int vlen_enc = vector_length_encoding(this);
 6160     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6161 
 6162     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6163   %}
 6164   ins_pipe( pipe_slow );
 6165 %}
 6166 
 6167 // Long vector Min/Max
 6168 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6169   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6170             UseAVX == 0);
 6171   match(Set dst (MinV dst src));
 6172   match(Set dst (MaxV src dst));
 6173   effect(TEMP dst, TEMP tmp);
 6174   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6175   ins_encode %{
 6176     assert(UseSSE >= 4, "required");
 6177 
 6178     int opcode = this->ideal_Opcode();
 6179     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6180     assert(elem_bt == T_LONG, "sanity");
 6181 
 6182     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6183   %}
 6184   ins_pipe( pipe_slow );
 6185 %}
 6186 
 6187 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6188   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6189             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6190   match(Set dst (MinV src1 src2));
 6191   match(Set dst (MaxV src1 src2));
 6192   effect(TEMP dst);
 6193   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6194   ins_encode %{
 6195     int vlen_enc = vector_length_encoding(this);
 6196     int opcode = this->ideal_Opcode();
 6197     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6198     assert(elem_bt == T_LONG, "sanity");
 6199 
 6200     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6201   %}
 6202   ins_pipe( pipe_slow );
 6203 %}
 6204 
 6205 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6206   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6207             Matcher::vector_element_basic_type(n) == T_LONG);
 6208   match(Set dst (MinV src1 src2));
 6209   match(Set dst (MaxV src1 src2));
 6210   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6211   ins_encode %{
 6212     assert(UseAVX > 2, "required");
 6213 
 6214     int vlen_enc = vector_length_encoding(this);
 6215     int opcode = this->ideal_Opcode();
 6216     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6217     assert(elem_bt == T_LONG, "sanity");
 6218 
 6219     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6220   %}
 6221   ins_pipe( pipe_slow );
 6222 %}
 6223 
 6224 // Float/Double vector Min/Max
 6225 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6226   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6227             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6228             UseAVX > 0);
 6229   match(Set dst (MinV a b));
 6230   match(Set dst (MaxV a b));
 6231   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6232   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6233   ins_encode %{
 6234     assert(UseAVX > 0, "required");
 6235 
 6236     int opcode = this->ideal_Opcode();
 6237     int vlen_enc = vector_length_encoding(this);
 6238     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6239 
 6240     __ vminmax_fp(opcode, elem_bt,
 6241                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6242                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6243   %}
 6244   ins_pipe( pipe_slow );
 6245 %}
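
      // Note: Java's Math.min/max semantics (NaN propagation, and -0.0 ordered
      // below +0.0) differ from what raw (v)minps/maxps produce, which is
      // presumably why this pattern and the EVEX one below route through
      // vminmax_fp/evminmax_fp with temporaries instead of emitting min/max
      // instructions directly.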
 6246 
 6247 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6248   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6249             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6250   match(Set dst (MinV a b));
 6251   match(Set dst (MaxV a b));
 6252   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6253   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6254   ins_encode %{
 6255     assert(UseAVX > 2, "required");
 6256 
 6257     int opcode = this->ideal_Opcode();
 6258     int vlen_enc = vector_length_encoding(this);
 6259     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6260 
 6261     __ evminmax_fp(opcode, elem_bt,
 6262                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6263                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6264   %}
 6265   ins_pipe( pipe_slow );
 6266 %}
 6267 
 6268 // --------------------------------- Signum/CopySign ---------------------------
 6269 
 6270 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6271   match(Set dst (SignumF dst (Binary zero one)));
 6272   effect(KILL cr);
 6273   format %{ "signumF $dst, $dst" %}
 6274   ins_encode %{
 6275     int opcode = this->ideal_Opcode();
 6276     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6277   %}
 6278   ins_pipe( pipe_slow );
 6279 %}
 6280 
 6281 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6282   match(Set dst (SignumD dst (Binary zero one)));
 6283   effect(KILL cr);
 6284   format %{ "signumD $dst, $dst" %}
 6285   ins_encode %{
 6286     int opcode = this->ideal_Opcode();
 6287     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6288   %}
 6289   ins_pipe( pipe_slow );
 6290 %}
 6291 
 6292 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6293   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6294   match(Set dst (SignumVF src (Binary zero one)));
 6295   match(Set dst (SignumVD src (Binary zero one)));
 6296   effect(TEMP dst, TEMP xtmp1);
 6297   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6298   ins_encode %{
 6299     int opcode = this->ideal_Opcode();
 6300     int vec_enc = vector_length_encoding(this);
 6301     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6302                          $xtmp1$$XMMRegister, vec_enc);
 6303   %}
 6304   ins_pipe( pipe_slow );
 6305 %}
 6306 
 6307 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6308   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6309   match(Set dst (SignumVF src (Binary zero one)));
 6310   match(Set dst (SignumVD src (Binary zero one)));
 6311   effect(TEMP dst, TEMP ktmp1);
 6312   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6313   ins_encode %{
 6314     int opcode = this->ideal_Opcode();
 6315     int vec_enc = vector_length_encoding(this);
 6316     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6317                           $ktmp1$$KRegister, vec_enc);
 6318   %}
 6319   ins_pipe( pipe_slow );
 6320 %}
 6321 
 6322 // ---------------------------------------
 6323 // For copySign, use 0xE4 as the ternary-logic immediate (truth-table selector) for vpternlog
 6324 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6325 // C (xmm2) is set to 0x7FFFFFFF
 6326 // Wherever xmm2 is 0, we want to pick from B (sign)
 6327 // Wherever xmm2 is 1, we want to pick from A (dst, i.e. the magnitude)
 6328 //
 6329 // A B C Result
 6330 // 0 0 0 0
 6331 // 0 0 1 0
 6332 // 0 1 0 1
 6333 // 0 1 1 0
 6334 // 1 0 0 0
 6335 // 1 0 1 1
 6336 // 1 1 0 1
 6337 // 1 1 1 1
 6338 //
 6339 // Result going from high bit to low bit is 0b11100100 = 0xe4
 6340 // ---------------------------------------
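
      // Restating the table above (informal): vpternlog evaluates, at every bit
      // position, imm8[(A << 2) | (B << 1) | C], and 0xE4 encodes the bit-wise mux
      //
      //   result = (A & C) | (B & ~C)
      //
      // so with C = 0x7FFFFFFF the magnitude bits come from A and the sign bit
      // comes from B, which is exactly copySign.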
 6341 
 6342 #ifdef _LP64
 6343 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6344   match(Set dst (CopySignF dst src));
 6345   effect(TEMP tmp1, TEMP tmp2);
 6346   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6347   ins_encode %{
 6348     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6349     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6350     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6351   %}
 6352   ins_pipe( pipe_slow );
 6353 %}
 6354 
 6355 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6356   match(Set dst (CopySignD dst (Binary src zero)));
 6357   ins_cost(100);
 6358   effect(TEMP tmp1, TEMP tmp2);
 6359   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6360   ins_encode %{
 6361     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6362     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6363     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6364   %}
 6365   ins_pipe( pipe_slow );
 6366 %}
 6367 
 6368 #endif // _LP64
 6369 
 6370 //----------------------------- CompressBits/ExpandBits ------------------------
 6371 
 6372 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6373   predicate(n->bottom_type()->isa_int());
 6374   match(Set dst (CompressBits src mask));
 6375   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6376   ins_encode %{
 6377     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6378   %}
 6379   ins_pipe( pipe_slow );
 6380 %}
 6381 
 6382 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6383   predicate(n->bottom_type()->isa_int());
 6384   match(Set dst (ExpandBits src mask));
 6385   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6386   ins_encode %{
 6387     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6388   %}
 6389   ins_pipe( pipe_slow );
 6390 %}
 6391 
 6392 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6393   predicate(n->bottom_type()->isa_int());
 6394   match(Set dst (CompressBits src (LoadI mask)));
 6395   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6396   ins_encode %{
 6397     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6398   %}
 6399   ins_pipe( pipe_slow );
 6400 %}
 6401 
 6402 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6403   predicate(n->bottom_type()->isa_int());
 6404   match(Set dst (ExpandBits src (LoadI mask)));
 6405   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6406   ins_encode %{
 6407     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6408   %}
 6409   ins_pipe( pipe_slow );
 6410 %}
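
      // Worked example of the BMI2 semantics these patterns rely on (illustrative
      // values only): with mask = 0b11001100,
      //   pext(0b10110010, mask) = 0b00001000  // bits at the mask's set positions,
      //                                        //   packed towards bit 0
      //   pdep(0b00001000, mask) = 0b10000000  // low source bits scattered back to
      //                                        //   the mask's set positions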
 6411 
 6412 // --------------------------------- Sqrt --------------------------------------
 6413 
 6414 instruct vsqrtF_reg(vec dst, vec src) %{
 6415   match(Set dst (SqrtVF src));
 6416   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6417   ins_encode %{
 6418     assert(UseAVX > 0, "required");
 6419     int vlen_enc = vector_length_encoding(this);
 6420     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6421   %}
 6422   ins_pipe( pipe_slow );
 6423 %}
 6424 
 6425 instruct vsqrtF_mem(vec dst, memory mem) %{
 6426   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6427   match(Set dst (SqrtVF (LoadVector mem)));
 6428   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6429   ins_encode %{
 6430     assert(UseAVX > 0, "required");
 6431     int vlen_enc = vector_length_encoding(this);
 6432     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6433   %}
 6434   ins_pipe( pipe_slow );
 6435 %}
 6436 
 6437 // Doubles vector sqrt
 6438 instruct vsqrtD_reg(vec dst, vec src) %{
 6439   match(Set dst (SqrtVD src));
 6440   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6441   ins_encode %{
 6442     assert(UseAVX > 0, "required");
 6443     int vlen_enc = vector_length_encoding(this);
 6444     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6445   %}
 6446   ins_pipe( pipe_slow );
 6447 %}
 6448 
 6449 instruct vsqrtD_mem(vec dst, memory mem) %{
 6450   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6451   match(Set dst (SqrtVD (LoadVector mem)));
 6452   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6453   ins_encode %{
 6454     assert(UseAVX > 0, "required");
 6455     int vlen_enc = vector_length_encoding(this);
 6456     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6457   %}
 6458   ins_pipe( pipe_slow );
 6459 %}
 6460 
 6461 // ------------------------------ Shift ---------------------------------------
 6462 
 6463 // Left and right shift count vectors are the same on x86
 6464 // (only lowest bits of xmm reg are used for count).
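      // For example (illustrative): with a count of 3, movdl writes 0x00000003 into the
      // low dword of the xmm register; psllw/psrlw/psraw all read the shift amount from
      // the low 64 bits of that register, so one loaded count serves every shift direction.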
 6465 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6466   match(Set dst (LShiftCntV cnt));
 6467   match(Set dst (RShiftCntV cnt));
 6468   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6469   ins_encode %{
 6470     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6471   %}
 6472   ins_pipe( pipe_slow );
 6473 %}
 6474 
 6475 // Byte vector shift
 6476 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6477   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6478   match(Set dst ( LShiftVB src shift));
 6479   match(Set dst ( RShiftVB src shift));
 6480   match(Set dst (URShiftVB src shift));
 6481   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6482   format %{"vector_byte_shift $dst,$src,$shift" %}
 6483   ins_encode %{
 6484     assert(UseSSE > 3, "required");
 6485     int opcode = this->ideal_Opcode();
 6486     bool sign = (opcode != Op_URShiftVB);
 6487     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6488     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6489     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6490     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6491     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6492   %}
 6493   ins_pipe( pipe_slow );
 6494 %}
 6495 
 6496 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6497   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6498             UseAVX <= 1);
 6499   match(Set dst ( LShiftVB src shift));
 6500   match(Set dst ( RShiftVB src shift));
 6501   match(Set dst (URShiftVB src shift));
 6502   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6503   format %{"vector_byte_shift $dst,$src,$shift" %}
 6504   ins_encode %{
 6505     assert(UseSSE > 3, "required");
 6506     int opcode = this->ideal_Opcode();
 6507     bool sign = (opcode != Op_URShiftVB);
 6508     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6509     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6510     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6511     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6512     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6513     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6514     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6515     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6516     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6517   %}
 6518   ins_pipe( pipe_slow );
 6519 %}
 6520 
 6521 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6522   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6523             UseAVX > 1);
 6524   match(Set dst ( LShiftVB src shift));
 6525   match(Set dst ( RShiftVB src shift));
 6526   match(Set dst (URShiftVB src shift));
 6527   effect(TEMP dst, TEMP tmp);
 6528   format %{"vector_byte_shift $dst,$src,$shift" %}
 6529   ins_encode %{
 6530     int opcode = this->ideal_Opcode();
 6531     bool sign = (opcode != Op_URShiftVB);
 6532     int vlen_enc = Assembler::AVX_256bit;
 6533     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6534     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6535     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6536     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6537     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6538   %}
 6539   ins_pipe( pipe_slow );
 6540 %}
 6541 
 6542 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6543   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6544   match(Set dst ( LShiftVB src shift));
 6545   match(Set dst ( RShiftVB src shift));
 6546   match(Set dst (URShiftVB src shift));
 6547   effect(TEMP dst, TEMP tmp);
 6548   format %{"vector_byte_shift $dst,$src,$shift" %}
 6549   ins_encode %{
 6550     assert(UseAVX > 1, "required");
 6551     int opcode = this->ideal_Opcode();
 6552     bool sign = (opcode != Op_URShiftVB);
 6553     int vlen_enc = Assembler::AVX_256bit;
 6554     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6555     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6556     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6557     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6558     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6559     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6560     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6561     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6562     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6563   %}
 6564   ins_pipe( pipe_slow );
 6565 %}
 6566 
 6567 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6568   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6569   match(Set dst ( LShiftVB src shift));
 6570   match(Set dst ( RShiftVB src shift));
 6571   match(Set dst (URShiftVB src shift));
 6572   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6573   format %{"vector_byte_shift $dst,$src,$shift" %}
 6574   ins_encode %{
 6575     assert(UseAVX > 2, "required");
 6576     int opcode = this->ideal_Opcode();
 6577     bool sign = (opcode != Op_URShiftVB);
 6578     int vlen_enc = Assembler::AVX_512bit;
 6579     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6580     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6581     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6582     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6583     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6584     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6585     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6586     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6587     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6588     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6589     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6590     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6591   %}
 6592   ins_pipe( pipe_slow );
 6593 %}
 6594 
 6595 // A logical right shift on a shorts vector would produce an incorrect Java
 6596 // result for negative data, because Java code converts a short value into an
 6597 // int with sign extension before shifting. Char vectors are fine since chars
 6598 // are unsigned values.
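      // For example (illustrative values): for short s = -4 (0xFFFC), Java evaluates
      // s >>> 1 as ((int)s) >>> 1 = 0x7FFFFFFE, whose low 16 bits are 0xFFFE, whereas a
      // plain 16-bit lane shift (psrlw by 1) would produce 0x7FFE.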
 6599 // Shorts/Chars vector shift
 6600 instruct vshiftS(vec dst, vec src, vec shift) %{
 6601   predicate(!n->as_ShiftV()->is_var_shift());
 6602   match(Set dst ( LShiftVS src shift));
 6603   match(Set dst ( RShiftVS src shift));
 6604   match(Set dst (URShiftVS src shift));
 6605   effect(TEMP dst, USE src, USE shift);
 6606   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6607   ins_encode %{
 6608     int opcode = this->ideal_Opcode();
 6609     if (UseAVX > 0) {
 6610       int vlen_enc = vector_length_encoding(this);
 6611       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6612     } else {
 6613       int vlen = Matcher::vector_length(this);
 6614       if (vlen == 2) {
 6615         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6616         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6617       } else if (vlen == 4) {
 6618         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6619         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6620       } else {
 6621         assert (vlen == 8, "sanity");
 6622         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6623         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6624       }
 6625     }
 6626   %}
 6627   ins_pipe( pipe_slow );
 6628 %}
 6629 
 6630 // Integers vector shift
 6631 instruct vshiftI(vec dst, vec src, vec shift) %{
 6632   predicate(!n->as_ShiftV()->is_var_shift());
 6633   match(Set dst ( LShiftVI src shift));
 6634   match(Set dst ( RShiftVI src shift));
 6635   match(Set dst (URShiftVI src shift));
 6636   effect(TEMP dst, USE src, USE shift);
 6637   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6638   ins_encode %{
 6639     int opcode = this->ideal_Opcode();
 6640     if (UseAVX > 0) {
 6641       int vlen_enc = vector_length_encoding(this);
 6642       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6643     } else {
 6644       int vlen = Matcher::vector_length(this);
 6645       if (vlen == 2) {
 6646         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6647         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6648       } else {
 6649         assert(vlen == 4, "sanity");
 6650         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6651         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6652       }
 6653     }
 6654   %}
 6655   ins_pipe( pipe_slow );
 6656 %}
 6657 
 6658 // Integers vector constant shift
 6659 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6660   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6661   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6662   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6663   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6664   ins_encode %{
 6665     int opcode = this->ideal_Opcode();
 6666     if (UseAVX > 0) {
 6667       int vector_len = vector_length_encoding(this);
 6668       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6669     } else {
 6670       int vlen = Matcher::vector_length(this);
 6671       if (vlen == 2) {
 6672         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6673         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6674       } else {
 6675         assert(vlen == 4, "sanity");
 6676         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6677         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6678       }
 6679     }
 6680   %}
 6681   ins_pipe( pipe_slow );
 6682 %}
 6683 
 6684 // Longs vector shift
 6685 instruct vshiftL(vec dst, vec src, vec shift) %{
 6686   predicate(!n->as_ShiftV()->is_var_shift());
 6687   match(Set dst ( LShiftVL src shift));
 6688   match(Set dst (URShiftVL src shift));
 6689   effect(TEMP dst, USE src, USE shift);
 6690   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6691   ins_encode %{
 6692     int opcode = this->ideal_Opcode();
 6693     if (UseAVX > 0) {
 6694       int vlen_enc = vector_length_encoding(this);
 6695       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6696     } else {
 6697       assert(Matcher::vector_length(this) == 2, "");
 6698       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6699       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6700     }
 6701   %}
 6702   ins_pipe( pipe_slow );
 6703 %}
 6704 
 6705 // Longs vector constant shift
 6706 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6707   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6708   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6709   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6710   ins_encode %{
 6711     int opcode = this->ideal_Opcode();
 6712     if (UseAVX > 0) {
 6713       int vector_len = vector_length_encoding(this);
 6714       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6715     } else {
 6716       assert(Matcher::vector_length(this) == 2, "");
 6717       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6718       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6719     }
 6720   %}
 6721   ins_pipe( pipe_slow );
 6722 %}
 6723 
 6724 // -------------------ArithmeticRightShift -----------------------------------
 6725 // Long vector arithmetic right shift
 6726 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6727   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6728   match(Set dst (RShiftVL src shift));
 6729   effect(TEMP dst, TEMP tmp);
 6730   format %{ "vshiftq $dst,$src,$shift" %}
 6731   ins_encode %{
 6732     uint vlen = Matcher::vector_length(this);
 6733     if (vlen == 2) {
 6734       assert(UseSSE >= 2, "required");
 6735       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6736       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6737       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6738       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6739       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6740       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6741     } else {
 6742       assert(vlen == 4, "sanity");
 6743       assert(UseAVX > 1, "required");
 6744       int vlen_enc = Assembler::AVX_256bit;
 6745       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6746       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6747       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6748       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6749       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6750     }
 6751   %}
 6752   ins_pipe( pipe_slow );
 6753 %}
 6754 
 6755 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6756   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6757   match(Set dst (RShiftVL src shift));
 6758   format %{ "vshiftq $dst,$src,$shift" %}
 6759   ins_encode %{
 6760     int vlen_enc = vector_length_encoding(this);
 6761     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6762   %}
 6763   ins_pipe( pipe_slow );
 6764 %}
 6765 
 6766 // ------------------- Variable Shift -----------------------------
 6767 // Byte variable shift
 6768 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6769   predicate(Matcher::vector_length(n) <= 8 &&
 6770             n->as_ShiftV()->is_var_shift() &&
 6771             !VM_Version::supports_avx512bw());
 6772   match(Set dst ( LShiftVB src shift));
 6773   match(Set dst ( RShiftVB src shift));
 6774   match(Set dst (URShiftVB src shift));
 6775   effect(TEMP dst, TEMP vtmp);
 6776   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6777   ins_encode %{
 6778     assert(UseAVX >= 2, "required");
 6779 
 6780     int opcode = this->ideal_Opcode();
 6781     int vlen_enc = Assembler::AVX_128bit;
 6782     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6783     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6784   %}
 6785   ins_pipe( pipe_slow );
 6786 %}
 6787 
 6788 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6789   predicate(Matcher::vector_length(n) == 16 &&
 6790             n->as_ShiftV()->is_var_shift() &&
 6791             !VM_Version::supports_avx512bw());
 6792   match(Set dst ( LShiftVB src shift));
 6793   match(Set dst ( RShiftVB src shift));
 6794   match(Set dst (URShiftVB src shift));
 6795   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6796   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6797   ins_encode %{
 6798     assert(UseAVX >= 2, "required");
 6799 
 6800     int opcode = this->ideal_Opcode();
 6801     int vlen_enc = Assembler::AVX_128bit;
 6802     // Shift lower half and get word result in dst
 6803     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6804 
 6805     // Shift upper half and get word result in vtmp1
 6806     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6807     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6808     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6809 
 6810     // Merge and down convert the two word results to byte in dst
 6811     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6812   %}
 6813   ins_pipe( pipe_slow );
 6814 %}
 6815 
 6816 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6817   predicate(Matcher::vector_length(n) == 32 &&
 6818             n->as_ShiftV()->is_var_shift() &&
 6819             !VM_Version::supports_avx512bw());
 6820   match(Set dst ( LShiftVB src shift));
 6821   match(Set dst ( RShiftVB src shift));
 6822   match(Set dst (URShiftVB src shift));
 6823   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 6824   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6825   ins_encode %{
 6826     assert(UseAVX >= 2, "required");
 6827 
 6828     int opcode = this->ideal_Opcode();
 6829     int vlen_enc = Assembler::AVX_128bit;
 6830     // Process lower 128 bits and get result in dst
 6831     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6832     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6833     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6834     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6835     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6836 
 6837     // Process higher 128 bits and get result in vtmp3
 6838     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6839     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6840     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6841     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6842     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6843     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6844     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6845 
 6846     // Merge the two results in dst
 6847     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6848   %}
 6849   ins_pipe( pipe_slow );
 6850 %}
 6851 
 6852 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6853   predicate(Matcher::vector_length(n) <= 32 &&
 6854             n->as_ShiftV()->is_var_shift() &&
 6855             VM_Version::supports_avx512bw());
 6856   match(Set dst ( LShiftVB src shift));
 6857   match(Set dst ( RShiftVB src shift));
 6858   match(Set dst (URShiftVB src shift));
 6859   effect(TEMP dst, TEMP vtmp);
 6860   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6861   ins_encode %{
 6862     assert(UseAVX > 2, "required");
 6863 
 6864     int opcode = this->ideal_Opcode();
 6865     int vlen_enc = vector_length_encoding(this);
 6866     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6867   %}
 6868   ins_pipe( pipe_slow );
 6869 %}
 6870 
 6871 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6872   predicate(Matcher::vector_length(n) == 64 &&
 6873             n->as_ShiftV()->is_var_shift() &&
 6874             VM_Version::supports_avx512bw());
 6875   match(Set dst ( LShiftVB src shift));
 6876   match(Set dst ( RShiftVB src shift));
 6877   match(Set dst (URShiftVB src shift));
 6878   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6879   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6880   ins_encode %{
 6881     assert(UseAVX > 2, "required");
 6882 
 6883     int opcode = this->ideal_Opcode();
 6884     int vlen_enc = Assembler::AVX_256bit;
 6885     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6886     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6887     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6888     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6889     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6890   %}
 6891   ins_pipe( pipe_slow );
 6892 %}
 6893 
 6894 // Short variable shift
 6895 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6896   predicate(Matcher::vector_length(n) <= 8 &&
 6897             n->as_ShiftV()->is_var_shift() &&
 6898             !VM_Version::supports_avx512bw());
 6899   match(Set dst ( LShiftVS src shift));
 6900   match(Set dst ( RShiftVS src shift));
 6901   match(Set dst (URShiftVS src shift));
 6902   effect(TEMP dst, TEMP vtmp);
 6903   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6904   ins_encode %{
 6905     assert(UseAVX >= 2, "required");
 6906 
 6907     int opcode = this->ideal_Opcode();
 6908     bool sign = (opcode != Op_URShiftVS);
 6909     int vlen_enc = Assembler::AVX_256bit;
 6910     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 6911     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 6912     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6913     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6914     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6915     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6916   %}
 6917   ins_pipe( pipe_slow );
 6918 %}
 6919 
 6920 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6921   predicate(Matcher::vector_length(n) == 16 &&
 6922             n->as_ShiftV()->is_var_shift() &&
 6923             !VM_Version::supports_avx512bw());
 6924   match(Set dst ( LShiftVS src shift));
 6925   match(Set dst ( RShiftVS src shift));
 6926   match(Set dst (URShiftVS src shift));
 6927   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6928   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6929   ins_encode %{
 6930     assert(UseAVX >= 2, "required");
 6931 
 6932     int opcode = this->ideal_Opcode();
 6933     bool sign = (opcode != Op_URShiftVS);
 6934     int vlen_enc = Assembler::AVX_256bit;
 6935     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6936     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6937     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6938     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6939     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6940 
 6941     // Shift upper half, with result in dst using vtmp1 as TEMP
 6942     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6943     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6944     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6945     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6946     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6947     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6948 
 6949     // Merge lower and upper half result into dst
 6950     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6951     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6952   %}
 6953   ins_pipe( pipe_slow );
 6954 %}
 6955 
 6956 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6957   predicate(n->as_ShiftV()->is_var_shift() &&
 6958             VM_Version::supports_avx512bw());
 6959   match(Set dst ( LShiftVS src shift));
 6960   match(Set dst ( RShiftVS src shift));
 6961   match(Set dst (URShiftVS src shift));
 6962   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6963   ins_encode %{
 6964     assert(UseAVX > 2, "required");
 6965 
 6966     int opcode = this->ideal_Opcode();
 6967     int vlen_enc = vector_length_encoding(this);
 6968     if (!VM_Version::supports_avx512vl()) {
 6969       vlen_enc = Assembler::AVX_512bit;
 6970     }
 6971     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6972   %}
 6973   ins_pipe( pipe_slow );
 6974 %}
 6975 
 6976 // Integer variable shift
 6977 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6978   predicate(n->as_ShiftV()->is_var_shift());
 6979   match(Set dst ( LShiftVI src shift));
 6980   match(Set dst ( RShiftVI src shift));
 6981   match(Set dst (URShiftVI src shift));
 6982   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6983   ins_encode %{
 6984     assert(UseAVX >= 2, "required");
 6985 
 6986     int opcode = this->ideal_Opcode();
 6987     int vlen_enc = vector_length_encoding(this);
 6988     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6989   %}
 6990   ins_pipe( pipe_slow );
 6991 %}
 6992 
 6993 // Long variable shift
 6994 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6995   predicate(n->as_ShiftV()->is_var_shift());
 6996   match(Set dst ( LShiftVL src shift));
 6997   match(Set dst (URShiftVL src shift));
 6998   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6999   ins_encode %{
 7000     assert(UseAVX >= 2, "required");
 7001 
 7002     int opcode = this->ideal_Opcode();
 7003     int vlen_enc = vector_length_encoding(this);
 7004     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7005   %}
 7006   ins_pipe( pipe_slow );
 7007 %}
 7008 
 7009 // Long variable arithmetic right shift
 7010 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7011   predicate(Matcher::vector_length(n) <= 4 &&
 7012             n->as_ShiftV()->is_var_shift() &&
 7013             UseAVX == 2);
 7014   match(Set dst (RShiftVL src shift));
 7015   effect(TEMP dst, TEMP vtmp);
 7016   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7017   ins_encode %{
 7018     int opcode = this->ideal_Opcode();
 7019     int vlen_enc = vector_length_encoding(this);
 7020     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7021                  $vtmp$$XMMRegister);
 7022   %}
 7023   ins_pipe( pipe_slow );
 7024 %}
 7025 
 7026 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7027   predicate(n->as_ShiftV()->is_var_shift() &&
 7028             UseAVX > 2);
 7029   match(Set dst (RShiftVL src shift));
 7030   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 7031   ins_encode %{
 7032     int opcode = this->ideal_Opcode();
 7033     int vlen_enc = vector_length_encoding(this);
 7034     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7035   %}
 7036   ins_pipe( pipe_slow );
 7037 %}
 7038 
 7039 // --------------------------------- AND --------------------------------------
 7040 
 7041 instruct vand(vec dst, vec src) %{
 7042   predicate(UseAVX == 0);
 7043   match(Set dst (AndV dst src));
 7044   format %{ "pand    $dst,$src\t! and vectors" %}
 7045   ins_encode %{
 7046     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7047   %}
 7048   ins_pipe( pipe_slow );
 7049 %}
 7050 
 7051 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7052   predicate(UseAVX > 0);
 7053   match(Set dst (AndV src1 src2));
 7054   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7055   ins_encode %{
 7056     int vlen_enc = vector_length_encoding(this);
 7057     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7058   %}
 7059   ins_pipe( pipe_slow );
 7060 %}
 7061 
 7062 instruct vand_mem(vec dst, vec src, memory mem) %{
 7063   predicate((UseAVX > 0) &&
 7064             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7065   match(Set dst (AndV src (LoadVector mem)));
 7066   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7067   ins_encode %{
 7068     int vlen_enc = vector_length_encoding(this);
 7069     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7070   %}
 7071   ins_pipe( pipe_slow );
 7072 %}
 7073 
 7074 // --------------------------------- OR ---------------------------------------
 7075 
 7076 instruct vor(vec dst, vec src) %{
 7077   predicate(UseAVX == 0);
 7078   match(Set dst (OrV dst src));
 7079   format %{ "por     $dst,$src\t! or vectors" %}
 7080   ins_encode %{
 7081     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7082   %}
 7083   ins_pipe( pipe_slow );
 7084 %}
 7085 
 7086 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7087   predicate(UseAVX > 0);
 7088   match(Set dst (OrV src1 src2));
 7089   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7090   ins_encode %{
 7091     int vlen_enc = vector_length_encoding(this);
 7092     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7093   %}
 7094   ins_pipe( pipe_slow );
 7095 %}
 7096 
 7097 instruct vor_mem(vec dst, vec src, memory mem) %{
 7098   predicate((UseAVX > 0) &&
 7099             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7100   match(Set dst (OrV src (LoadVector mem)));
 7101   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7102   ins_encode %{
 7103     int vlen_enc = vector_length_encoding(this);
 7104     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7105   %}
 7106   ins_pipe( pipe_slow );
 7107 %}
 7108 
 7109 // --------------------------------- XOR --------------------------------------
 7110 
 7111 instruct vxor(vec dst, vec src) %{
 7112   predicate(UseAVX == 0);
 7113   match(Set dst (XorV dst src));
 7114   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7115   ins_encode %{
 7116     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7117   %}
 7118   ins_pipe( pipe_slow );
 7119 %}
 7120 
 7121 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7122   predicate(UseAVX > 0);
 7123   match(Set dst (XorV src1 src2));
 7124   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7125   ins_encode %{
 7126     int vlen_enc = vector_length_encoding(this);
 7127     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7128   %}
 7129   ins_pipe( pipe_slow );
 7130 %}
 7131 
 7132 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7133   predicate((UseAVX > 0) &&
 7134             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7135   match(Set dst (XorV src (LoadVector mem)));
 7136   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7137   ins_encode %{
 7138     int vlen_enc = vector_length_encoding(this);
 7139     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7140   %}
 7141   ins_pipe( pipe_slow );
 7142 %}
 7143 
 7144 // --------------------------------- VectorCast --------------------------------------
 7145 
 7146 instruct vcastBtoX(vec dst, vec src) %{
 7147   match(Set dst (VectorCastB2X src));
 7148   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7149   ins_encode %{
 7150     assert(UseAVX > 0, "required");
 7151 
 7152     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7153     int vlen_enc = vector_length_encoding(this);
 7154     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7155   %}
 7156   ins_pipe( pipe_slow );
 7157 %}
 7158 
 7159 instruct castStoX(vec dst, vec src) %{
 7160   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7161             Matcher::vector_length(n->in(1)) <= 8 && // src
 7162             Matcher::vector_element_basic_type(n) == T_BYTE);
 7163   match(Set dst (VectorCastS2X src));
 7164   format %{ "vector_cast_s2x $dst,$src" %}
 7165   ins_encode %{
 7166     assert(UseAVX > 0, "required");
 7167 
 7168     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7169     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7170   %}
 7171   ins_pipe( pipe_slow );
 7172 %}
 7173 
 7174 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7175   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7176             Matcher::vector_length(n->in(1)) == 16 && // src
 7177             Matcher::vector_element_basic_type(n) == T_BYTE);
 7178   effect(TEMP dst, TEMP vtmp);
 7179   match(Set dst (VectorCastS2X src));
 7180   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7181   ins_encode %{
 7182     assert(UseAVX > 0, "required");
 7183 
 7184     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7185     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7186     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7187     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7188   %}
 7189   ins_pipe( pipe_slow );
 7190 %}
 7191 
 7192 instruct vcastStoX_evex(vec dst, vec src) %{
 7193   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7194             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7195   match(Set dst (VectorCastS2X src));
 7196   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7197   ins_encode %{
 7198     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7199     int src_vlen_enc = vector_length_encoding(this, $src);
 7200     int vlen_enc = vector_length_encoding(this);
 7201     switch (to_elem_bt) {
 7202       case T_BYTE:
 7203         if (!VM_Version::supports_avx512vl()) {
 7204           vlen_enc = Assembler::AVX_512bit;
 7205         }
 7206         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7207         break;
 7208       case T_INT:
 7209         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7210         break;
 7211       case T_FLOAT:
 7212         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7213         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7214         break;
 7215       case T_LONG:
 7216         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7217         break;
 7218       case T_DOUBLE: {
 7219         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7220         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7221         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7222         break;
 7223       }
 7224       default:
 7225         ShouldNotReachHere();
 7226     }
 7227   %}
 7228   ins_pipe( pipe_slow );
 7229 %}
 7230 
 7231 instruct castItoX(vec dst, vec src) %{
 7232   predicate(UseAVX <= 2 &&
 7233             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7234             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7235   match(Set dst (VectorCastI2X src));
 7236   format %{ "vector_cast_i2x $dst,$src" %}
 7237   ins_encode %{
 7238     assert(UseAVX > 0, "required");
 7239 
 7240     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7241     int vlen_enc = vector_length_encoding(this, $src);
 7242 
 7243     if (to_elem_bt == T_BYTE) {
 7244       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7245       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7246       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7247     } else {
 7248       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7249       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7250       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7251     }
 7252   %}
 7253   ins_pipe( pipe_slow );
 7254 %}
 7255 
 7256 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7257   predicate(UseAVX <= 2 &&
 7258             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7259             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7260   match(Set dst (VectorCastI2X src));
 7261   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7262   effect(TEMP dst, TEMP vtmp);
 7263   ins_encode %{
 7264     assert(UseAVX > 0, "required");
 7265 
 7266     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7267     int vlen_enc = vector_length_encoding(this, $src);
 7268 
 7269     if (to_elem_bt == T_BYTE) {
 7270       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7271       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7272       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7273       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7274     } else {
 7275       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7276       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7277       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7278       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7279     }
 7280   %}
 7281   ins_pipe( pipe_slow );
 7282 %}
 7283 
 7284 instruct vcastItoX_evex(vec dst, vec src) %{
 7285   predicate(UseAVX > 2 ||
 7286             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7287   match(Set dst (VectorCastI2X src));
 7288   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7289   ins_encode %{
 7290     assert(UseAVX > 0, "required");
 7291 
 7292     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7293     int src_vlen_enc = vector_length_encoding(this, $src);
 7294     int dst_vlen_enc = vector_length_encoding(this);
 7295     switch (dst_elem_bt) {
 7296       case T_BYTE:
 7297         if (!VM_Version::supports_avx512vl()) {
 7298           src_vlen_enc = Assembler::AVX_512bit;
 7299         }
 7300         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7301         break;
 7302       case T_SHORT:
 7303         if (!VM_Version::supports_avx512vl()) {
 7304           src_vlen_enc = Assembler::AVX_512bit;
 7305         }
 7306         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7307         break;
 7308       case T_FLOAT:
 7309         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7310         break;
 7311       case T_LONG:
 7312         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7313         break;
 7314       case T_DOUBLE:
 7315         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7316         break;
 7317       default:
 7318         ShouldNotReachHere();
 7319     }
 7320   %}
 7321   ins_pipe( pipe_slow );
 7322 %}
 7323 
 7324 instruct vcastLtoBS(vec dst, vec src) %{
 7325   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7326             UseAVX <= 2);
 7327   match(Set dst (VectorCastL2X src));
 7328   format %{ "vector_cast_l2x  $dst,$src" %}
 7329   ins_encode %{
 7330     assert(UseAVX > 0, "required");
 7331 
 7332     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7333     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7334     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7335                                                       : ExternalAddress(vector_int_to_short_mask());
 7336     if (vlen <= 16) {
 7337       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7338       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7339       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7340     } else {
 7341       assert(vlen <= 32, "required");
 7342       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7343       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7344       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7345       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7346     }
 7347     if (to_elem_bt == T_BYTE) {
 7348       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7349     }
 7350   %}
 7351   ins_pipe( pipe_slow );
 7352 %}
 7353 
 7354 instruct vcastLtoX_evex(vec dst, vec src) %{
 7355   predicate(UseAVX > 2 ||
 7356             (Matcher::vector_element_basic_type(n) == T_INT ||
 7357              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7358              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7359   match(Set dst (VectorCastL2X src));
 7360   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7361   ins_encode %{
 7362     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7363     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7364     int vlen_enc = vector_length_encoding(this, $src);
 7365     switch (to_elem_bt) {
 7366       case T_BYTE:
 7367         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7368           vlen_enc = Assembler::AVX_512bit;
 7369         }
 7370         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7371         break;
 7372       case T_SHORT:
 7373         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7374           vlen_enc = Assembler::AVX_512bit;
 7375         }
 7376         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7377         break;
 7378       case T_INT:
 7379         if (vlen == 8) {
 7380           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7381             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7382           }
 7383         } else if (vlen == 16) {
 7384           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7385         } else if (vlen == 32) {
 7386           if (UseAVX > 2) {
 7387             if (!VM_Version::supports_avx512vl()) {
 7388               vlen_enc = Assembler::AVX_512bit;
 7389             }
 7390             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7391           } else {
 7392             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7393             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7394           }
 7395         } else { // vlen == 64
 7396           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7397         }
 7398         break;
 7399       case T_FLOAT:
 7400         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7401         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7402         break;
 7403       case T_DOUBLE:
 7404         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7405         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7406         break;
 7407 
 7408       default: assert(false, "%s", type2name(to_elem_bt));
 7409     }
 7410   %}
 7411   ins_pipe( pipe_slow );
 7412 %}
 7413 
 7414 instruct vcastFtoD_reg(vec dst, vec src) %{
 7415   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7416   match(Set dst (VectorCastF2X src));
 7417   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7418   ins_encode %{
 7419     int vlen_enc = vector_length_encoding(this);
 7420     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7421   %}
 7422   ins_pipe( pipe_slow );
 7423 %}
 7424 
 7425 
 7426 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7427   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7428             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7429   match(Set dst (VectorCastF2X src));
 7430   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7431   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7432   ins_encode %{
 7433     int vlen_enc = vector_length_encoding(this, $src);
 7434     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7435     // JDK-8292878 removed the explicit scratch register that was previously needed to load
 7436     // addresses wider than 32 bits for register-indirect addressing, since stub constants
 7437     // live in the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7438     // Targets are free to raise that limit, but a code cache larger than 2G is unreasonable
 7439     // in practice. On the flip side, with the given cap we save a temporary register
 7440     // allocation, which in the limiting case can prevent spilling in blocks with high
 7441     // register pressure.
 7442     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7443                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7444                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7445   %}
 7446   ins_pipe( pipe_slow );
 7447 %}
 7448 
 7449 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7450   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7451             is_integral_type(Matcher::vector_element_basic_type(n)));
 7452   match(Set dst (VectorCastF2X src));
 7453   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7454   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7455   ins_encode %{
 7456     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7457     if (to_elem_bt == T_LONG) {
 7458       int vlen_enc = vector_length_encoding(this);
 7459       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7460                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7461                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7462     } else {
 7463       int vlen_enc = vector_length_encoding(this, $src);
 7464       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7465                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7466                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7467     }
 7468   %}
 7469   ins_pipe( pipe_slow );
 7470 %}
 7471 
 7472 instruct vcastDtoF_reg(vec dst, vec src) %{
 7473   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7474   match(Set dst (VectorCastD2X src));
 7475   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7476   ins_encode %{
 7477     int vlen_enc = vector_length_encoding(this, $src);
 7478     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7479   %}
 7480   ins_pipe( pipe_slow );
 7481 %}
 7482 
 7483 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7484   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7485             is_integral_type(Matcher::vector_element_basic_type(n)));
 7486   match(Set dst (VectorCastD2X src));
 7487   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7488   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7489   ins_encode %{
 7490     int vlen_enc = vector_length_encoding(this, $src);
 7491     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7492     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7493                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7494                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7495   %}
 7496   ins_pipe( pipe_slow );
 7497 %}
 7498 
 7499 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7500   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7501             is_integral_type(Matcher::vector_element_basic_type(n)));
 7502   match(Set dst (VectorCastD2X src));
 7503   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7504   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7505   ins_encode %{
 7506     int vlen_enc = vector_length_encoding(this, $src);
 7507     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7508     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7509                               ExternalAddress(vector_float_signflip());
 7510     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7511                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7512   %}
 7513   ins_pipe( pipe_slow );
 7514 %}
 7515 
 7516 instruct vucast(vec dst, vec src) %{
 7517   match(Set dst (VectorUCastB2X src));
 7518   match(Set dst (VectorUCastS2X src));
 7519   match(Set dst (VectorUCastI2X src));
 7520   format %{ "vector_ucast $dst,$src\t!" %}
 7521   ins_encode %{
 7522     assert(UseAVX > 0, "required");
 7523 
 7524     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7525     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7526     int vlen_enc = vector_length_encoding(this);
 7527     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7528   %}
 7529   ins_pipe( pipe_slow );
 7530 %}
 7531 
 7532 #ifdef _LP64
 7533 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7534   predicate(!VM_Version::supports_avx512vl() &&
 7535             Matcher::vector_length_in_bytes(n) < 64 &&
 7536             Matcher::vector_element_basic_type(n) == T_INT);
 7537   match(Set dst (RoundVF src));
 7538   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7539   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7540   ins_encode %{
 7541     int vlen_enc = vector_length_encoding(this);
 7542     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7543     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7544                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7545                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7546   %}
 7547   ins_pipe( pipe_slow );
 7548 %}
 7549 
 7550 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7551   predicate((VM_Version::supports_avx512vl() ||
 7552              Matcher::vector_length_in_bytes(n) == 64) &&
 7553              Matcher::vector_element_basic_type(n) == T_INT);
 7554   match(Set dst (RoundVF src));
 7555   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7556   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7557   ins_encode %{
 7558     int vlen_enc = vector_length_encoding(this);
 7559     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7560     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7561                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7562                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7563   %}
 7564   ins_pipe( pipe_slow );
 7565 %}
 7566 
 7567 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7568   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7569   match(Set dst (RoundVD src));
 7570   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7571   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7572   ins_encode %{
 7573     int vlen_enc = vector_length_encoding(this);
 7574     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7575     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7576                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7577                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7578   %}
 7579   ins_pipe( pipe_slow );
 7580 %}
 7581 
 7582 #endif // _LP64
 7583 
 7584 // --------------------------------- VectorMaskCmp --------------------------------------
 7585 
 7586 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7587   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7588             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7589             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7590             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7591   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7592   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7593   ins_encode %{
 7594     int vlen_enc = vector_length_encoding(this, $src1);
 7595     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7596     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7597       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7598     } else {
 7599       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7600     }
 7601   %}
 7602   ins_pipe( pipe_slow );
 7603 %}
 7604 
 7605 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7606   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7607             n->bottom_type()->isa_vectmask() == NULL &&
 7608             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7609   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7610   effect(TEMP ktmp);
 7611   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7612   ins_encode %{
 7613     int vlen_enc = Assembler::AVX_512bit;
 7614     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7615     KRegister mask = k0; // The comparison itself is not being masked.
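          // Compare into $ktmp, then expand the k-mask into vector lanes: selected lanes
          // become all-ones, the remaining lanes are zeroed (merge == false).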
 7616     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7617       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7618       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7619     } else {
 7620       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7621       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7622     }
 7623   %}
 7624   ins_pipe( pipe_slow );
 7625 %}
 7626 
 7627 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7628   predicate(n->bottom_type()->isa_vectmask() &&
 7629             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7630   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7631   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7632   ins_encode %{
 7633     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7634     int vlen_enc = vector_length_encoding(this, $src1);
 7635     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7636     KRegister mask = k0; // The comparison itself is not being masked.
 7637     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7638       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7639     } else {
 7640       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7641     }
 7642   %}
 7643   ins_pipe( pipe_slow );
 7644 %}
 7645 
 7646 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7647   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7648             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7649             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7650             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7651             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7652             (n->in(2)->get_int() == BoolTest::eq ||
 7653              n->in(2)->get_int() == BoolTest::lt ||
 7654              n->in(2)->get_int() == BoolTest::gt)); // cond
 7655   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7656   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7657   ins_encode %{
 7658     int vlen_enc = vector_length_encoding(this, $src1);
 7659     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7660     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7661     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7662   %}
 7663   ins_pipe( pipe_slow );
 7664 %}
 7665 
 7666 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7667   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7668             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7669             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7670             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7671             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7672             (n->in(2)->get_int() == BoolTest::ne ||
 7673              n->in(2)->get_int() == BoolTest::le ||
 7674              n->in(2)->get_int() == BoolTest::ge)); // cond
 7675   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7676   effect(TEMP dst, TEMP xtmp);
 7677   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7678   ins_encode %{
 7679     int vlen_enc = vector_length_encoding(this, $src1);
 7680     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7681     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7682     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7683   %}
 7684   ins_pipe( pipe_slow );
 7685 %}
 7686 
 7687 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7688   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7689             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7690             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7691             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7692             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7693   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7694   effect(TEMP dst, TEMP xtmp);
 7695   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7696   ins_encode %{
 7697     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7698     int vlen_enc = vector_length_encoding(this, $src1);
 7699     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7700     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7701 
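          // XOR both operands with the per-element sign bit so the unsigned comparison can be
          // done with the signed packed-compare below. E.g. for bytes, 0x80 (128) vs 0x01 becomes
          // 0x00 vs 0x81 (-127), preserving the unsigned ordering under a signed compare.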
 7702     if (vlen_enc == Assembler::AVX_128bit) {
 7703       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7704     } else {
 7705       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7706     }
 7707     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7708     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7709     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7710   %}
 7711   ins_pipe( pipe_slow );
 7712 %}
 7713 
 7714 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7715   predicate((n->bottom_type()->isa_vectmask() == NULL &&
 7716              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7717              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7718   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7719   effect(TEMP ktmp);
 7720   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7721   ins_encode %{
 7722     assert(UseAVX > 2, "required");
 7723 
 7724     int vlen_enc = vector_length_encoding(this, $src1);
 7725     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7726     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7727     KRegister mask = k0; // The comparison itself is not being masked.
 7728     bool merge = false;
 7729     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7730 
 7731     switch (src1_elem_bt) {
 7732       case T_INT: {
 7733         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7734         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7735         break;
 7736       }
 7737       case T_LONG: {
 7738         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7739         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7740         break;
 7741       }
 7742       default: assert(false, "%s", type2name(src1_elem_bt));
 7743     }
 7744   %}
 7745   ins_pipe( pipe_slow );
 7746 %}
 7747 
 7748 
 7749 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7750   predicate(n->bottom_type()->isa_vectmask() &&
 7751             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7752   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7753   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7754   ins_encode %{
 7755     assert(UseAVX > 2, "required");
 7756     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7757 
 7758     int vlen_enc = vector_length_encoding(this, $src1);
 7759     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7760     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7761     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7762 
 7763     // Compare lane-wise and set the corresponding bits in the destination mask register.
 7764     switch (src1_elem_bt) {
 7765       case T_BYTE: {
 7766         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7767         break;
 7768       }
 7769       case T_SHORT: {
 7770         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7771         break;
 7772       }
 7773       case T_INT: {
 7774         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7775         break;
 7776       }
 7777       case T_LONG: {
 7778         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7779         break;
 7780       }
 7781       default: assert(false, "%s", type2name(src1_elem_bt));
 7782     }
 7783   %}
 7784   ins_pipe( pipe_slow );
 7785 %}
 7786 
 7787 // Extract
 7788 
 7789 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7790   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7791   match(Set dst (ExtractI src idx));
 7792   match(Set dst (ExtractS src idx));
 7793 #ifdef _LP64
 7794   match(Set dst (ExtractB src idx));
 7795 #endif
 7796   format %{ "extractI $dst,$src,$idx\t!" %}
 7797   ins_encode %{
 7798     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7799 
 7800     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7801     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7802   %}
 7803   ins_pipe( pipe_slow );
 7804 %}
 7805 
 7806 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7807   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7808             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7809   match(Set dst (ExtractI src idx));
 7810   match(Set dst (ExtractS src idx));
 7811 #ifdef _LP64
 7812   match(Set dst (ExtractB src idx));
 7813 #endif
 7814   effect(TEMP vtmp);
 7815   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7816   ins_encode %{
 7817     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7818 
 7819     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7820     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7821     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7822   %}
 7823   ins_pipe( pipe_slow );
 7824 %}
 7825 
 7826 #ifdef _LP64
 7827 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7828   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7829   match(Set dst (ExtractL src idx));
 7830   format %{ "extractL $dst,$src,$idx\t!" %}
 7831   ins_encode %{
 7832     assert(UseSSE >= 4, "required");
 7833     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7834 
 7835     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7836   %}
 7837   ins_pipe( pipe_slow );
 7838 %}
 7839 
 7840 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7841   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7842             Matcher::vector_length(n->in(1)) == 8);  // src
 7843   match(Set dst (ExtractL src idx));
 7844   effect(TEMP vtmp);
 7845   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7846   ins_encode %{
 7847     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7848 
 7849     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7850     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7851   %}
 7852   ins_pipe( pipe_slow );
 7853 %}
 7854 #endif
 7855 
 7856 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7857   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7858   match(Set dst (ExtractF src idx));
 7859   effect(TEMP dst, TEMP vtmp);
 7860   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7861   ins_encode %{
 7862     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7863 
 7864     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7865   %}
 7866   ins_pipe( pipe_slow );
 7867 %}
 7868 
 7869 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7870   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7871             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7872   match(Set dst (ExtractF src idx));
 7873   effect(TEMP vtmp);
 7874   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7875   ins_encode %{
 7876     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7877 
 7878     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7879     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7880   %}
 7881   ins_pipe( pipe_slow );
 7882 %}
 7883 
 7884 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7885   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7886   match(Set dst (ExtractD src idx));
 7887   format %{ "extractD $dst,$src,$idx\t!" %}
 7888   ins_encode %{
 7889     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7890 
 7891     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7892   %}
 7893   ins_pipe( pipe_slow );
 7894 %}
 7895 
 7896 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7897   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7898             Matcher::vector_length(n->in(1)) == 8);  // src
 7899   match(Set dst (ExtractD src idx));
 7900   effect(TEMP vtmp);
 7901   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7902   ins_encode %{
 7903     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7904 
 7905     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7906     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7907   %}
 7908   ins_pipe( pipe_slow );
 7909 %}
 7910 
 7911 // --------------------------------- Vector Blend --------------------------------------
 7912 
 7913 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7914   predicate(UseAVX == 0);
 7915   match(Set dst (VectorBlend (Binary dst src) mask));
 7916   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7917   effect(TEMP tmp);
 7918   ins_encode %{
 7919     assert(UseSSE >= 4, "required");
 7920 
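          // pblendvb implicitly uses xmm0 as its mask operand; $tmp is bound to xmm0,
          // so copy the mask there if it is not already.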
 7921     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7922       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7923     }
 7924     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7925   %}
 7926   ins_pipe( pipe_slow );
 7927 %}
 7928 
 7929 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7930   predicate(UseAVX > 0 &&
 7931             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7932             Matcher::vector_length_in_bytes(n) <= 32 &&
 7933             is_integral_type(Matcher::vector_element_basic_type(n)));
 7934   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7935   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7936   ins_encode %{
 7937     int vlen_enc = vector_length_encoding(this);
 7938     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7939   %}
 7940   ins_pipe( pipe_slow );
 7941 %}
 7942 
 7943 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7944   predicate(UseAVX > 0 &&
 7945             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7946             Matcher::vector_length_in_bytes(n) <= 32 &&
 7947             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7948   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7949   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7950   ins_encode %{
 7951     int vlen_enc = vector_length_encoding(this);
 7952     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7953   %}
 7954   ins_pipe( pipe_slow );
 7955 %}
 7956 
 7957 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7958   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7959             n->in(2)->bottom_type()->isa_vectmask() == NULL);
 7960   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7961   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 7962   effect(TEMP ktmp);
 7963   ins_encode %{
 7964     int vlen_enc = Assembler::AVX_512bit;
 7965     BasicType elem_bt = Matcher::vector_element_basic_type(this);
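          // Convert the all-ones/zero vector mask into a k-register with an equality compare,
          // then blend src1/src2 under that k-mask.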
 7966     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7967     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7968   %}
 7969   ins_pipe( pipe_slow );
 7970 %}
 7971 
 7972 
 7973 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7974   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7975             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7976              VM_Version::supports_avx512bw()));
 7977   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7978   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7979   ins_encode %{
 7980     int vlen_enc = vector_length_encoding(this);
 7981     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7982     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7983   %}
 7984   ins_pipe( pipe_slow );
 7985 %}
 7986 
 7987 // --------------------------------- ABS --------------------------------------
 7988 // a = |a|
 7989 instruct vabsB_reg(vec dst, vec src) %{
 7990   match(Set dst (AbsVB  src));
 7991   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7992   ins_encode %{
 7993     uint vlen = Matcher::vector_length(this);
 7994     if (vlen <= 16) {
 7995       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7996     } else {
 7997       int vlen_enc = vector_length_encoding(this);
 7998       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7999     }
 8000   %}
 8001   ins_pipe( pipe_slow );
 8002 %}
 8003 
 8004 instruct vabsS_reg(vec dst, vec src) %{
 8005   match(Set dst (AbsVS  src));
 8006   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8007   ins_encode %{
 8008     uint vlen = Matcher::vector_length(this);
 8009     if (vlen <= 8) {
 8010       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8011     } else {
 8012       int vlen_enc = vector_length_encoding(this);
 8013       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8014     }
 8015   %}
 8016   ins_pipe( pipe_slow );
 8017 %}
 8018 
 8019 instruct vabsI_reg(vec dst, vec src) %{
 8020   match(Set dst (AbsVI  src));
 8021   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8022   ins_encode %{
 8023     uint vlen = Matcher::vector_length(this);
 8024     if (vlen <= 4) {
 8025       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8026     } else {
 8027       int vlen_enc = vector_length_encoding(this);
 8028       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8029     }
 8030   %}
 8031   ins_pipe( pipe_slow );
 8032 %}
 8033 
 8034 instruct vabsL_reg(vec dst, vec src) %{
 8035   match(Set dst (AbsVL  src));
 8036   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8037   ins_encode %{
 8038     assert(UseAVX > 2, "required");
 8039     int vlen_enc = vector_length_encoding(this);
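          // Without AVX512VL, evpabsq is only encodable at 512-bit vector length.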
 8040     if (!VM_Version::supports_avx512vl()) {
 8041       vlen_enc = Assembler::AVX_512bit;
 8042     }
 8043     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8044   %}
 8045   ins_pipe( pipe_slow );
 8046 %}
 8047 
 8048 // --------------------------------- ABSNEG --------------------------------------
 8049 
 8050 instruct vabsnegF(vec dst, vec src) %{
 8051   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8052   match(Set dst (AbsVF src));
 8053   match(Set dst (NegVF src));
 8054   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8055   ins_cost(150);
 8056   ins_encode %{
 8057     int opcode = this->ideal_Opcode();
 8058     int vlen = Matcher::vector_length(this);
 8059     if (vlen == 2) {
 8060       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8061     } else {
 8062       assert(vlen == 8 || vlen == 16, "required");
 8063       int vlen_enc = vector_length_encoding(this);
 8064       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8065     }
 8066   %}
 8067   ins_pipe( pipe_slow );
 8068 %}
 8069 
 8070 instruct vabsneg4F(vec dst) %{
 8071   predicate(Matcher::vector_length(n) == 4);
 8072   match(Set dst (AbsVF dst));
 8073   match(Set dst (NegVF dst));
 8074   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8075   ins_cost(150);
 8076   ins_encode %{
 8077     int opcode = this->ideal_Opcode();
 8078     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8079   %}
 8080   ins_pipe( pipe_slow );
 8081 %}
 8082 
 8083 instruct vabsnegD(vec dst, vec src) %{
 8084   match(Set dst (AbsVD  src));
 8085   match(Set dst (NegVD  src));
 8086   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8087   ins_encode %{
 8088     int opcode = this->ideal_Opcode();
 8089     uint vlen = Matcher::vector_length(this);
 8090     if (vlen == 2) {
 8091       assert(UseSSE >= 2, "required");
 8092       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8093     } else {
 8094       int vlen_enc = vector_length_encoding(this);
 8095       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8096     }
 8097   %}
 8098   ins_pipe( pipe_slow );
 8099 %}
 8100 
 8101 //------------------------------------- VectorTest --------------------------------------------
 8102 
 8103 #ifdef _LP64
 8104 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8105   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8106   match(Set cr (VectorTest src1 src2));
 8107   effect(TEMP vtmp);
 8108   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8109   ins_encode %{
 8110     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8111     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8112     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8113   %}
 8114   ins_pipe( pipe_slow );
 8115 %}
 8116 
 8117 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8118   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8119   match(Set cr (VectorTest src1 src2));
 8120   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8121   ins_encode %{
 8122     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8123     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8124     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8125   %}
 8126   ins_pipe( pipe_slow );
 8127 %}
 8128 
 8129 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8130   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8131              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8132             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8133   match(Set cr (VectorTest src1 src2));
 8134   effect(TEMP tmp);
 8135   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8136   ins_encode %{
 8137     uint masklen = Matcher::vector_length(this, $src1);
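          // All lanes are true iff the low 'masklen' bits of the mask are all set.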
 8138     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8139     __ andl($tmp$$Register, (1 << masklen) - 1);
 8140     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8141   %}
 8142   ins_pipe( pipe_slow );
 8143 %}
 8144 
 8145 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8146   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8147              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8148             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8149   match(Set cr (VectorTest src1 src2));
 8150   effect(TEMP tmp);
 8151   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8152   ins_encode %{
 8153     uint masklen = Matcher::vector_length(this, $src1);
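          // At least one lane is true iff any of the low 'masklen' bits of the mask is set.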
 8154     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8155     __ andl($tmp$$Register, (1 << masklen) - 1);
 8156   %}
 8157   ins_pipe( pipe_slow );
 8158 %}
 8159 
 8160 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8161   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8162             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8163   match(Set cr (VectorTest src1 src2));
 8164   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8165   ins_encode %{
 8166     uint masklen = Matcher::vector_length(this, $src1);
 8167     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8168   %}
 8169   ins_pipe( pipe_slow );
 8170 %}
 8171 #endif
 8172 
 8173 //------------------------------------- LoadMask --------------------------------------------
 8174 
 8175 instruct loadMask(legVec dst, legVec src) %{
 8176   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
 8177   match(Set dst (VectorLoadMask src));
 8178   effect(TEMP dst);
 8179   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8180   ins_encode %{
 8181     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8182     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8183     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8184   %}
 8185   ins_pipe( pipe_slow );
 8186 %}
 8187 
 8188 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8189   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8190   match(Set dst (VectorLoadMask src));
 8191   effect(TEMP xtmp);
 8192   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8193   ins_encode %{
 8194     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8195                         true, Assembler::AVX_512bit);
 8196   %}
 8197   ins_pipe( pipe_slow );
 8198 %}
 8199 
 8200 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8201   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8202   match(Set dst (VectorLoadMask src));
 8203   effect(TEMP xtmp);
 8204   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8205   ins_encode %{
 8206     int vlen_enc = vector_length_encoding(in(1));
 8207     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8208                         false, vlen_enc);
 8209   %}
 8210   ins_pipe( pipe_slow );
 8211 %}
 8212 
 8213 //------------------------------------- StoreMask --------------------------------------------
 8214 
 8215 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8216   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8217   match(Set dst (VectorStoreMask src size));
 8218   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8219   ins_encode %{
 8220     int vlen = Matcher::vector_length(this);
 8221     if (vlen <= 16 && UseAVX <= 2) {
 8222       assert(UseSSE >= 3, "required");
 8223       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8224     } else {
 8225       assert(UseAVX > 0, "required");
 8226       int src_vlen_enc = vector_length_encoding(this, $src);
 8227       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8228     }
 8229   %}
 8230   ins_pipe( pipe_slow );
 8231 %}
 8232 
 8233 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8234   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8235   match(Set dst (VectorStoreMask src size));
 8236   effect(TEMP_DEF dst, TEMP xtmp);
 8237   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8238   ins_encode %{
 8239     int vlen_enc = Assembler::AVX_128bit;
 8240     int vlen = Matcher::vector_length(this);
 8241     if (vlen <= 8) {
 8242       assert(UseSSE >= 3, "required");
 8243       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8244       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8245       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8246     } else {
 8247       assert(UseAVX > 0, "required");
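            // 256-bit source: pack the shorts from both 128-bit lanes down to bytes,
            // then normalize the -1/0 lanes to 1/0 with vpabsb.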
 8248       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8249       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8250       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8251     }
 8252   %}
 8253   ins_pipe( pipe_slow );
 8254 %}
 8255 
 8256 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8257   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8258   match(Set dst (VectorStoreMask src size));
 8259   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8260   effect(TEMP_DEF dst, TEMP xtmp);
 8261   ins_encode %{
 8262     int vlen_enc = Assembler::AVX_128bit;
 8263     int vlen = Matcher::vector_length(this);
 8264     if (vlen <= 4) {
 8265       assert(UseSSE >= 3, "required");
 8266       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8267       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8268       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8269       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8270     } else {
 8271       assert(UseAVX > 0, "required");
 8272       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8273       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8274       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8275       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8276       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8277     }
 8278   %}
 8279   ins_pipe( pipe_slow );
 8280 %}
 8281 
 8282 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8283   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8284   match(Set dst (VectorStoreMask src size));
 8285   effect(TEMP_DEF dst, TEMP xtmp);
 8286   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8287   ins_encode %{
 8288     assert(UseSSE >= 3, "required");
 8289     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8290     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8291     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8292     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8293     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8294   %}
 8295   ins_pipe( pipe_slow );
 8296 %}
 8297 
 8298 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8299   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8300   match(Set dst (VectorStoreMask src size));
 8301   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8302   effect(TEMP_DEF dst, TEMP vtmp);
 8303   ins_encode %{
 8304     int vlen_enc = Assembler::AVX_128bit;
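          // Four long lanes: gather the low dword of each lane into one 128-bit register,
          // pack down to bytes and normalize to 0/1.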
 8305     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8306     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8307     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8308     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8309     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8310     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8311     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8312   %}
 8313   ins_pipe( pipe_slow );
 8314 %}
 8315 
 8316 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8317   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8318   match(Set dst (VectorStoreMask src size));
 8319   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8320   ins_encode %{
 8321     int src_vlen_enc = vector_length_encoding(this, $src);
 8322     int dst_vlen_enc = vector_length_encoding(this);
 8323     if (!VM_Version::supports_avx512vl()) {
 8324       src_vlen_enc = Assembler::AVX_512bit;
 8325     }
 8326     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8327     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8328   %}
 8329   ins_pipe( pipe_slow );
 8330 %}
 8331 
 8332 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8333   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8334   match(Set dst (VectorStoreMask src size));
 8335   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8336   ins_encode %{
 8337     int src_vlen_enc = vector_length_encoding(this, $src);
 8338     int dst_vlen_enc = vector_length_encoding(this);
 8339     if (!VM_Version::supports_avx512vl()) {
 8340       src_vlen_enc = Assembler::AVX_512bit;
 8341     }
 8342     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8343     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8344   %}
 8345   ins_pipe( pipe_slow );
 8346 %}
 8347 
 8348 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8349   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8350   match(Set dst (VectorStoreMask mask size));
 8351   effect(TEMP_DEF dst);
 8352   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8353   ins_encode %{
 8354     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "expected a 512-bit vector mask");
 8355     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8356                  false, Assembler::AVX_512bit, noreg);
 8357     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8358   %}
 8359   ins_pipe( pipe_slow );
 8360 %}
 8361 
 8362 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8363   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8364   match(Set dst (VectorStoreMask mask size));
 8365   effect(TEMP_DEF dst);
 8366   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8367   ins_encode %{
 8368     int dst_vlen_enc = vector_length_encoding(this);
 8369     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8370     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8371   %}
 8372   ins_pipe( pipe_slow );
 8373 %}
 8374 
 8375 instruct vmaskcast_evex(kReg dst) %{
 8376   match(Set dst (VectorMaskCast dst));
 8377   ins_cost(0);
 8378   format %{ "vector_mask_cast $dst" %}
 8379   ins_encode %{
 8380     // empty
 8381   %}
 8382   ins_pipe(empty);
 8383 %}
 8384 
 8385 instruct vmaskcast(vec dst) %{
 8386   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8387   match(Set dst (VectorMaskCast dst));
 8388   ins_cost(0);
 8389   format %{ "vector_mask_cast $dst" %}
 8390   ins_encode %{
 8391     // empty
 8392   %}
 8393   ins_pipe(empty);
 8394 %}
 8395 
 8396 instruct vmaskcast_avx(vec dst, vec src) %{
 8397   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8398   match(Set dst (VectorMaskCast src));
 8399   format %{ "vector_mask_cast $dst, $src" %}
 8400   ins_encode %{
 8401     int vlen = Matcher::vector_length(this);
 8402     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8403     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8404     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8405   %}
 8406   ins_pipe(pipe_slow);
 8407 %}
 8408 
 8409 //-------------------------------- Load Iota Indices ----------------------------------
 8410 
 8411 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8412   match(Set dst (VectorLoadConst src));
 8413   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8414   ins_encode %{
 8415      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8416      BasicType bt = Matcher::vector_element_basic_type(this);
 8417      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8418   %}
 8419   ins_pipe( pipe_slow );
 8420 %}
 8421 
 8422 #ifdef _LP64
 8423 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8424   match(Set dst (PopulateIndex src1 src2));
 8425   effect(TEMP dst, TEMP vtmp);
 8426   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8427   ins_encode %{
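           // dst[i] = src1 + i (src2 is asserted to be 1): broadcast the start value,
           // load the iota constants, then add.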
 8428      assert($src2$$constant == 1, "required");
 8429      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8430      int vlen_enc = vector_length_encoding(this);
 8431      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8432      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8433      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8434      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8435   %}
 8436   ins_pipe( pipe_slow );
 8437 %}
 8438 
 8439 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8440   match(Set dst (PopulateIndex src1 src2));
 8441   effect(TEMP dst, TEMP vtmp);
 8442   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8443   ins_encode %{
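           // dst[i] = src1 + i (src2 is asserted to be 1): broadcast the start value,
           // load the iota constants, then add.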
 8444      assert($src2$$constant == 1, "required");
 8445      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8446      int vlen_enc = vector_length_encoding(this);
 8447      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8448      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8449      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8450      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8451   %}
 8452   ins_pipe( pipe_slow );
 8453 %}
 8454 #endif
 8455 //-------------------------------- Rearrange ----------------------------------
 8456 
 8457 // LoadShuffle/Rearrange for Byte
 8458 instruct rearrangeB(vec dst, vec shuffle) %{
 8459   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8460             Matcher::vector_length(n) < 32);
 8461   match(Set dst (VectorRearrange dst shuffle));
 8462   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8463   ins_encode %{
 8464     assert(UseSSE >= 4, "required");
 8465     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8466   %}
 8467   ins_pipe( pipe_slow );
 8468 %}
 8469 
 8470 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8471   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8472             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8473   match(Set dst (VectorRearrange src shuffle));
 8474   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8475   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8476   ins_encode %{
 8477     assert(UseAVX >= 2, "required");
 8478     // Swap src into vtmp1
 8479     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8480     // Shuffle swapped src to get entries from other 128 bit lane
 8481     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8482     // Shuffle original src to get entries from self 128 bit lane
 8483     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8484     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8485     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8486     // Perform the blend
 8487     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8488   %}
 8489   ins_pipe( pipe_slow );
 8490 %}
 8491 
 8492 
 8493 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8494   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8495             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8496   match(Set dst (VectorRearrange src shuffle));
 8497   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8498   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8499   ins_encode %{
 8500     int vlen_enc = vector_length_encoding(this);
 8501     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8502                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8503                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8504   %}
 8505   ins_pipe( pipe_slow );
 8506 %}
 8507 
 8508 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8509   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8510             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8511   match(Set dst (VectorRearrange src shuffle));
 8512   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8513   ins_encode %{
 8514     int vlen_enc = vector_length_encoding(this);
 8515     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8516   %}
 8517   ins_pipe( pipe_slow );
 8518 %}
 8519 
 8520 // LoadShuffle/Rearrange for Short
 8521 
 8522 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8523   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8524             !VM_Version::supports_avx512bw());
 8525   match(Set dst (VectorLoadShuffle src));
 8526   effect(TEMP dst, TEMP vtmp);
 8527   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8528   ins_encode %{
 8529     // Create a byte shuffle mask from the short shuffle mask,
 8530     // since only a byte shuffle instruction is available on these platforms.
 8531     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8532     if (UseAVX == 0) {
 8533       assert(vlen_in_bytes <= 16, "required");
 8534       // Multiply each shuffle by two to get byte index
 8535       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8536       __ psllw($vtmp$$XMMRegister, 1);
 8537 
 8538       // Duplicate to create 2 copies of byte index
 8539       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8540       __ psllw($dst$$XMMRegister, 8);
 8541       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8542 
 8543       // Add one to get alternate byte index
 8544       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8545       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8546     } else {
 8547       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8548       int vlen_enc = vector_length_encoding(this);
 8549       // Multiply each shuffle by two to get byte index
 8550       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8551 
 8552       // Duplicate to create 2 copies of byte index
 8553       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8554       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8555 
 8556       // Add one to get alternate byte index
 8557       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8558     }
 8559   %}
 8560   ins_pipe( pipe_slow );
 8561 %}
 8562 
 8563 instruct rearrangeS(vec dst, vec shuffle) %{
 8564   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8565             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8566   match(Set dst (VectorRearrange dst shuffle));
 8567   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8568   ins_encode %{
 8569     assert(UseSSE >= 4, "required");
 8570     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8571   %}
 8572   ins_pipe( pipe_slow );
 8573 %}
 8574 
 8575 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8576   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8577             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8578   match(Set dst (VectorRearrange src shuffle));
 8579   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8580   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8581   ins_encode %{
 8582     assert(UseAVX >= 2, "required");
 8583     // Swap src into vtmp1
 8584     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8585     // Shuffle swapped src to get entries from other 128 bit lane
 8586     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8587     // Shuffle original src to get entries from self 128 bit lane
 8588     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8589     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8590     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8591     // Perform the blend
 8592     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8593   %}
 8594   ins_pipe( pipe_slow );
 8595 %}
 8596 
 8597 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8598   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8599             VM_Version::supports_avx512bw());
 8600   match(Set dst (VectorRearrange src shuffle));
 8601   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8602   ins_encode %{
 8603     int vlen_enc = vector_length_encoding(this);
 8604     if (!VM_Version::supports_avx512vl()) {
 8605       vlen_enc = Assembler::AVX_512bit;
 8606     }
 8607     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8608   %}
 8609   ins_pipe( pipe_slow );
 8610 %}
 8611 
 8612 // LoadShuffle/Rearrange for Integer and Float
 8613 
 8614 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8615   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8616             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8617   match(Set dst (VectorLoadShuffle src));
 8618   effect(TEMP dst, TEMP vtmp);
 8619   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8620   ins_encode %{
 8621     assert(UseSSE >= 4, "required");
 8622 
 8623     // Create a byte shuffle mask from the int shuffle mask,
 8624     // since only a byte shuffle instruction is available on these platforms.
 8625 
 8626     // Duplicate and multiply each shuffle by 4
 8627     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8628     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8629     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8630     __ psllw($vtmp$$XMMRegister, 2);
 8631 
 8632     // Duplicate again to create 4 copies of byte index
 8633     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8634     __ psllw($dst$$XMMRegister, 8);
 8635     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8636 
 8637     // Add 3,2,1,0 to get alternate byte index
 8638     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8639     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8640   %}
 8641   ins_pipe( pipe_slow );
 8642 %}
 8643 
 8644 instruct rearrangeI(vec dst, vec shuffle) %{
 8645   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8646             UseAVX == 0);
 8647   match(Set dst (VectorRearrange dst shuffle));
 8648   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8649   ins_encode %{
 8650     assert(UseSSE >= 4, "required");
 8651     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8652   %}
 8653   ins_pipe( pipe_slow );
 8654 %}
 8655 
 8656 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8657   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8658             UseAVX > 0);
 8659   match(Set dst (VectorRearrange src shuffle));
 8660   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8661   ins_encode %{
 8662     int vlen_enc = vector_length_encoding(this);
 8663     BasicType bt = Matcher::vector_element_basic_type(this);
 8664     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8665   %}
 8666   ins_pipe( pipe_slow );
 8667 %}
 8668 
 8669 // LoadShuffle/Rearrange for Long and Double
 8670 
 8671 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8672   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8673             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8674   match(Set dst (VectorLoadShuffle src));
 8675   effect(TEMP dst, TEMP vtmp);
 8676   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8677   ins_encode %{
 8678     assert(UseAVX >= 2, "required");
 8679 
 8680     int vlen_enc = vector_length_encoding(this);
 8681     // Create a double-word shuffle mask from the long shuffle mask,
 8682     // since only a double-word shuffle instruction is available on these platforms.
 8683 
 8684     // Multiply each shuffle by two to get double word index
 8685     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8686 
 8687     // Duplicate each double word shuffle
 8688     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8689     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8690 
 8691     // Add one to get alternate double word index
 8692     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8693   %}
 8694   ins_pipe( pipe_slow );
 8695 %}
 8696 
 8697 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8698   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8699             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8700   match(Set dst (VectorRearrange src shuffle));
 8701   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8702   ins_encode %{
 8703     assert(UseAVX >= 2, "required");
 8704 
 8705     int vlen_enc = vector_length_encoding(this);
 8706     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8707   %}
 8708   ins_pipe( pipe_slow );
 8709 %}
 8710 
 8711 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8712   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8713             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8714   match(Set dst (VectorRearrange src shuffle));
 8715   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8716   ins_encode %{
 8717     assert(UseAVX > 2, "required");
 8718 
 8719     int vlen_enc = vector_length_encoding(this);
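          // The variable-index vpermq form has no 128-bit encoding, so promote to 256-bit.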
 8720     if (vlen_enc == Assembler::AVX_128bit) {
 8721       vlen_enc = Assembler::AVX_256bit;
 8722     }
 8723     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8724   %}
 8725   ins_pipe( pipe_slow );
 8726 %}
 8727 
 8728 // --------------------------------- FMA --------------------------------------
 8729 // a * b + c
 8730 
 8731 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8732   match(Set c (FmaVF  c (Binary a b)));
 8733   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8734   ins_cost(150);
 8735   ins_encode %{
 8736     assert(UseFMA, "not enabled");
 8737     int vlen_enc = vector_length_encoding(this);
 8738     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8739   %}
 8740   ins_pipe( pipe_slow );
 8741 %}
 8742 
 8743 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8744   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8745   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8746   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8747   ins_cost(150);
 8748   ins_encode %{
 8749     assert(UseFMA, "not enabled");
 8750     int vlen_enc = vector_length_encoding(this);
 8751     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8752   %}
 8753   ins_pipe( pipe_slow );
 8754 %}
 8755 
 8756 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8757   match(Set c (FmaVD  c (Binary a b)));
 8758   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8759   ins_cost(150);
 8760   ins_encode %{
 8761     assert(UseFMA, "not enabled");
 8762     int vlen_enc = vector_length_encoding(this);
 8763     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8764   %}
 8765   ins_pipe( pipe_slow );
 8766 %}
 8767 
 8768 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8769   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8770   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8771   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8772   ins_cost(150);
 8773   ins_encode %{
 8774     assert(UseFMA, "not enabled");
 8775     int vlen_enc = vector_length_encoding(this);
 8776     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8777   %}
 8778   ins_pipe( pipe_slow );
 8779 %}
 8780 
 8781 // --------------------------------- Vector Multiply Add --------------------------------------
 8782 
 8783 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8784   predicate(UseAVX == 0);
 8785   match(Set dst (MulAddVS2VI dst src1));
 8786   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8787   ins_encode %{
 8788     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8789   %}
 8790   ins_pipe( pipe_slow );
 8791 %}
 8792 
 8793 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8794   predicate(UseAVX > 0);
 8795   match(Set dst (MulAddVS2VI src1 src2));
 8796   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8797   ins_encode %{
 8798     int vlen_enc = vector_length_encoding(this);
 8799     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8800   %}
 8801   ins_pipe( pipe_slow );
 8802 %}
 8803 
 8804 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8805 
 8806 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8807   predicate(VM_Version::supports_avx512_vnni());
 8808   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8809   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8810   ins_encode %{
 8811     assert(UseAVX > 2, "required");
 8812     int vlen_enc = vector_length_encoding(this);
 8813     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8814   %}
 8815   ins_pipe( pipe_slow );
 8816   ins_cost(10);
 8817 %}
 8818 
 8819 // --------------------------------- PopCount --------------------------------------
 8820 
 8821 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8822   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8823   match(Set dst (PopCountVI src));
 8824   match(Set dst (PopCountVL src));
 8825   format %{ "vector_popcount_integral $dst, $src" %}
 8826   ins_encode %{
 8827     int opcode = this->ideal_Opcode();
 8828     int vlen_enc = vector_length_encoding(this, $src);
 8829     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8830     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8831   %}
 8832   ins_pipe( pipe_slow );
 8833 %}
 8834 
 8835 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8836   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8837   match(Set dst (PopCountVI src mask));
 8838   match(Set dst (PopCountVL src mask));
 8839   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8840   ins_encode %{
 8841     int vlen_enc = vector_length_encoding(this, $src);
 8842     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8843     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8844     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8845   %}
 8846   ins_pipe( pipe_slow );
 8847 %}
 8848 
 8849 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8850   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8851   match(Set dst (PopCountVI src));
 8852   match(Set dst (PopCountVL src));
 8853   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8854   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8855   ins_encode %{
 8856     int opcode = this->ideal_Opcode();
 8857     int vlen_enc = vector_length_encoding(this, $src);
 8858     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8859     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8860                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8861   %}
 8862   ins_pipe( pipe_slow );
 8863 %}
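      // Note (illustrative): PopCountVI/PopCountVL typically originate from Integer.bitCount /
      // Long.bitCount in vectorizable loops, or from VectorOperators.BIT_COUNT in the Vector
      // API. A hedged sketch (hypothetical name):
      //
      //   static void bitCounts(int[] in, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = Integer.bitCount(in[i]);   // PopCountVI when the loop is vectorized
      //     }
      //   }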
 8864 
 8865 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8866 
 8867 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8868   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8869                                               Matcher::vector_length_in_bytes(n->in(1))));
 8870   match(Set dst (CountTrailingZerosV src));
 8871   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8872   ins_cost(400);
 8873   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8874   ins_encode %{
 8875     int vlen_enc = vector_length_encoding(this, $src);
 8876     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8877     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8878                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8879   %}
 8880   ins_pipe( pipe_slow );
 8881 %}
 8882 
 8883 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8884   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8885             VM_Version::supports_avx512cd() &&
 8886             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8887   match(Set dst (CountTrailingZerosV src));
 8888   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8889   ins_cost(400);
 8890   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8891   ins_encode %{
 8892     int vlen_enc = vector_length_encoding(this, $src);
 8893     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8894     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8895                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8896   %}
 8897   ins_pipe( pipe_slow );
 8898 %}
 8899 
 8900 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8901   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8902   match(Set dst (CountTrailingZerosV src));
 8903   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8904   ins_cost(400);
 8905   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8906   ins_encode %{
 8907     int vlen_enc = vector_length_encoding(this, $src);
 8908     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8909     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8910                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8911                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8912   %}
 8913   ins_pipe( pipe_slow );
 8914 %}
 8915 
 8916 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8917   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8918   match(Set dst (CountTrailingZerosV src));
 8919   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8920   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8921   ins_encode %{
 8922     int vlen_enc = vector_length_encoding(this, $src);
 8923     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8924     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8925                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8926   %}
 8927   ins_pipe( pipe_slow );
 8928 %}
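      // Note (illustrative): CountTrailingZerosV commonly comes from
      // Integer/Long.numberOfTrailingZeros in a vectorized loop, or from
      // VectorOperators.TRAILING_ZEROS_COUNT in the Vector API. A hedged sketch (hypothetical
      // name):
      //
      //   static void ntz(int[] in, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = Integer.numberOfTrailingZeros(in[i]);
      //     }
      //   }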
 8929 
 8930 
 8931 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8932 
 8933 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8934   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8935   effect(TEMP dst);
 8936   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8937   ins_encode %{
 8938     int vector_len = vector_length_encoding(this);
 8939     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8940   %}
 8941   ins_pipe( pipe_slow );
 8942 %}
 8943 
 8944 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8945   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8946   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8947   effect(TEMP dst);
 8948   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8949   ins_encode %{
 8950     int vector_len = vector_length_encoding(this);
 8951     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8952   %}
 8953   ins_pipe( pipe_slow );
 8954 %}
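      // Note (illustrative): MacroLogicV is created by C2's macro-logic optimization, which
      // folds a chain of element-wise AND/OR/XOR/NOT into one three-input truth table (the
      // immU8 func) so it can be emitted as a single vpternlogd. A hedged sketch of a foldable
      // expression (hypothetical name; the folding decision is made by the compiler, not by
      // the source shape alone):
      //
      //   static void blend(int[] a, int[] b, int[] c, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = (a[i] & b[i]) | (~a[i] & c[i]);   // one 3-input boolean function per lane
      //     }
      //   }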
 8955 
 8956 // --------------------------------- Rotation Operations ----------------------------------
 8957 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8958   match(Set dst (RotateLeftV src shift));
 8959   match(Set dst (RotateRightV src shift));
 8960   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8961   ins_encode %{
 8962     int opcode      = this->ideal_Opcode();
 8963     int vector_len  = vector_length_encoding(this);
 8964     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8965     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8966   %}
 8967   ins_pipe( pipe_slow );
 8968 %}
 8969 
 8970 instruct vprotate_var(vec dst, vec src, vec shift) %{
 8971   match(Set dst (RotateLeftV src shift));
 8972   match(Set dst (RotateRightV src shift));
 8973   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8974   ins_encode %{
 8975     int opcode      = this->ideal_Opcode();
 8976     int vector_len  = vector_length_encoding(this);
 8977     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8978     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8979   %}
 8980   ins_pipe( pipe_slow );
 8981 %}
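      // Note (illustrative): RotateLeftV/RotateRightV are typically created from
      // Integer/Long.rotateLeft and rotateRight in vectorizable loops, or from
      // VectorOperators.ROL/ROR in the Vector API. A hedged sketch (hypothetical name):
      //
      //   static void rotate3(int[] in, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = Integer.rotateLeft(in[i], 3);   // constant shift, the vprotate_immI8 shape
      //     }
      //   }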
 8982 
 8983 // ---------------------------------- Masked Operations ------------------------------------
 8984 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8985   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8986   match(Set dst (LoadVectorMasked mem mask));
 8987   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 8988   ins_encode %{
 8989     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8990     int vlen_enc = vector_length_encoding(this);
 8991     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8992   %}
 8993   ins_pipe( pipe_slow );
 8994 %}
 8995 
 8996 
 8997 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8998   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8999   match(Set dst (LoadVectorMasked mem mask));
 9000   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9001   ins_encode %{
 9002     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9003     int vector_len = vector_length_encoding(this);
 9004     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9005   %}
 9006   ins_pipe( pipe_slow );
 9007 %}
 9008 
 9009 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9010   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9011   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9012   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9013   ins_encode %{
 9014     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9015     int vlen_enc = vector_length_encoding(src_node);
 9016     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9017     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9018   %}
 9019   ins_pipe( pipe_slow );
 9020 %}
 9021 
 9022 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9023   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9024   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9025   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9026   ins_encode %{
 9027     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9028     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9029     int vlen_enc = vector_length_encoding(src_node);
 9030     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9031   %}
 9032   ins_pipe( pipe_slow );
 9033 %}
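      // Note (illustrative): LoadVectorMasked/StoreVectorMasked back the masked memory
      // operations of the Vector API (jdk.incubator.vector). A hedged usage sketch; the
      // species and method names below are illustrative only:
      //
      //   import jdk.incubator.vector.*;
      //
      //   static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
      //
      //   static void maskedCopy(int[] src, int[] dst, int i) {
      //     VectorMask<Integer> m = I_SPECIES.indexInRange(i, src.length);
      //     IntVector v = IntVector.fromArray(I_SPECIES, src, i, m);   // masked load
      //     v.intoArray(dst, i, m);                                    // masked store
      //   }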
 9034 
 9035 #ifdef _LP64
 9036 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9037   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9038   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9039   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9040   ins_encode %{
 9041     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9042     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9043 
 9044     Label DONE;
 9045     int vlen_enc = vector_length_encoding(this, $src1);
 9046     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9047 
 9048     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9049     __ mov64($dst$$Register, -1L);
 9050     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9051     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9052     __ jccb(Assembler::carrySet, DONE);
 9053     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9054     __ notq($dst$$Register);
 9055     __ tzcntq($dst$$Register, $dst$$Register);
 9056     __ bind(DONE);
 9057   %}
 9058   ins_pipe( pipe_slow );
 9059 %}
 9060 
 9061 
 9062 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
 9063   match(Set dst (VectorMaskGen len));
 9064   effect(TEMP temp);
 9065   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9066   ins_encode %{
 9067     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9068   %}
 9069   ins_pipe( pipe_slow );
 9070 %}
 9071 
 9072 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9073   match(Set dst (VectorMaskGen len));
 9074   format %{ "vector_mask_gen $dst, $len \t! vector mask generator" %}
 9075   effect(TEMP temp);
 9076   ins_encode %{
 9077     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9078     __ kmovql($dst$$KRegister, $temp$$Register);
 9079   %}
 9080   ins_pipe( pipe_slow );
 9081 %}
 9082 
 9083 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9084   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9085   match(Set dst (VectorMaskToLong mask));
 9086   effect(TEMP dst, KILL cr);
 9087   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9088   ins_encode %{
 9089     int opcode = this->ideal_Opcode();
 9090     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9091     int mask_len = Matcher::vector_length(this, $mask);
 9092     int mask_size = mask_len * type2aelembytes(mbt);
 9093     int vlen_enc = vector_length_encoding(this, $mask);
 9094     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9095                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9096   %}
 9097   ins_pipe( pipe_slow );
 9098 %}
 9099 
 9100 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9101   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9102   match(Set dst (VectorMaskToLong mask));
 9103   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9104   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9105   ins_encode %{
 9106     int opcode = this->ideal_Opcode();
 9107     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9108     int mask_len = Matcher::vector_length(this, $mask);
 9109     int vlen_enc = vector_length_encoding(this, $mask);
 9110     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9111                              $dst$$Register, mask_len, mbt, vlen_enc);
 9112   %}
 9113   ins_pipe( pipe_slow );
 9114 %}
 9115 
 9116 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9117   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9118   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9119   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9120   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9121   ins_encode %{
 9122     int opcode = this->ideal_Opcode();
 9123     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9124     int mask_len = Matcher::vector_length(this, $mask);
 9125     int vlen_enc = vector_length_encoding(this, $mask);
 9126     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9127                              $dst$$Register, mask_len, mbt, vlen_enc);
 9128   %}
 9129   ins_pipe( pipe_slow );
 9130 %}
 9131 
 9132 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9133   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9134   match(Set dst (VectorMaskTrueCount mask));
 9135   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9136   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9137   ins_encode %{
 9138     int opcode = this->ideal_Opcode();
 9139     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9140     int mask_len = Matcher::vector_length(this, $mask);
 9141     int mask_size = mask_len * type2aelembytes(mbt);
 9142     int vlen_enc = vector_length_encoding(this, $mask);
 9143     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9144                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9145   %}
 9146   ins_pipe( pipe_slow );
 9147 %}
 9148 
 9149 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9150   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9151   match(Set dst (VectorMaskTrueCount mask));
 9152   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9153   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9154   ins_encode %{
 9155     int opcode = this->ideal_Opcode();
 9156     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9157     int mask_len = Matcher::vector_length(this, $mask);
 9158     int vlen_enc = vector_length_encoding(this, $mask);
 9159     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9160                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9161   %}
 9162   ins_pipe( pipe_slow );
 9163 %}
 9164 
 9165 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9166   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9167   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9168   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9169   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9170   ins_encode %{
 9171     int opcode = this->ideal_Opcode();
 9172     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9173     int mask_len = Matcher::vector_length(this, $mask);
 9174     int vlen_enc = vector_length_encoding(this, $mask);
 9175     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9176                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9177   %}
 9178   ins_pipe( pipe_slow );
 9179 %}
 9180 
 9181 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9182   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9183   match(Set dst (VectorMaskFirstTrue mask));
 9184   match(Set dst (VectorMaskLastTrue mask));
 9185   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9186   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9187   ins_encode %{
 9188     int opcode = this->ideal_Opcode();
 9189     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9190     int mask_len = Matcher::vector_length(this, $mask);
 9191     int mask_size = mask_len * type2aelembytes(mbt);
 9192     int vlen_enc = vector_length_encoding(this, $mask);
 9193     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9194                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9195   %}
 9196   ins_pipe( pipe_slow );
 9197 %}
 9198 
 9199 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9200   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9201   match(Set dst (VectorMaskFirstTrue mask));
 9202   match(Set dst (VectorMaskLastTrue mask));
 9203   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9204   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9205   ins_encode %{
 9206     int opcode = this->ideal_Opcode();
 9207     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9208     int mask_len = Matcher::vector_length(this, $mask);
 9209     int vlen_enc = vector_length_encoding(this, $mask);
 9210     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9211                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9212   %}
 9213   ins_pipe( pipe_slow );
 9214 %}
 9215 
 9216 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9217   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9218   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9219   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9220   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9221   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9222   ins_encode %{
 9223     int opcode = this->ideal_Opcode();
 9224     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9225     int mask_len = Matcher::vector_length(this, $mask);
 9226     int vlen_enc = vector_length_encoding(this, $mask);
 9227     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9228                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9229   %}
 9230   ins_pipe( pipe_slow );
 9231 %}
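      // Note (illustrative): VectorMaskToLong, VectorMaskTrueCount and
      // VectorMaskFirstTrue/LastTrue back the corresponding VectorMask query methods of the
      // Vector API. A hedged sketch (illustrative species and names):
      //
      //   import jdk.incubator.vector.*;
      //
      //   static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
      //
      //   static void maskQueries(int[] a, int[] b) {
      //     VectorMask<Integer> m = IntVector.fromArray(I_SPECIES, a, 0)
      //         .compare(VectorOperators.LT, IntVector.fromArray(I_SPECIES, b, 0));
      //     long bits  = m.toLong();      // VectorMaskToLong
      //     int  count = m.trueCount();   // VectorMaskTrueCount
      //     int  first = m.firstTrue();   // VectorMaskFirstTrue
      //     int  last  = m.lastTrue();    // VectorMaskLastTrue
      //   }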
 9232 
 9233 // --------------------------------- Compress/Expand Operations ---------------------------
 9234 
 9235 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9236   match(Set dst (CompressV src mask));
 9237   match(Set dst (ExpandV src mask));
 9238   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9239   ins_encode %{
 9240     int opcode = this->ideal_Opcode();
 9241     int vector_len = vector_length_encoding(this);
 9242     BasicType bt  = Matcher::vector_element_basic_type(this);
 9243     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9244   %}
 9245   ins_pipe( pipe_slow );
 9246 %}
 9247 
 9248 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9249   match(Set dst (CompressM mask));
 9250   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9251   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9252   ins_encode %{
 9253     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9254     int mask_len = Matcher::vector_length(this);
 9255     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9256   %}
 9257   ins_pipe( pipe_slow );
 9258 %}
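      // Note (illustrative): CompressV/ExpandV and CompressM back the compress/expand
      // operations added to the Vector API in JDK 19. A hedged sketch (illustrative species
      // and names):
      //
      //   import jdk.incubator.vector.*;
      //
      //   static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
      //
      //   static void packPositives(int[] a, int[] out) {
      //     IntVector v = IntVector.fromArray(I_SPECIES, a, 0);
      //     VectorMask<Integer> m = v.compare(VectorOperators.GT, 0);
      //     IntVector packed = (IntVector) v.compress(m);   // selected lanes moved to the low end
      //     packed.intoArray(out, 0);
      //   }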
 9259 
 9260 #endif // _LP64
 9261 
 9262 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9263 
 9264 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9265   predicate(!VM_Version::supports_gfni());
 9266   match(Set dst (ReverseV src));
 9267   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9268   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9269   ins_encode %{
 9270     int vec_enc = vector_length_encoding(this);
 9271     BasicType bt = Matcher::vector_element_basic_type(this);
 9272     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9273                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9274   %}
 9275   ins_pipe( pipe_slow );
 9276 %}
 9277 
 9278 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9279   predicate(VM_Version::supports_gfni());
 9280   match(Set dst (ReverseV src));
 9281   effect(TEMP dst, TEMP xtmp);
 9282   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9283   ins_encode %{
 9284     int vec_enc = vector_length_encoding(this);
 9285     BasicType bt  = Matcher::vector_element_basic_type(this);
 9286     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9287     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9288                                $xtmp$$XMMRegister);
 9289   %}
 9290   ins_pipe( pipe_slow );
 9291 %}
 9292 
 9293 instruct vreverse_byte_reg(vec dst, vec src) %{
 9294   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9295   match(Set dst (ReverseBytesV src));
 9296   effect(TEMP dst);
 9297   format %{ "vector_reverse_byte $dst, $src" %}
 9298   ins_encode %{
 9299     int vec_enc = vector_length_encoding(this);
 9300     BasicType bt = Matcher::vector_element_basic_type(this);
 9301     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9302   %}
 9303   ins_pipe( pipe_slow );
 9304 %}
 9305 
 9306 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9307   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9308   match(Set dst (ReverseBytesV src));
 9309   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9310   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9311   ins_encode %{
 9312     int vec_enc = vector_length_encoding(this);
 9313     BasicType bt = Matcher::vector_element_basic_type(this);
 9314     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9315                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9316   %}
 9317   ins_pipe( pipe_slow );
 9318 %}
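      // Note (illustrative): ReverseV and ReverseBytesV typically come from
      // Integer/Long.reverse and reverseBytes in vectorizable loops, or from
      // VectorOperators.REVERSE / REVERSE_BYTES in the Vector API. A hedged sketch
      // (hypothetical name):
      //
      //   static void byteSwap(int[] in, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = Integer.reverseBytes(in[i]);   // ReverseBytesV when vectorized
      //     }
      //   }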
 9319 
 9320 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9321 
 9322 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9323   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9324                                               Matcher::vector_length_in_bytes(n->in(1))));
 9325   match(Set dst (CountLeadingZerosV src));
 9326   format %{ "vector_count_leading_zeros $dst, $src" %}
 9327   ins_encode %{
 9328      int vlen_enc = vector_length_encoding(this, $src);
 9329      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9330      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9331                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9332   %}
 9333   ins_pipe( pipe_slow );
 9334 %}
 9335 
 9336 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9337   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9338                                               Matcher::vector_length_in_bytes(n->in(1))));
 9339   match(Set dst (CountLeadingZerosV src mask));
 9340   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9341   ins_encode %{
 9342     int vlen_enc = vector_length_encoding(this, $src);
 9343     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9344     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9345     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9346                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9347   %}
 9348   ins_pipe( pipe_slow );
 9349 %}
 9350 
 9351 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9352   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9353             VM_Version::supports_avx512cd() &&
 9354             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9355   match(Set dst (CountLeadingZerosV src));
 9356   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9357   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9358   ins_encode %{
 9359     int vlen_enc = vector_length_encoding(this, $src);
 9360     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9361     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9362                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9363   %}
 9364   ins_pipe( pipe_slow );
 9365 %}
 9366 
 9367 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9368   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9369   match(Set dst (CountLeadingZerosV src));
 9370   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9371   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9372   ins_encode %{
 9373     int vlen_enc = vector_length_encoding(this, $src);
 9374     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9375     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9376                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9377                                        $rtmp$$Register, true, vlen_enc);
 9378   %}
 9379   ins_pipe( pipe_slow );
 9380 %}
 9381 
 9382 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9383   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9384             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9385   match(Set dst (CountLeadingZerosV src));
 9386   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9387   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9388   ins_encode %{
 9389     int vlen_enc = vector_length_encoding(this, $src);
 9390     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9391     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9392                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9393   %}
 9394   ins_pipe( pipe_slow );
 9395 %}
 9396 
 9397 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9398   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9399             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9400   match(Set dst (CountLeadingZerosV src));
 9401   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9402   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9403   ins_encode %{
 9404     int vlen_enc = vector_length_encoding(this, $src);
 9405     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9406     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9407                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9408   %}
 9409   ins_pipe( pipe_slow );
 9410 %}
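      // Note (illustrative): CountLeadingZerosV commonly comes from
      // Integer/Long.numberOfLeadingZeros in a vectorized loop, or from
      // VectorOperators.LEADING_ZEROS_COUNT in the Vector API. A hedged sketch (hypothetical
      // name):
      //
      //   static void nlz(int[] in, int[] out) {
      //     for (int i = 0; i < out.length; i++) {
      //       out[i] = Integer.numberOfLeadingZeros(in[i]);
      //     }
      //   }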
 9411 
 9412 // ---------------------------------- Vector Masked Operations ------------------------------------
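      // The rules below implement the masked (predicated) forms of the lanewise operations,
      // typically reached through Vector API calls that take a VectorMask argument. A hedged
      // usage sketch (illustrative species and names):
      //
      //   import jdk.incubator.vector.*;
      //
      //   static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
      //
      //   static void maskedAdd(int[] a, int[] b, int[] out, VectorMask<Integer> m) {
      //     IntVector va = IntVector.fromArray(I_SPECIES, a, 0);
      //     IntVector vb = IntVector.fromArray(I_SPECIES, b, 0);
      //     va.lanewise(VectorOperators.ADD, vb, m)   // masked AddVI, a kReg mask on AVX-512
      //       .intoArray(out, 0);
      //   }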
 9413 
 9414 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9415   match(Set dst (AddVB (Binary dst src2) mask));
 9416   match(Set dst (AddVS (Binary dst src2) mask));
 9417   match(Set dst (AddVI (Binary dst src2) mask));
 9418   match(Set dst (AddVL (Binary dst src2) mask));
 9419   match(Set dst (AddVF (Binary dst src2) mask));
 9420   match(Set dst (AddVD (Binary dst src2) mask));
 9421   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9422   ins_encode %{
 9423     int vlen_enc = vector_length_encoding(this);
 9424     BasicType bt = Matcher::vector_element_basic_type(this);
 9425     int opc = this->ideal_Opcode();
 9426     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9427                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9428   %}
 9429   ins_pipe( pipe_slow );
 9430 %}
 9431 
 9432 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9433   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9434   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9435   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9436   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9437   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9438   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9439   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9440   ins_encode %{
 9441     int vlen_enc = vector_length_encoding(this);
 9442     BasicType bt = Matcher::vector_element_basic_type(this);
 9443     int opc = this->ideal_Opcode();
 9444     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9445                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9446   %}
 9447   ins_pipe( pipe_slow );
 9448 %}
 9449 
 9450 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9451   match(Set dst (XorV (Binary dst src2) mask));
 9452   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9453   ins_encode %{
 9454     int vlen_enc = vector_length_encoding(this);
 9455     BasicType bt = Matcher::vector_element_basic_type(this);
 9456     int opc = this->ideal_Opcode();
 9457     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9458                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9459   %}
 9460   ins_pipe( pipe_slow );
 9461 %}
 9462 
 9463 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9464   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9465   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9466   ins_encode %{
 9467     int vlen_enc = vector_length_encoding(this);
 9468     BasicType bt = Matcher::vector_element_basic_type(this);
 9469     int opc = this->ideal_Opcode();
 9470     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9471                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9472   %}
 9473   ins_pipe( pipe_slow );
 9474 %}
 9475 
 9476 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9477   match(Set dst (OrV (Binary dst src2) mask));
 9478   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9479   ins_encode %{
 9480     int vlen_enc = vector_length_encoding(this);
 9481     BasicType bt = Matcher::vector_element_basic_type(this);
 9482     int opc = this->ideal_Opcode();
 9483     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9484                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9485   %}
 9486   ins_pipe( pipe_slow );
 9487 %}
 9488 
 9489 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9490   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9491   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9492   ins_encode %{
 9493     int vlen_enc = vector_length_encoding(this);
 9494     BasicType bt = Matcher::vector_element_basic_type(this);
 9495     int opc = this->ideal_Opcode();
 9496     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9497                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9498   %}
 9499   ins_pipe( pipe_slow );
 9500 %}
 9501 
 9502 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9503   match(Set dst (AndV (Binary dst src2) mask));
 9504   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9505   ins_encode %{
 9506     int vlen_enc = vector_length_encoding(this);
 9507     BasicType bt = Matcher::vector_element_basic_type(this);
 9508     int opc = this->ideal_Opcode();
 9509     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9510                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9511   %}
 9512   ins_pipe( pipe_slow );
 9513 %}
 9514 
 9515 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9516   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9517   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9518   ins_encode %{
 9519     int vlen_enc = vector_length_encoding(this);
 9520     BasicType bt = Matcher::vector_element_basic_type(this);
 9521     int opc = this->ideal_Opcode();
 9522     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9523                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9524   %}
 9525   ins_pipe( pipe_slow );
 9526 %}
 9527 
 9528 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9529   match(Set dst (SubVB (Binary dst src2) mask));
 9530   match(Set dst (SubVS (Binary dst src2) mask));
 9531   match(Set dst (SubVI (Binary dst src2) mask));
 9532   match(Set dst (SubVL (Binary dst src2) mask));
 9533   match(Set dst (SubVF (Binary dst src2) mask));
 9534   match(Set dst (SubVD (Binary dst src2) mask));
 9535   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9536   ins_encode %{
 9537     int vlen_enc = vector_length_encoding(this);
 9538     BasicType bt = Matcher::vector_element_basic_type(this);
 9539     int opc = this->ideal_Opcode();
 9540     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9541                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9542   %}
 9543   ins_pipe( pipe_slow );
 9544 %}
 9545 
 9546 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9547   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9548   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9549   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9550   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9551   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9552   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9553   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9554   ins_encode %{
 9555     int vlen_enc = vector_length_encoding(this);
 9556     BasicType bt = Matcher::vector_element_basic_type(this);
 9557     int opc = this->ideal_Opcode();
 9558     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9559                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9560   %}
 9561   ins_pipe( pipe_slow );
 9562 %}
 9563 
 9564 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9565   match(Set dst (MulVS (Binary dst src2) mask));
 9566   match(Set dst (MulVI (Binary dst src2) mask));
 9567   match(Set dst (MulVL (Binary dst src2) mask));
 9568   match(Set dst (MulVF (Binary dst src2) mask));
 9569   match(Set dst (MulVD (Binary dst src2) mask));
 9570   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9571   ins_encode %{
 9572     int vlen_enc = vector_length_encoding(this);
 9573     BasicType bt = Matcher::vector_element_basic_type(this);
 9574     int opc = this->ideal_Opcode();
 9575     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9576                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9577   %}
 9578   ins_pipe( pipe_slow );
 9579 %}
 9580 
 9581 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9582   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9583   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9584   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9585   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9586   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9587   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9588   ins_encode %{
 9589     int vlen_enc = vector_length_encoding(this);
 9590     BasicType bt = Matcher::vector_element_basic_type(this);
 9591     int opc = this->ideal_Opcode();
 9592     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9593                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9594   %}
 9595   ins_pipe( pipe_slow );
 9596 %}
 9597 
 9598 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9599   match(Set dst (SqrtVF dst mask));
 9600   match(Set dst (SqrtVD dst mask));
 9601   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9602   ins_encode %{
 9603     int vlen_enc = vector_length_encoding(this);
 9604     BasicType bt = Matcher::vector_element_basic_type(this);
 9605     int opc = this->ideal_Opcode();
 9606     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9607                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9608   %}
 9609   ins_pipe( pipe_slow );
 9610 %}
 9611 
 9612 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9613   match(Set dst (DivVF (Binary dst src2) mask));
 9614   match(Set dst (DivVD (Binary dst src2) mask));
 9615   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9616   ins_encode %{
 9617     int vlen_enc = vector_length_encoding(this);
 9618     BasicType bt = Matcher::vector_element_basic_type(this);
 9619     int opc = this->ideal_Opcode();
 9620     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9621                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9622   %}
 9623   ins_pipe( pipe_slow );
 9624 %}
 9625 
 9626 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9627   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9628   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9629   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9630   ins_encode %{
 9631     int vlen_enc = vector_length_encoding(this);
 9632     BasicType bt = Matcher::vector_element_basic_type(this);
 9633     int opc = this->ideal_Opcode();
 9634     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9635                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9636   %}
 9637   ins_pipe( pipe_slow );
 9638 %}
 9639 
 9640 
 9641 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9642   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9643   match(Set dst (RotateRightV (Binary dst shift) mask));
 9644   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9645   ins_encode %{
 9646     int vlen_enc = vector_length_encoding(this);
 9647     BasicType bt = Matcher::vector_element_basic_type(this);
 9648     int opc = this->ideal_Opcode();
 9649     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9650                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9651   %}
 9652   ins_pipe( pipe_slow );
 9653 %}
 9654 
 9655 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9656   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9657   match(Set dst (RotateRightV (Binary dst src2) mask));
 9658   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9659   ins_encode %{
 9660     int vlen_enc = vector_length_encoding(this);
 9661     BasicType bt = Matcher::vector_element_basic_type(this);
 9662     int opc = this->ideal_Opcode();
 9663     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9664                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9665   %}
 9666   ins_pipe( pipe_slow );
 9667 %}
 9668 
 9669 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9670   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9671   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9672   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9673   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9674   ins_encode %{
 9675     int vlen_enc = vector_length_encoding(this);
 9676     BasicType bt = Matcher::vector_element_basic_type(this);
 9677     int opc = this->ideal_Opcode();
 9678     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9679                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9680   %}
 9681   ins_pipe( pipe_slow );
 9682 %}
 9683 
 9684 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9685   predicate(!n->as_ShiftV()->is_var_shift());
 9686   match(Set dst (LShiftVS (Binary dst src2) mask));
 9687   match(Set dst (LShiftVI (Binary dst src2) mask));
 9688   match(Set dst (LShiftVL (Binary dst src2) mask));
 9689   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9690   ins_encode %{
 9691     int vlen_enc = vector_length_encoding(this);
 9692     BasicType bt = Matcher::vector_element_basic_type(this);
 9693     int opc = this->ideal_Opcode();
 9694     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9695                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9696   %}
 9697   ins_pipe( pipe_slow );
 9698 %}
 9699 
 9700 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9701   predicate(n->as_ShiftV()->is_var_shift());
 9702   match(Set dst (LShiftVS (Binary dst src2) mask));
 9703   match(Set dst (LShiftVI (Binary dst src2) mask));
 9704   match(Set dst (LShiftVL (Binary dst src2) mask));
 9705   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9706   ins_encode %{
 9707     int vlen_enc = vector_length_encoding(this);
 9708     BasicType bt = Matcher::vector_element_basic_type(this);
 9709     int opc = this->ideal_Opcode();
 9710     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9711                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9712   %}
 9713   ins_pipe( pipe_slow );
 9714 %}
 9715 
 9716 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9717   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9718   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9719   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9720   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9721   ins_encode %{
 9722     int vlen_enc = vector_length_encoding(this);
 9723     BasicType bt = Matcher::vector_element_basic_type(this);
 9724     int opc = this->ideal_Opcode();
 9725     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9726                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9727   %}
 9728   ins_pipe( pipe_slow );
 9729 %}
 9730 
 9731 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9732   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9733   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9734   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9735   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9736   ins_encode %{
 9737     int vlen_enc = vector_length_encoding(this);
 9738     BasicType bt = Matcher::vector_element_basic_type(this);
 9739     int opc = this->ideal_Opcode();
 9740     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9741                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9742   %}
 9743   ins_pipe( pipe_slow );
 9744 %}
 9745 
 9746 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9747   predicate(!n->as_ShiftV()->is_var_shift());
 9748   match(Set dst (RShiftVS (Binary dst src2) mask));
 9749   match(Set dst (RShiftVI (Binary dst src2) mask));
 9750   match(Set dst (RShiftVL (Binary dst src2) mask));
 9751   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9752   ins_encode %{
 9753     int vlen_enc = vector_length_encoding(this);
 9754     BasicType bt = Matcher::vector_element_basic_type(this);
 9755     int opc = this->ideal_Opcode();
 9756     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9757                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9758   %}
 9759   ins_pipe( pipe_slow );
 9760 %}
 9761 
 9762 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9763   predicate(n->as_ShiftV()->is_var_shift());
 9764   match(Set dst (RShiftVS (Binary dst src2) mask));
 9765   match(Set dst (RShiftVI (Binary dst src2) mask));
 9766   match(Set dst (RShiftVL (Binary dst src2) mask));
 9767   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9768   ins_encode %{
 9769     int vlen_enc = vector_length_encoding(this);
 9770     BasicType bt = Matcher::vector_element_basic_type(this);
 9771     int opc = this->ideal_Opcode();
 9772     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9773                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9774   %}
 9775   ins_pipe( pipe_slow );
 9776 %}
 9777 
 9778 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9779   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9780   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9781   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9782   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9783   ins_encode %{
 9784     int vlen_enc = vector_length_encoding(this);
 9785     BasicType bt = Matcher::vector_element_basic_type(this);
 9786     int opc = this->ideal_Opcode();
 9787     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9788                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9789   %}
 9790   ins_pipe( pipe_slow );
 9791 %}
 9792 
 9793 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9794   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9795   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9796   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9797   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9798   ins_encode %{
 9799     int vlen_enc = vector_length_encoding(this);
 9800     BasicType bt = Matcher::vector_element_basic_type(this);
 9801     int opc = this->ideal_Opcode();
 9802     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9803                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9804   %}
 9805   ins_pipe( pipe_slow );
 9806 %}
 9807 
 9808 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9809   predicate(!n->as_ShiftV()->is_var_shift());
 9810   match(Set dst (URShiftVS (Binary dst src2) mask));
 9811   match(Set dst (URShiftVI (Binary dst src2) mask));
 9812   match(Set dst (URShiftVL (Binary dst src2) mask));
 9813   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9814   ins_encode %{
 9815     int vlen_enc = vector_length_encoding(this);
 9816     BasicType bt = Matcher::vector_element_basic_type(this);
 9817     int opc = this->ideal_Opcode();
 9818     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9819                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9820   %}
 9821   ins_pipe( pipe_slow );
 9822 %}
 9823 
 9824 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9825   predicate(n->as_ShiftV()->is_var_shift());
 9826   match(Set dst (URShiftVS (Binary dst src2) mask));
 9827   match(Set dst (URShiftVI (Binary dst src2) mask));
 9828   match(Set dst (URShiftVL (Binary dst src2) mask));
 9829   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9830   ins_encode %{
 9831     int vlen_enc = vector_length_encoding(this);
 9832     BasicType bt = Matcher::vector_element_basic_type(this);
 9833     int opc = this->ideal_Opcode();
 9834     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9835                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9836   %}
 9837   ins_pipe( pipe_slow );
 9838 %}
 9839 
 9840 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9841   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9842   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9843   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9844   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9845   ins_encode %{
 9846     int vlen_enc = vector_length_encoding(this);
 9847     BasicType bt = Matcher::vector_element_basic_type(this);
 9848     int opc = this->ideal_Opcode();
 9849     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9850                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9851   %}
 9852   ins_pipe( pipe_slow );
 9853 %}
 9854 
 9855 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9856   match(Set dst (MaxV (Binary dst src2) mask));
 9857   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9858   ins_encode %{
 9859     int vlen_enc = vector_length_encoding(this);
 9860     BasicType bt = Matcher::vector_element_basic_type(this);
 9861     int opc = this->ideal_Opcode();
 9862     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9863                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9864   %}
 9865   ins_pipe( pipe_slow );
 9866 %}
 9867 
 9868 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9869   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9870   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9871   ins_encode %{
 9872     int vlen_enc = vector_length_encoding(this);
 9873     BasicType bt = Matcher::vector_element_basic_type(this);
 9874     int opc = this->ideal_Opcode();
 9875     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9876                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9877   %}
 9878   ins_pipe( pipe_slow );
 9879 %}
 9880 
 9881 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9882   match(Set dst (MinV (Binary dst src2) mask));
 9883   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9884   ins_encode %{
 9885     int vlen_enc = vector_length_encoding(this);
 9886     BasicType bt = Matcher::vector_element_basic_type(this);
 9887     int opc = this->ideal_Opcode();
 9888     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9889                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9890   %}
 9891   ins_pipe( pipe_slow );
 9892 %}
 9893 
 9894 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9895   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9896   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9897   ins_encode %{
 9898     int vlen_enc = vector_length_encoding(this);
 9899     BasicType bt = Matcher::vector_element_basic_type(this);
 9900     int opc = this->ideal_Opcode();
 9901     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9902                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9903   %}
 9904   ins_pipe( pipe_slow );
 9905 %}
 9906 
 9907 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9908   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9909   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9910   ins_encode %{
 9911     int vlen_enc = vector_length_encoding(this);
 9912     BasicType bt = Matcher::vector_element_basic_type(this);
 9913     int opc = this->ideal_Opcode();
 9914     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9915                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9916   %}
 9917   ins_pipe( pipe_slow );
 9918 %}
 9919 
instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

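// Masked fused multiply-add for float/double vectors; the register form is
// followed by a variant that takes the third input from memory.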
instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

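// Masked vector comparison into an opmask register: integral element types
// dispatch to evpcmp{b,w,d,q} with signedness derived from the boolean
// predicate, while float/double use evcmpps/evcmppd.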
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Perform the comparison according to the source element type.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

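// MaskAll replicates a scalar boolean into every lane of an opmask register;
// this variant handles mask lengths of up to 32 elements.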
instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
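// Mask negation is matched as XorVMask with a MaskAll of -1 and lowered to
// knot; lengths below 8 additionally require AVX512DQ and scratch registers.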
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

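// VectorLongToMask converts a long bit pattern into a vector mask: on AVX
// targets (no TypeVectMask) the mask is materialized as a boolean vector,
// whereas on EVEX targets it is a direct kmov into an opmask register.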
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t!" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

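// Bitwise AND/OR/XOR of opmask registers. Mask lengths below 16 are widened
// to 16 when AVX512DQ (byte-width k-register operations) is unavailable.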
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

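// CastVV nodes are compile-time type adjustments only: the value stays in the
// same register, so each variant is size(0) with an empty encoding.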
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

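// IsInfinite is implemented with the vfpclass instructions: immediate 0x18
// selects the +Infinity and -Infinity classes, and the resulting mask bit is
// copied into a general-purpose register.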
instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
