1 //
    2 // Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
    3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4 //
    5 // This code is free software; you can redistribute it and/or modify it
    6 // under the terms of the GNU General Public License version 2 only, as
    7 // published by the Free Software Foundation.
    8 //
    9 // This code is distributed in the hope that it will be useful, but WITHOUT
   10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   12 // version 2 for more details (a copy is included in the LICENSE file that
   13 // accompanied this code).
   14 //
   15 // You should have received a copy of the GNU General Public License version
   16 // 2 along with this work; if not, write to the Free Software Foundation,
   17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   18 //
   19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   20 // or visit www.oracle.com if you need additional information or have any
   21 // questions.
   22 //
   23 //
   24 
   25 // X86 Common Architecture Description File
   26 
   27 //----------REGISTER DEFINITION BLOCK------------------------------------------
   28 // This information is used by the matcher and the register allocator to
   29 // describe individual registers and classes of registers within the target
   30 // architecture.
   31 
   32 register %{
   33 //----------Architecture Description Register Definitions----------------------
   34 // General Registers
   35 // "reg_def"  name ( register save type, C convention save type,
   36 //                   ideal register type, encoding );
   37 // Register Save Types:
   38 //
   39 // NS  = No-Save:       The register allocator assumes that these registers
   40 //                      can be used without saving upon entry to the method, &
   41 //                      that they do not need to be saved at call sites.
   42 //
   43 // SOC = Save-On-Call:  The register allocator assumes that these registers
   44 //                      can be used without saving upon entry to the method,
   45 //                      but that they must be saved at call sites.
   46 //
   47 // SOE = Save-On-Entry: The register allocator assumes that these registers
   48 //                      must be saved before using them upon entry to the
   49 //                      method, but they do not need to be saved at call
   50 //                      sites.
   51 //
   52 // AS  = Always-Save:   The register allocator assumes that these registers
   53 //                      must be saved before using them upon entry to the
   54 //                      method, & that they must be saved at call sites.
   55 //
   56 // Ideal Register Type is used to determine how to save & restore a
   57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
   58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
   59 //
   60 // The encoding number is the actual bit-pattern placed into the opcodes.
   61 
// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
   63 // Word a in each register holds a Float, words ab hold a Double.
   64 // The whole registers are used in SSE4.2 version intrinsics,
   65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
   66 // UseXMMForArrayCopy and UseSuperword flags).
   67 // For pre EVEX enabled architectures:
   68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
   69 // For EVEX enabled architectures:
   70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
   71 //
   72 // Linux ABI:   No register preserved across function calls
   73 //              XMM0-XMM7 might hold parameters
   74 // Windows ABI: XMM6-XMM15 preserved across function calls
   75 //              XMM0-XMM3 might hold parameters
   76 
   77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
   78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
   79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
   80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
   81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
   82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
   83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
   84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
   85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
   86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
   87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
   88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
   89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
   90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
   91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
   92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
   93 
   94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
   95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
   96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
   97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
   98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
   99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
  103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
  104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
  105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
  106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
  107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
  108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
  109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
  110 
  111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
  120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
  121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
  122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
  123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
  124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
  125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
  126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
  127 
  128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
  129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
  130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
  131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
  132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
  133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
  134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
  135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
  136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
  137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
  138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
  139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
  140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
  141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
  142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
  143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
  144 
  145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
  146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
  147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
  148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
  149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
  150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
  151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
  152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
  153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
  154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
  155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
  156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
  157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
  158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
  159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
  160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
  161 
  162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
  163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
  164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
  165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
  166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
  167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
  168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
  169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
  170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
  171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
  172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
  173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
  174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
  175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
  176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
  177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
  178 
  179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
  180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
  181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
  182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
  183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
  184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
  185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
  186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
  187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
  188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
  189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
  190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
  191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
  192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
  193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
  194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
  195 
  196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
  197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
  198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
  199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
  200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
  201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
  202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
  203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
  204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
  205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
  206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
  207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
  208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
  209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
  210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
  211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
  212 
#ifdef _LP64

// XMM8-XMM15: extra registers available only in 64-bit builds (hence
// the #ifdef _LP64 guard).  Per the encoding notes above, these need a
// REX prefix (VEX with UseAVX, EVEX on EVEX-capable hardware).  Same
// layout as XMM0-XMM7: sixteen 32-bit SOC word slots per register,
// suffixes b-p mapping to VMReg slots next(1)..next(15).
reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
  350 
// XMM16 onward: the upper EVEX register bank.  Per the notes above,
// registers in the XMM16-XMM31 range exist only on EVEX-enabled
// (AVX-512 capable) hardware and must be EVEX-encoded; the set
// presumably continues through XMM31 below this region.  Same
// sixteen-word SOC layout as the registers above.
reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
  503 
  504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
  505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
  506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
  507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
  508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
  509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
  510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
  511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
  512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
  513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
  514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
  515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
  516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
  517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
  518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
  519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
  520 
  521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
  522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
  523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
  524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
  525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
  526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
  527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
  528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
  529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
  530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
  531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
  532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
  533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
  534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
  535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
  536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
  537 
  538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
  539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
  540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
  541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
  542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
  543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
  544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
  545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
  546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
  547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
  548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
  549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
  550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
  551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
  552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
  553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
  554 
  555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
  556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
  557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
  558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
  559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
  560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
  561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
  562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
  563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
  564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
  565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
  566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
  567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
  568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
  569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
  570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
  571 
  572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
  573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
  574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
  575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
  576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
  577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
  578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
  579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
  580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
  581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
  582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
  583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
  584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
  585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
  586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
  587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
  588 
  589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
  590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
  591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
  592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
  593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
  594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
  595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
  596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
  597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
  598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
  599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
  600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
  601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
  602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
  603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
  604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
  605 
  606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
  607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
  608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
  609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
  610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
  611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
  612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
  613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
  614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
  615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
  616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
  617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
  618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
  619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
  620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
  621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
  622 
  623 #endif // _LP64
  624 
// Flags register.  VMRegImpl::Bad() marks it as having no real VMReg mapping:
// it is modeled only so the allocator can treat the condition codes as an
// allocatable resource.  The differing encoding (16 on 64-bit vs. 8 on
// 32-bit) presumably places it one past the last general-purpose register on
// each platform -- NOTE(review): confirm against the GPR reg_defs above.
#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64
  630 
// AVX3 Mask Registers.
// Each 64-bit opmask register is modeled as two 32-bit halves (K<n>, K<n>_H).
// K0 is intentionally absent: in EVEX encodings a mask operand of k0 means
// "no masking", so k0 can never be allocated as a general writemask.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
  652 
  653 
// Allocation chunk for the XMM register file.  Lists every 32-bit slice of
// every vector register; the legacy bank XMM0-XMM7 is always present, while
// XMM8-XMM15 and the EVEX-only upper bank XMM16-XMM31 are appended on
// 64-bit builds (#ifdef _LP64).
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );
  689 
// Allocation chunk for the opmask registers.  NOTE(review): listed in
// descending order (K7 down to K1), presumably to bias allocation toward the
// higher-numbered masks first -- confirm against allocator ordering rules.
alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);
  697 
// Class of all allocatable opmask (vector-mask) registers.
reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

// Singleton classes for instructions that require one specific mask register.
reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);
  713 
// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);
  720 
// Class for pre evex float registers.
// A float occupies only the low 32-bit slice, so just the base name of each
// XMM register is listed.
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );
  741 
// Class for evex float registers (adds the EVEX-only bank XMM16-XMM31).
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

// Pick the evex or legacy class at runtime based on CPU capabilities.
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
  781 
// Class for pre evex double registers.
// A double spans two 32-bit slices, hence the base name plus the 'b' slice.
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );
  802 
// Class for evex double registers (adds the EVEX-only bank XMM16-XMM31).
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

// Pick the evex or legacy class at runtime based on CPU capabilities.
reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
  842 
// Class for pre evex 32bit vector registers (one 32-bit slice per register).
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );
  863 
// Class for evex 32bit vector registers (adds the EVEX-only bank XMM16-XMM31).
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

// Pick the evex or legacy class at runtime based on CPU capabilities.
reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  903 
// Class for pre evex 64bit vector registers (two 32-bit slices per register).
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );
  924 
// Class for evex 64bit vector registers (adds the EVEX-only bank XMM16-XMM31).
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

// Pick the evex or legacy class at runtime based on CPU capabilities.
reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
// Class for pre evex 128bit vector registers (four 32-bit slices per register).
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );
  985 
// Class for evex 128bit vector registers (adds the EVEX-only bank XMM16-XMM31).
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

// Pick the evex or legacy class at runtime based on CPU capabilities.
reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
// Class for pre evex 256bit vector registers (eight 32-bit slices per register).
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
#endif
                      );
 1046 
// Class for all 256bit vector registers (EVEX variant).
// Each 256bit register is described by eight 32bit slices (base name plus
// suffixes b-h). Under EVEX encoding on 64-bit, the extended bank
// XMM16-XMM31 is available in addition to XMM0-XMM15.
reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
#endif
                      );
 1083 
// Dynamically pick the EVEX or legacy 256bit register class at VM startup,
// based on the CPU features detected by VM_Version.
reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
// Class for all 512bit vector registers (EVEX variant).
// Each 512bit register is described by sixteen 32bit slices (base name plus
// suffixes b-p). On 64-bit, the extended bank XMM16-XMM31 is included.
reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                     ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );
 1123 
// Class for restricted 512bit vector registers (legacy encoding).
// Only XMM0-XMM15 are listed; without EVEX encoding the extended bank
// XMM16-XMM31 cannot be addressed.
reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
#endif
                      );
 1144 
// Dynamically pick the EVEX or legacy 512bit register class at VM startup.
reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Fixed class naming only the low 128 bits of XMM0, for instructions that
// use XMM0 as an implicit operand.
reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
// x86 does not use call trampoline stubs, so both queries report zero.
class CallStubImpl {

  //--------------------------------------------------------------
  //---<  Used for optimization in Compile::shorten_branches  >---
  //--------------------------------------------------------------

 public:
  // Size of call trampoline stub, in bytes.
  static uint size_call_trampoline() {
    return 0; // no call trampolines on this platform
  }

  // Number of relocations needed by a call trampoline stub.
  static uint reloc_call_trampoline() {
    return 0; // no call trampolines on this platform
  }
};
 1185 
// Sizes and emitters for the exception and deopt handler stubs. The size_*
// values are upper bounds asserted against the actual emitted size in
// emit_exception_handler()/emit_deopt_handler().
class HandlerImpl {

 public:

  static int emit_exception_handler(C2_MacroAssembler *masm);
  static int emit_deopt_handler(C2_MacroAssembler* masm);

  static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
    return NativeJump::instruction_size;
  }

#ifdef _LP64
  static uint size_deopt_handler() {
    // three 5 byte instructions plus one move for unreachable address.
    return 15+3;
  }
#else
  static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
    return 5 + NativeJump::instruction_size; // pushl(); jmp;
  }
#endif
};
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
// Convenience overload: derive the AVX length encoding from a node's
// vector length in bytes.
static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
  return vector_length_encoding(Matcher::vector_length_in_bytes(n));
}
 1237 
// Convenience overload: derive the AVX length encoding from the node that
// feeds the given operand of 'use'.
static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
  uint def_idx = use->operand_index(opnd);
  Node* def = use->in(def_idx);
  return vector_length_encoding(def);
}
 1243 
// True if the CPU can do vector popcount natively for the given element
// type: AVX512-BITALG covers subword (byte/short) elements,
// AVX512-VPOPCNTDQ covers int/long elements.
static inline bool is_vector_popcount_predicate(BasicType bt) {
  return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
         (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
}
 1248 
// True if a count-leading-zeros vector op on a non-subword integral type
// can use the EVEX form: requires AVX512CD, plus AVX512VL unless the
// vector is a full 512 bits.
static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
  return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
           (VM_Version::supports_avx512vl() || vlen_bytes == 64);
}
 1253 
// Platform-dependent node flags, allocated above the generic Node flags.
// Flag_intel_jcc_erratum marks nodes affected by the JCC erratum
// mitigation; the sets_*/clears_* flags record which x86 condition-code
// bits an instruction sets or clears. The bit positions are part of the
// flag encoding — do not reorder.
class Node::PD {
public:
  enum NodeFlags {
    Flag_intel_jcc_erratum    = Node::_last_flag << 1,
    Flag_sets_carry_flag      = Node::_last_flag << 2,
    Flag_sets_parity_flag     = Node::_last_flag << 3,
    Flag_sets_zero_flag       = Node::_last_flag << 4,
    Flag_sets_overflow_flag   = Node::_last_flag << 5,
    Flag_sets_sign_flag       = Node::_last_flag << 6,
    Flag_clears_carry_flag    = Node::_last_flag << 7,
    Flag_clears_parity_flag   = Node::_last_flag << 8,
    Flag_clears_zero_flag     = Node::_last_flag << 9,
    Flag_clears_overflow_flag = Node::_last_flag << 10,
    Flag_clears_sign_flag     = Node::_last_flag << 11,
    _last_flag                = Flag_clears_sign_flag
  };
};
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
// Platform hook run before code emission: on CPUs affected by the Intel
// JCC erratum, tag the affected mach nodes and grow the code buffer size
// estimate by the worst-case padding those nodes may need.
void PhaseOutput::pd_perform_mach_node_analysis() {
  if (VM_Version::has_intel_jcc_erratum()) {
    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
    _buf_sizes._code += extra_padding;
  }
}
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
// Number of padding bytes to insert before this node at current_offset.
// Non-zero only for nodes tagged with Flag_intel_jcc_erratum during
// pd_perform_mach_node_analysis().
int MachNode::compute_padding(int current_offset) const {
  if (flags() & Node::PD::Flag_intel_jcc_erratum) {
    Compile* C = Compile::current();
    PhaseOutput* output = C->output();
    Block* block = output->block();
    int index = output->index();
    return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
  } else {
    return 0;
  }
}
 1306 
// Emit exception handler code.
// Stuff framesize into a register and call a VM stub routine.
// Returns the handler's offset in the stub section, or 0 if the code cache
// is full.
int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {

  // Note that the code buffer's insts_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  address base = __ start_a_stub(size_exception_handler());
  if (base == nullptr) {
    ciEnv::current()->record_failure("CodeCache is full");
    return 0;  // CodeBuffer::expand failed
  }
  int offset = __ offset();
  // Single jump to the shared exception blob; see size_exception_handler()
  // for why a jump-sized slot suffices.
  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
  __ end_a_stub();
  return offset;
}
 1324 
// Emit deopt handler code.
// Pushes the current pc (so the deopt blob can find the deopt point) and
// jumps to the shared deopt blob's unpack entry. Returns the handler's
// offset in the stub section, or 0 if the code cache is full.
int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {

  // Note that the code buffer's insts_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  address base = __ start_a_stub(size_deopt_handler());
  if (base == nullptr) {
    ciEnv::current()->record_failure("CodeCache is full");
    return 0;  // CodeBuffer::expand failed
  }
  int offset = __ offset();

#ifdef _LP64
  address the_pc = (address) __ pc();
  Label next;
  // push a "the_pc" on the stack without destroying any registers
  // as they all may be live.

  // push address of "next"
  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
  __ bind(next);
  // adjust it so it matches "the_pc"
  __ subptr(Address(rsp, 0), __ offset() - offset);
#else
  // On 32-bit an absolute address can simply be pushed.
  InternalAddress here(__ pc());
  __ pushptr(here.addr(), noreg);
#endif

  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
  __ end_a_stub();
  return offset;
}
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
  // Float masks come from different places depending on platform.
  // On 64-bit they live in the stub routines area; on 32-bit they are
  // file-local constant pools. The remaining helpers expose stub-generated
  // vector constant tables used by the instruction encodings below.
#ifdef _LP64
  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
#else
  static address float_signmask()  { return (address)float_signmask_pool; }
  static address float_signflip()  { return (address)float_signflip_pool; }
  static address double_signmask() { return (address)double_signmask_pool; }
  static address double_signflip() { return (address)double_signflip_pool; }
#endif
  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
  static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
  static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
  static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
  static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
  static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
  static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
  static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
  static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
  static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
  static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
  static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
  static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
  static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
  static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
// Returns true if this platform can match the given ideal opcode with the
// current CPU features and VM flags. Opcodes not listed in the switch are
// supported whenever a match rule exists. Vector opcodes are additionally
// filtered by match_rule_supported_vector().
bool Matcher::match_rule_supported(int opcode) {
  if (!has_match_rule(opcode)) {
    return false; // no match rule present
  }
  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  switch (opcode) {
    case Op_AbsVL:
    case Op_StoreVectorScatter:
      if (UseAVX < 3) {
        return false;
      }
      break;
    case Op_PopCountI:
    case Op_PopCountL:
      if (!UsePopCountInstruction) {
        return false;
      }
      break;
    case Op_PopCountVI:
      if (UseAVX < 2) {
        return false;
      }
      break;
    case Op_CompressV:
    case Op_ExpandV:
    case Op_PopCountVL:
      if (UseAVX < 2) {
        return false;
      }
      break;
    case Op_MulVI:
      if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
        return false;
      }
      break;
    case Op_MulVL:
      if (UseSSE < 4) { // only with SSE4_1 or AVX
        return false;
      }
      break;
    case Op_MulReductionVL:
      if (VM_Version::supports_avx512dq() == false) {
        return false;
      }
      break;
    case Op_AddReductionVL:
      if (UseSSE < 2) { // requires at least SSE2
        return false;
      }
      break;
    case Op_AbsVB:
    case Op_AbsVS:
    case Op_AbsVI:
    case Op_AddReductionVI:
    case Op_AndReductionV:
    case Op_OrReductionV:
    case Op_XorReductionV:
      if (UseSSE < 3) { // requires at least SSSE3
        return false;
      }
      break;
    // Scalar half-float operations require AVX512-FP16.
    case Op_AddHF:
    case Op_DivHF:
    case Op_FmaHF:
    case Op_MaxHF:
    case Op_MinHF:
    case Op_MulHF:
    case Op_ReinterpretS2HF:
    case Op_ReinterpretHF2S:
    case Op_SubHF:
    case Op_SqrtHF:
      if (!VM_Version::supports_avx512_fp16()) {
        return false;
      }
      break;
    case Op_VectorLoadShuffle:
    case Op_VectorRearrange:
    case Op_MulReductionVI:
      if (UseSSE < 4) { // requires at least SSE4
        return false;
      }
      break;
    case Op_IsInfiniteF:
    case Op_IsInfiniteD:
      if (!VM_Version::supports_avx512dq()) {
        return false;
      }
      break;
    case Op_SqrtVD:
    case Op_SqrtVF:
    case Op_VectorMaskCmp:
    case Op_VectorCastB2X:
    case Op_VectorCastS2X:
    case Op_VectorCastI2X:
    case Op_VectorCastL2X:
    case Op_VectorCastF2X:
    case Op_VectorCastD2X:
    case Op_VectorUCastB2X:
    case Op_VectorUCastS2X:
    case Op_VectorUCastI2X:
    case Op_VectorMaskCast:
      if (UseAVX < 1) { // enabled for AVX only
        return false;
      }
      break;
    case Op_PopulateIndex:
      if (!is_LP64 || (UseAVX < 2)) {
        return false;
      }
      break;
    case Op_RoundVF:
      if (UseAVX < 2) { // enabled for AVX2 only
        return false;
      }
      break;
    case Op_RoundVD:
      if (UseAVX < 3) {
        return false;  // enabled for AVX3 only
      }
      break;
    case Op_CompareAndSwapL:
#ifdef _LP64
    case Op_CompareAndSwapP:
#endif
      break;
    case Op_StrIndexOf:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
    case Op_StrIndexOfChar:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
    case Op_OnSpinWait:
      if (VM_Version::supports_on_spin_wait() == false) {
        return false;
      }
      break;
    case Op_MulVB:
    case Op_LShiftVB:
    case Op_RShiftVB:
    case Op_URShiftVB:
    case Op_VectorInsert:
    case Op_VectorLoadMask:
    case Op_VectorStoreMask:
    case Op_VectorBlend:
      if (UseSSE < 4) {
        return false;
      }
      break;
#ifdef _LP64
    case Op_MaxD:
    case Op_MaxF:
    case Op_MinD:
    case Op_MinF:
      if (UseAVX < 1) { // enabled for AVX only
        return false;
      }
      break;
#endif
    case Op_CacheWB:
    case Op_CacheWBPreSync:
    case Op_CacheWBPostSync:
      if (!VM_Version::supports_data_cache_line_flush()) {
        return false;
      }
      break;
    case Op_ExtractB:
    case Op_ExtractL:
    case Op_ExtractI:
    case Op_RoundDoubleMode:
      if (UseSSE < 4) {
        return false;
      }
      break;
    case Op_RoundDoubleModeV:
      if (VM_Version::supports_avx() == false) {
        return false; // 128bit vroundpd is not available
      }
      break;
    case Op_LoadVectorGather:
    case Op_LoadVectorGatherMasked:
      if (UseAVX < 2) {
        return false;
      }
      break;
    case Op_FmaF:
    case Op_FmaD:
    case Op_FmaVD:
    case Op_FmaVF:
      if (!UseFMA) {
        return false;
      }
      break;
    case Op_MacroLogicV:
      if (UseAVX < 3 || !UseVectorMacroLogic) {
        return false;
      }
      break;

    case Op_VectorCmpMasked:
    case Op_VectorMaskGen:
      if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
        return false;
      }
      break;
    case Op_VectorMaskFirstTrue:
    case Op_VectorMaskLastTrue:
    case Op_VectorMaskTrueCount:
    case Op_VectorMaskToLong:
      if (!is_LP64 || UseAVX < 1) {
         return false;
      }
      break;
    case Op_RoundF:
    case Op_RoundD:
      if (!is_LP64) {
        return false;
      }
      break;
    case Op_CopySignD:
    case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64)  {
        return false;
      }
      if (!VM_Version::supports_avx512vl()) {
        return false;
      }
      break;
#ifndef _LP64
    case Op_AddReductionVF:
    case Op_AddReductionVD:
    case Op_MulReductionVF:
    case Op_MulReductionVD:
      if (UseSSE < 1) { // requires at least SSE
        return false;
      }
      break;
    case Op_MulAddVS2VI:
    case Op_RShiftVL:
    case Op_AbsVD:
    case Op_NegVD:
      if (UseSSE < 2) {
        return false;
      }
      break;
#endif // !LP64
    case Op_CompressBits:
      if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
        return false;
      }
      break;
    case Op_ExpandBits:
      if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
        return false;
      }
      break;
    case Op_SignumF:
      if (UseSSE < 1) {
        return false;
      }
      break;
    case Op_SignumD:
      if (UseSSE < 2) {
        return false;
      }
      break;
    case Op_CompressM:
      if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
        return false;
      }
      break;
    case Op_SqrtF:
      if (UseSSE < 1) {
        return false;
      }
      break;
    case Op_SqrtD:
#ifdef _LP64
      if (UseSSE < 2) {
        return false;
      }
#else
      // x86_32.ad has a special match rule for SqrtD.
      // Together with common x86 rules, this handles all UseSSE cases.
#endif
      break;
    case Op_ConvF2HF:
    case Op_ConvHF2F:
      if (!VM_Version::supports_float16()) {
        return false;
      }
      break;
    case Op_VectorCastF2HF:
    case Op_VectorCastHF2F:
      if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
        return false;
      }
      break;
  }
  return true;  // Match rules are supported by default.
}
 1707 
 1708 //------------------------------------------------------------------------
 1709 
 1710 static inline bool is_pop_count_instr_target(BasicType bt) {
 1711   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1712          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1713 }
 1714 
// Auto-vectorization imposes no extra x86-specific restrictions beyond
// the generic vector match-rule check.
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
  return match_rule_supported_vector(opcode, vlen, bt);
}
 1718 
 1719 // Identify extra cases that we might want to provide match rules for vector nodes and
 1720 // other intrinsics guarded with vector length (vlen) and element type (bt).
 1721 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1722   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1723   if (!match_rule_supported(opcode)) {
 1724     return false;
 1725   }
 1726   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1727   //   * SSE2 supports 128bit vectors for all types;
 1728   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1729   //   * AVX2 supports 256bit vectors for all types;
 1730   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1731   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1732   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1733   // And MaxVectorSize is taken into account as well.
 1734   if (!vector_size_supported(bt, vlen)) {
 1735     return false;
 1736   }
 1737   // Special cases which require vector length follow:
 1738   //   * implementation limitations
 1739   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1740   //   * 128bit vroundpd instruction is present only in AVX1
 1741   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1742   switch (opcode) {
 1743     case Op_AbsVF:
 1744     case Op_NegVF:
 1745       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1746         return false; // 512bit vandps and vxorps are not available
 1747       }
 1748       break;
 1749     case Op_AbsVD:
 1750     case Op_NegVD:
 1751       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1752         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1753       }
 1754       break;
 1755     case Op_RotateRightV:
 1756     case Op_RotateLeftV:
 1757       if (bt != T_INT && bt != T_LONG) {
 1758         return false;
 1759       } // fallthrough
 1760     case Op_MacroLogicV:
 1761       if (!VM_Version::supports_evex() ||
 1762           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1763         return false;
 1764       }
 1765       break;
 1766     case Op_ClearArray:
 1767     case Op_VectorMaskGen:
 1768     case Op_VectorCmpMasked:
 1769       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1770         return false;
 1771       }
 1772       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1773         return false;
 1774       }
 1775       break;
 1776     case Op_LoadVectorMasked:
 1777     case Op_StoreVectorMasked:
 1778       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1779         return false;
 1780       }
 1781       break;
 1782     case Op_UMinV:
 1783     case Op_UMaxV:
 1784       if (UseAVX == 0) {
 1785         return false;
 1786       }
 1787       break;
 1788     case Op_MaxV:
 1789     case Op_MinV:
 1790       if (UseSSE < 4 && is_integral_type(bt)) {
 1791         return false;
 1792       }
 1793       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1794           // Float/Double intrinsics are enabled for AVX family currently.
 1795           if (UseAVX == 0) {
 1796             return false;
 1797           }
 1798           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1799             return false;
 1800           }
 1801       }
 1802       break;
 1803     case Op_CallLeafVector:
 1804       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_AddReductionVI:
 1809       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1810         return false;
 1811       }
 1812       // fallthrough
 1813     case Op_AndReductionV:
 1814     case Op_OrReductionV:
 1815     case Op_XorReductionV:
 1816       if (is_subword_type(bt) && (UseSSE < 4)) {
 1817         return false;
 1818       }
 1819 #ifndef _LP64
 1820       if (bt == T_BYTE || bt == T_LONG) {
 1821         return false;
 1822       }
 1823 #endif
 1824       break;
 1825 #ifndef _LP64
 1826     case Op_VectorInsert:
 1827       if (bt == T_LONG || bt == T_DOUBLE) {
 1828         return false;
 1829       }
 1830       break;
 1831 #endif
 1832     case Op_MinReductionV:
 1833     case Op_MaxReductionV:
 1834       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1835         return false;
 1836       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1837         return false;
 1838       }
 1839       // Float/Double intrinsics enabled for AVX family.
 1840       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1841         return false;
 1842       }
 1843       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1844         return false;
 1845       }
 1846 #ifndef _LP64
 1847       if (bt == T_BYTE || bt == T_LONG) {
 1848         return false;
 1849       }
 1850 #endif
 1851       break;
 1852     case Op_VectorTest:
 1853       if (UseSSE < 4) {
 1854         return false; // Implementation limitation
 1855       } else if (size_in_bits < 32) {
 1856         return false; // Implementation limitation
 1857       }
 1858       break;
 1859     case Op_VectorLoadShuffle:
 1860     case Op_VectorRearrange:
 1861       if(vlen == 2) {
 1862         return false; // Implementation limitation due to how shuffle is loaded
 1863       } else if (size_in_bits == 256 && UseAVX < 2) {
 1864         return false; // Implementation limitation
 1865       }
 1866       break;
 1867     case Op_VectorLoadMask:
 1868     case Op_VectorMaskCast:
 1869       if (size_in_bits == 256 && UseAVX < 2) {
 1870         return false; // Implementation limitation
 1871       }
 1872       // fallthrough
 1873     case Op_VectorStoreMask:
 1874       if (vlen == 2) {
 1875         return false; // Implementation limitation
 1876       }
 1877       break;
 1878     case Op_PopulateIndex:
 1879       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1880         return false;
 1881       }
 1882       break;
 1883     case Op_VectorCastB2X:
 1884     case Op_VectorCastS2X:
 1885     case Op_VectorCastI2X:
 1886       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1887         return false;
 1888       }
 1889       break;
 1890     case Op_VectorCastL2X:
 1891       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1892         return false;
 1893       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1894         return false;
 1895       }
 1896       break;
 1897     case Op_VectorCastF2X: {
 1898         // As per JLS section 5.1.3 narrowing conversion to sub-word types
 1899         // happen after intermediate conversion to integer and special handling
 1900         // code needs AVX2 vpcmpeqd instruction for 256 bit vectors.
 1901         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1902         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1903           return false;
 1904         }
 1905       }
 1906       // fallthrough
 1907     case Op_VectorCastD2X:
 1908       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1909         return false;
 1910       }
 1911       break;
 1912     case Op_VectorCastF2HF:
 1913     case Op_VectorCastHF2F:
 1914       if (!VM_Version::supports_f16c() &&
 1915          ((!VM_Version::supports_evex() ||
 1916          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1917         return false;
 1918       }
 1919       break;
 1920     case Op_RoundVD:
 1921       if (!VM_Version::supports_avx512dq()) {
 1922         return false;
 1923       }
 1924       break;
 1925     case Op_MulReductionVI:
 1926       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1927         return false;
 1928       }
 1929       break;
 1930     case Op_LoadVectorGatherMasked:
 1931       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1932         return false;
 1933       }
 1934       if (is_subword_type(bt) &&
 1935          (!is_LP64                                                ||
 1936          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1937          (size_in_bits < 64)                                      ||
 1938          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1939         return false;
 1940       }
 1941       break;
 1942     case Op_StoreVectorScatterMasked:
 1943     case Op_StoreVectorScatter:
 1944       if (is_subword_type(bt)) {
 1945         return false;
 1946       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1947         return false;
 1948       }
 1949       // fallthrough
 1950     case Op_LoadVectorGather:
 1951       if (!is_subword_type(bt) && size_in_bits == 64) {
 1952         return false;
 1953       }
 1954       if (is_subword_type(bt) && size_in_bits < 64) {
 1955         return false;
 1956       }
 1957       break;
 1958     case Op_SaturatingAddV:
 1959     case Op_SaturatingSubV:
 1960       if (UseAVX < 1) {
 1961         return false; // Implementation limitation
 1962       }
 1963       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1964         return false;
 1965       }
 1966       break;
 1967     case Op_SelectFromTwoVector:
 1968        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1969          return false;
 1970        }
 1971        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1972          return false;
 1973        }
 1974        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1975          return false;
 1976        }
 1977        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1978          return false;
 1979        }
 1980        break;
 1981     case Op_MaskAll:
 1982       if (!VM_Version::supports_evex()) {
 1983         return false;
 1984       }
 1985       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1986         return false;
 1987       }
 1988       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1989         return false;
 1990       }
 1991       break;
 1992     case Op_VectorMaskCmp:
 1993       if (vlen < 2 || size_in_bits < 32) {
 1994         return false;
 1995       }
 1996       break;
 1997     case Op_CompressM:
 1998       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1999         return false;
 2000       }
 2001       break;
 2002     case Op_CompressV:
 2003     case Op_ExpandV:
 2004       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 2005         return false;
 2006       }
 2007       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 2008         return false;
 2009       }
 2010       if (size_in_bits < 128 ) {
 2011         return false;
 2012       }
 2013     case Op_VectorLongToMask:
 2014       if (UseAVX < 1 || !is_LP64) {
 2015         return false;
 2016       }
 2017       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 2018         return false;
 2019       }
 2020       break;
 2021     case Op_SignumVD:
 2022     case Op_SignumVF:
 2023       if (UseAVX < 1) {
 2024         return false;
 2025       }
 2026       break;
 2027     case Op_PopCountVI:
 2028     case Op_PopCountVL: {
 2029         if (!is_pop_count_instr_target(bt) &&
 2030             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2031           return false;
 2032         }
 2033       }
 2034       break;
 2035     case Op_ReverseV:
 2036     case Op_ReverseBytesV:
 2037       if (UseAVX < 2) {
 2038         return false;
 2039       }
 2040       break;
 2041     case Op_CountTrailingZerosV:
 2042     case Op_CountLeadingZerosV:
 2043       if (UseAVX < 2) {
 2044         return false;
 2045       }
 2046       break;
 2047   }
 2048   return true;  // Per default match rules are supported.
 2049 }
 2050 
bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // ADLC based match_rule_supported routine checks for the existence of pattern based
  // on IR opcode. Most of the unary/binary/ternary masked operation share the IR nodes
  // of their non-masked counterpart with mask edge being the differentiator.
  // This routine does a strict check on the existence of masked operation patterns
  // by returning a default false value for all the other opcodes apart from the
  // ones whose masked instruction patterns are defined in this file.
  if (!match_rule_supported_vector(opcode, vlen, bt)) {
    return false;
  }

  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
  // Masked (predicated) forms require AVX512VL for any vector narrower than 512 bits.
  if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
    return false;
  }
  switch(opcode) {
    // Unary masked operations
    case Op_AbsVB:
    case Op_AbsVS:
      if(!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough: AbsVI/AbsVL have no extra requirements
    case Op_AbsVI:
    case Op_AbsVL:
      return true;

    // Ternary masked operations
    case Op_FmaVF:
    case Op_FmaVD:
      return true;

    case Op_MacroLogicV:
      if(bt != T_INT && bt != T_LONG) {
        return false;
      }
      return true;

    // Binary masked operations
    case Op_AddVB:
    case Op_AddVS:
    case Op_SubVB:
    case Op_SubVS:
    case Op_MulVS:
    case Op_LShiftVS:
    case Op_RShiftVS:
    case Op_URShiftVS:
      // Sub-512-bit shapes were already filtered above unless AVX512VL is present.
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      return true;

    case Op_MulVL:
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
      if (!VM_Version::supports_avx512dq()) {
        return false;  // Implementation limitation
      }
      return true;

    case Op_AndV:
    case Op_OrV:
    case Op_XorV:
    case Op_RotateRightV:
    case Op_RotateLeftV:
      if (bt != T_INT && bt != T_LONG) {
        return false; // Implementation limitation
      }
      return true;

    case Op_VectorLoadMask:
      assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
        return false;
      }
      return true;

    case Op_AddVI:
    case Op_AddVL:
    case Op_AddVF:
    case Op_AddVD:
    case Op_SubVI:
    case Op_SubVL:
    case Op_SubVF:
    case Op_SubVD:
    case Op_MulVI:
    case Op_MulVF:
    case Op_MulVD:
    case Op_DivVF:
    case Op_DivVD:
    case Op_SqrtVF:
    case Op_SqrtVD:
    case Op_LShiftVI:
    case Op_LShiftVL:
    case Op_RShiftVI:
    case Op_RShiftVL:
    case Op_URShiftVI:
    case Op_URShiftVL:
    case Op_LoadVectorMasked:
    case Op_StoreVectorMasked:
    case Op_LoadVectorGatherMasked:
    case Op_StoreVectorScatterMasked:
      return true;

    case Op_UMinV:
    case Op_UMaxV:
      if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
        return false;
      } // fallthrough
    case Op_MaxV:
    case Op_MinV:
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
        return false; // Implementation limitation
      }
      if (is_floating_point_type(bt)) {
        return false; // Implementation limitation
      }
      return true;
    case Op_SaturatingAddV:
    case Op_SaturatingSubV:
      if (!is_subword_type(bt)) {
        return false;
      }
      if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
        return false; // Implementation limitation
      }
      return true;

    case Op_VectorMaskCmp:
      if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
        return false; // Implementation limitation
      }
      return true;

    case Op_VectorRearrange:
      if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
        return false; // Implementation limitation
      }
      if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
        return false; // Implementation limitation
      } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
        return false; // Implementation limitation
      }
      return true;

    // Binary Logical operations
    case Op_AndVMask:
    case Op_OrVMask:
    case Op_XorVMask:
      if (vlen > 16 && !VM_Version::supports_avx512bw()) {
        return false; // Implementation limitation
      }
      return true;

    case Op_PopCountVI:
    case Op_PopCountVL:
      if (!is_pop_count_instr_target(bt)) {
        return false;
      }
      return true;

    case Op_MaskAll:
      return true;

    case Op_CountLeadingZerosV:
      if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
        return true;
      }
      // fallthrough into default when not supported
    default:
      return false;
  }
}
 2223 
// This hook always answers false on x86: no node is lowered to a
// partial-vector operation here.
bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
  return false;
}
 2227 
 2228 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2229 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2230   switch (elem_bt) {
 2231     case T_BYTE:  return false;
 2232     case T_SHORT: return !VM_Version::supports_avx512bw();
 2233     case T_INT:   return !VM_Version::supports_avx();
 2234     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2235     default:
 2236       ShouldNotReachHere();
 2237       return false;
 2238   }
 2239 }
 2240 
// Replace a generic vector operand (VEC/LEGVEC) with the concrete operand
// matching the ideal register size. Legacy operands map to the legVec*
// variants; otherwise plain vec* operands are produced.
MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
  assert(Matcher::is_generic_vector(generic_opnd), "not generic");
  bool legacy = (generic_opnd->opcode() == LEGVEC);
  if (!VM_Version::supports_avx512vlbwdq() && // KNL
      is_temp && !legacy && (ideal_reg == Op_VecZ)) {
    // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
    return new legVecZOper();
  }
  if (legacy) {
    switch (ideal_reg) {
      case Op_VecS: return new legVecSOper();
      case Op_VecD: return new legVecDOper();
      case Op_VecX: return new legVecXOper();
      case Op_VecY: return new legVecYOper();
      case Op_VecZ: return new legVecZOper();
    }
  } else {
    switch (ideal_reg) {
      case Op_VecS: return new vecSOper();
      case Op_VecD: return new vecDOper();
      case Op_VecX: return new vecXOper();
      case Op_VecY: return new vecYOper();
      case Op_VecZ: return new vecZOper();
    }
  }
  ShouldNotReachHere();
  return nullptr;
}
 2269 
// Returns true when the machine node was matched by one of the pure
// register-to-register move rules listed below (vector<->legacy-vector and
// float/double<->vector moves).
bool Matcher::is_reg2reg_move(MachNode* m) {
  switch (m->rule()) {
    case MoveVec2Leg_rule:
    case MoveLeg2Vec_rule:
    case MoveF2VL_rule:
    case MoveF2LEG_rule:
    case MoveVL2F_rule:
    case MoveLEG2F_rule:
    case MoveD2VL_rule:
    case MoveD2LEG_rule:
    case MoveVL2D_rule:
    case MoveLEG2D_rule:
      return true;
    default:
      return false;
  }
}
 2287 
 2288 bool Matcher::is_generic_vector(MachOper* opnd) {
 2289   switch (opnd->opcode()) {
 2290     case VEC:
 2291     case LEGVEC:
 2292       return true;
 2293     default:
 2294       return false;
 2295   }
 2296 }
 2297 
 2298 //------------------------------------------------------------------------
 2299 
// Register mask describing the predicate (vector mask) register class.
const RegMask* Matcher::predicate_reg_mask(void) {
  return &_VECTMASK_REG_mask;
}
 2303 
 2304 // Max vector size in bytes. 0 if not supported.
 2305 int Matcher::vector_width_in_bytes(BasicType bt) {
 2306   assert(is_java_primitive(bt), "only primitive type vectors");
 2307   if (UseSSE < 2) return 0;
 2308   // SSE2 supports 128bit vectors for all types.
 2309   // AVX2 supports 256bit vectors for all types.
 2310   // AVX2/EVEX supports 512bit vectors for all types.
 2311   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2312   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2313   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2314     size = (UseAVX > 2) ? 64 : 32;
 2315   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2316     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2317   // Use flag to limit vector size.
 2318   size = MIN2(size,(int)MaxVectorSize);
 2319   // Minimum 2 values in vector (or 4 for bytes).
 2320   switch (bt) {
 2321   case T_DOUBLE:
 2322   case T_LONG:
 2323     if (size < 16) return 0;
 2324     break;
 2325   case T_FLOAT:
 2326   case T_INT:
 2327     if (size < 8) return 0;
 2328     break;
 2329   case T_BOOLEAN:
 2330     if (size < 4) return 0;
 2331     break;
 2332   case T_CHAR:
 2333     if (size < 4) return 0;
 2334     break;
 2335   case T_BYTE:
 2336     if (size < 4) return 0;
 2337     break;
 2338   case T_SHORT:
 2339     if (size < 4) return 0;
 2340     break;
 2341   default:
 2342     ShouldNotReachHere();
 2343   }
 2344   return size;
 2345 }
 2346 
// Limits on vector size (number of elements) loaded into vector.
int Matcher::max_vector_size(const BasicType bt) {
  // Element count = supported width in bytes / element size in bytes.
  return vector_width_in_bytes(bt)/type2aelembytes(bt);
}
 2351 int Matcher::min_vector_size(const BasicType bt) {
 2352   int max_size = max_vector_size(bt);
 2353   // Min size which can be loaded into vector is 4 bytes.
 2354   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2355   // Support for calling svml double64 vectors
 2356   if (bt == T_DOUBLE) {
 2357     size = 1;
 2358   }
 2359   return MIN2(size,max_size);
 2360 }
 2361 
 2362 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2363   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2364   // by default on Cascade Lake
 2365   if (VM_Version::is_default_intel_cascade_lake()) {
 2366     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2367   }
 2368   return Matcher::max_vector_size(bt);
 2369 }
 2370 
// x86 has fixed-width vector registers only, so there is no scalable
// (length-agnostic) vector register size to report; -1 means unsupported.
int Matcher::scalable_vector_reg_size(const BasicType bt) {
  return -1;
}
 2374 
// Vector ideal reg corresponding to specified size in bytes
uint Matcher::vector_ideal_reg(int size) {
  // Callers must not request a vector wider than the configured maximum.
  assert(MaxVectorSize >= size, "");
  switch(size) {
    case  4: return Op_VecS;  //  32-bit vector
    case  8: return Op_VecD;  //  64-bit vector
    case 16: return Op_VecX;  // 128-bit vector
    case 32: return Op_VecY;  // 256-bit vector
    case 64: return Op_VecZ;  // 512-bit vector
  }
  ShouldNotReachHere();
  return 0;
}
 2388 
// Check for shift by small constant as well
// If 'shift' is a left shift by a constant <= 3 (x86 addressing modes can
// scale by 1/2/4/8) that is only used in address expressions, mark it as
// subsumed into the address and push its inputs for matching; returns true
// when the shift was cloned into the addressing expression.
static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
      shift->in(2)->get_int() <= 3 &&
      // Are there other uses besides address expressions?
      !matcher->is_visited(shift)) {
    address_visited.set(shift->_idx); // Flag as address_visited
    mstack.push(shift->in(2), Matcher::Visit);
    Node *conv = shift->in(1);
#ifdef _LP64
    // Allow Matcher to match the rule which bypass
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
    if (conv->Opcode() == Op_ConvI2L &&
        conv->as_Type()->type()->is_long()->_lo >= 0 &&
        // Are there other uses besides address expressions?
        !matcher->is_visited(conv)) {
      address_visited.set(conv->_idx); // Flag as address_visited
      mstack.push(conv->in(1), Matcher::Pre_Visit);
    } else
#endif
      mstack.push(conv, Matcher::Pre_Visit);
    return true;
  }
  return false;
}
 2415 
 2416 // This function identifies sub-graphs in which a 'load' node is
 2417 // input to two different nodes, and such that it can be matched
 2418 // with BMI instructions like blsi, blsr, etc.
 2419 // Example : for b = -a[i] & a[i] can be matched to blsi r32, m32.
 2420 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2421 // refers to the same node.
 2422 //
 2423 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2424 // This is a temporary solution until we make DAGs expressible in ADL.
template<typename ConType>
class FusedPatternMatcher {
  Node* _op1_node;  // root of the candidate pattern (op1 ...)
  Node* _mop_node;  // the shared memory-operation node (must appear twice)
  int _con_op;      // ideal opcode of the constant node type (Op_ConI/Op_ConL)

  // Returns the input index (1 or 2) of 'n' whose opcode is 'next_op',
  // or -1 if no such input exists. When next_op_idx is -1 the node is
  // treated as commutative and both inputs are tried; otherwise only the
  // given index is checked.
  static int match_next(Node* n, int next_op, int next_op_idx) {
    if (n->in(1) == nullptr || n->in(2) == nullptr) {
      return -1;
    }

    if (next_op_idx == -1) { // n is commutative, try rotations
      if (n->in(1)->Opcode() == next_op) {
        return 1;
      } else if (n->in(2)->Opcode() == next_op) {
        return 2;
      }
    } else {
      assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
      if (n->in(next_op_idx)->Opcode() == next_op) {
        return next_op_idx;
      }
    }
    return -1;
  }

 public:
  FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
    _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }

  // Attempts to match (op1 (op2 Con{ConType} mop) mop) where both 'mop'
  // references are the same node and the constant equals 'con_value'.
  bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
             int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
             typename ConType::NativeType con_value) {
    if (_op1_node->Opcode() != op1) {
      return false;
    }
    // The memory op may only feed op1 and op2; more users would require
    // keeping the loaded value in a register anyway.
    if (_mop_node->outcnt() > 2) {
      return false;
    }
    op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
    if (op1_op2_idx == -1) {
      return false;
    }
    // Memory operation must be the other edge
    int op1_mop_idx = (op1_op2_idx & 1) + 1;

    // Check that the mop node is really what we want
    if (_op1_node->in(op1_mop_idx) == _mop_node) {
      Node* op2_node = _op1_node->in(op1_op2_idx);
      // op2 must be used only by op1 for the fused form to be legal.
      if (op2_node->outcnt() > 1) {
        return false;
      }
      assert(op2_node->Opcode() == op2, "Should be");
      op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
      if (op2_con_idx == -1) {
        return false;
      }
      // Memory operation must be the other edge
      int op2_mop_idx = (op2_con_idx & 1) + 1;
      // Check that the memory operation is the same node
      if (op2_node->in(op2_mop_idx) == _mop_node) {
        // Now check the constant
        const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
        if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
          return true;
        }
      }
    }
    return false;
  }
};
 2496 
// Does the sub-graph rooted at 'n' with shared load 'm' match one of the
// BMI1 fusable patterns? The three shapes per width correspond to
// blsi (And(Sub(0, x), x)), blsr (And(Add(x, -1), x)) and
// blsmsk (Xor(Add(x, -1), x)).
static bool is_bmi_pattern(Node* n, Node* m) {
  assert(UseBMI1Instructions, "sanity");
  if (n != nullptr && m != nullptr) {
    if (m->Opcode() == Op_LoadI) {
      FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
      return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
             bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
             bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
    } else if (m->Opcode() == Op_LoadL) {
      FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
      return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
             bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
             bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
    }
  }
  return false;
}
 2514 
// Should the matcher clone input 'm' of node 'n'?
// Returns true (and pushes 'm' for visiting) when cloning lets the matcher
// fold 'm' into a fused machine instruction pattern.
bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
  // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
  if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
    mstack.push(m, Visit);
    return true;
  }
  if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
    mstack.push(m, Visit);           // m = ShiftCntV
    return true;
  }
  if (is_encode_and_store_pattern(n, m)) {
    mstack.push(m, Visit);
    return true;
  }
  return false;
}
 2532 
// Should the Matcher clone shifts on addressing modes, expecting them
// to be subsumed into complex addressing expressions or compute them
// into registers?
// Returns true when the AddP's inputs were pushed for matching as part of
// an address expression (flagging them in 'address_visited').
bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
  Node *off = m->in(AddPNode::Offset);
  if (off->is_Con()) {
    address_visited.test_set(m->_idx); // Flag as address_visited
    Node *adr = m->in(AddPNode::Address);

    // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
    // AtomicAdd is not an addressing expression.
    // Cheap to find it by looking for screwy base.
    if (adr->is_AddP() &&
        !adr->in(AddPNode::Base)->is_top() &&
        !adr->in(AddPNode::Offset)->is_Con() &&
        LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
        // Are there other uses besides address expressions?
        !is_visited(adr)) {
      address_visited.set(adr->_idx); // Flag as address_visited
      Node *shift = adr->in(AddPNode::Offset);
      // Try to subsume an inner shift into the scale; otherwise match it normally.
      if (!clone_shift(shift, this, mstack, address_visited)) {
        mstack.push(shift, Pre_Visit);
      }
      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
    } else {
      mstack.push(adr, Pre_Visit);
    }

    // Clone X+offset as it also folds into most addressing expressions
    mstack.push(off, Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  } else if (clone_shift(off, this, mstack, address_visited)) {
    address_visited.test_set(m->_idx); // Flag as address_visited
    mstack.push(m->in(AddPNode::Address), Pre_Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  }
  return false;
}
 2574 
// Map a BoolTest condition code to the integer comparison predicate used by
// the assembler. Signed and unsigned variants of the same relation map to
// the same predicate encoding here.
static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
  switch (bt) {
    case BoolTest::eq:
      return Assembler::eq;
    case BoolTest::ne:
      return Assembler::neq;
    case BoolTest::le:
    case BoolTest::ule:
      return Assembler::le;
    case BoolTest::ge:
    case BoolTest::uge:
      return Assembler::nlt;
    case BoolTest::lt:
    case BoolTest::ult:
      return Assembler::lt;
    case BoolTest::gt:
    case BoolTest::ugt:
      return Assembler::nle;
    default : ShouldNotReachHere(); return Assembler::_false;
  }
}
 2596 
// Map a BoolTest condition code to the floating-point comparison predicate
// used by the assembler (ordered except for !=, all non-signaling).
static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
  switch (bt) {
  case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
  // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
  case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
  case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
  case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
  case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
  case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
  default: ShouldNotReachHere(); return Assembler::FALSE_OS;
  }
}
 2609 
// Helper methods for MachSpillCopyNode::implementation().
// Emit a register-to-register vector move for a spill copy, or — in
// non-product builds when masm is null — print the move to 'st' instead.
// src_lo/dst_lo (and *_hi for wide values) are OptoReg indices; 'ireg'
// selects the vector width (Op_VecS .. Op_VecZ).
static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // Anything wider than Op_VecS must occupy an even-aligned, adjacent
  // register pair in the allocator's view.
  assert(ireg == Op_VecS || // 32bit vector
         ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
         "no non-adjacent vector moves" );
  if (masm) {
    switch (ireg) {
    case Op_VecS: // copy whole register
    case Op_VecD:
    case Op_VecX:
#ifndef _LP64
      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      // On AVX-512 without AVX512VL, fall back to the EVEX-encoded extract
      // form for a 128-bit move.
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
     }
#endif
      break;
    case Op_VecY:
#ifndef _LP64
      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      // Same AVX512VL consideration as above, for a 256-bit move.
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
     }
#endif
      break;
    case Op_VecZ:
      // Full 512-bit move (vector length argument 2).
      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
      break;
    default:
      ShouldNotReachHere();
    }
#ifndef PRODUCT
  } else {
    // Debug listing only: mnemonics are approximate (the emitting branch
    // above may pick EVEX variants).
    switch (ireg) {
    case Op_VecS:
    case Op_VecD:
    case Op_VecX:
      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      break;
    case Op_VecY:
    case Op_VecZ:
      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      break;
    default:
      ShouldNotReachHere();
    }
#endif
  }
}
 2667 
// Emit a vector spill (store to stack) or reload (load from stack) for a
// MachSpillCopyNode, or — in non-product builds when masm is null — print
// the operation to 'st'. 'reg' is an OptoReg index, 'stack_offset' is the
// rsp-relative byte offset, and 'ireg' selects the width (Op_VecS..Op_VecZ).
void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
                     int stack_offset, int reg, uint ireg, outputStream* st) {
  if (masm) {
    if (is_load) {
      switch (ireg) {
      case Op_VecS:
        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        break;
      case Op_VecD:
        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        break;
      case Op_VecX:
#ifndef _LP64
        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        // On AVX-512 without AVX512VL, use the EVEX insert form: zero the
        // destination first so the whole register is defined, then insert
        // the 128-bit lane from the stack.
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
        }
#endif
        break;
      case Op_VecY:
#ifndef _LP64
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        // Same AVX512VL consideration as above, for a 256-bit reload.
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
        }
#endif
        break;
      case Op_VecZ:
        // Full 512-bit reload (vector length argument 2).
        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
        break;
      default:
        ShouldNotReachHere();
      }
    } else { // store
      switch (ireg) {
      case Op_VecS:
        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        break;
      case Op_VecD:
        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        break;
      case Op_VecX:
#ifndef _LP64
        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
#else
        // On AVX-512 without AVX512VL, extract the 128-bit lane to memory.
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        }
        else {
          __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
        }
#endif
        break;
      case Op_VecY:
#ifndef _LP64
        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
#else
        // Same AVX512VL consideration as above, for a 256-bit spill.
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        }
        else {
          __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
        }
#endif
        break;
      case Op_VecZ:
        // Full 512-bit spill (vector length argument 2).
        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
        break;
      default:
        ShouldNotReachHere();
      }
    }
#ifndef PRODUCT
  } else {
    // Debug listing only; mirrors the structure of the emitting branch.
    if (is_load) {
      switch (ireg) {
      case Op_VecS:
        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      case Op_VecD:
        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
       case Op_VecX:
        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      case Op_VecY:
      case Op_VecZ:
        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      default:
        ShouldNotReachHere();
      }
    } else { // store
      switch (ireg) {
      case Op_VecS:
        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      case Op_VecD:
        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
       case Op_VecX:
        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      case Op_VecY:
      case Op_VecZ:
        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      default:
        ShouldNotReachHere();
      }
    }
#endif
  }
}
 2790 
 2791 template <class T>
 2792 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2793   int size = type2aelembytes(bt) * len;
 2794   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2795   for (int i = 0; i < len; i++) {
 2796     int offset = i * type2aelembytes(bt);
 2797     switch (bt) {
 2798       case T_BYTE: val->at(i) = con; break;
 2799       case T_SHORT: {
 2800         jshort c = con;
 2801         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2802         break;
 2803       }
 2804       case T_INT: {
 2805         jint c = con;
 2806         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2807         break;
 2808       }
 2809       case T_LONG: {
 2810         jlong c = con;
 2811         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2812         break;
 2813       }
 2814       case T_FLOAT: {
 2815         jfloat c = con;
 2816         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2817         break;
 2818       }
 2819       case T_DOUBLE: {
 2820         jdouble c = con;
 2821         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2822         break;
 2823       }
 2824       default: assert(false, "%s", type2name(bt));
 2825     }
 2826   }
 2827   return val;
 2828 }
 2829 
 2830 static inline jlong high_bit_set(BasicType bt) {
 2831   switch (bt) {
 2832     case T_BYTE:  return 0x8080808080808080;
 2833     case T_SHORT: return 0x8000800080008000;
 2834     case T_INT:   return 0x8000000080000000;
 2835     case T_LONG:  return 0x8000000000000000;
 2836     default:
 2837       ShouldNotReachHere();
 2838       return 0;
 2839   }
 2840 }
 2841 
#ifndef PRODUCT
  // Debug-only listing of a nop pad; _count is the pad size in bytes.
  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
    st->print("nop \t# %d bytes pad for loops and calls", _count);
  }
#endif
 2847 
  // Emit the nop padding; _count is the number of pad bytes (see size()).
  void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
    __ nop(_count);
  }
 2851 
  // A nop pad occupies exactly _count bytes.
  uint MachNopNode::size(PhaseRegAlloc*) const {
    return _count;
  }
 2855 
#ifndef PRODUCT
  // Debug-only listing of a breakpoint node.
  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
    st->print("# breakpoint");
  }
#endif
 2861 
  // Emit an int3 software breakpoint.
  void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
    __ int3();
  }
 2865 
  // Size is computed generically from the emitted code.
  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
    return MachNode::size(ra_);
  }
 2869 
 2870 %}
 2871 
encode %{

  // Code emitted immediately after a call returns. Handles the optional
  // stack-depth sanity check and, for methods returning an inline type as
  // fields, post-processing of the returned registers.
  enc_class call_epilog %{
    if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find majik cookie on stack
      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
      Label L;
      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
      __ jccb(Assembler::equal, L);
      // Die if stack mismatch
      __ int3();
      __ bind(L);
    }
    if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
      // The last return value is not set by the callee but used to pass IsInit information to compiled code.
      // Search for the corresponding projection, get the register and emit code that initialized it.
      uint con = (tf()->range_cc()->cnt() - 1);
      for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
        ProjNode* proj = fast_out(i)->as_Proj();
        if (proj->_con == con) {
          // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
          OptoReg::Name optoReg = ra_->get_reg_first(proj);
          VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
          Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
          // toReg = (rax != 0) ? 1 : 0
          __ testq(rax, rax);
          __ setb(Assembler::notZero, toReg);
          __ movzbl(toReg, toReg);
          if (reg->is_stack()) {
            // Projection was allocated to the stack: spill the flag there.
            int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
            __ movq(Address(rsp, st_off), toReg);
          }
          break;
        }
      }
      if (return_value_is_used()) {
        // An inline type is returned as fields in multiple registers.
        // Rax either contains an oop if the inline type is buffered or a pointer
        // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
        // if the lowest bit is set to allow C2 to use the oop after null checking.
        // rax &= (rax & 1) - 1
        __ movptr(rscratch1, rax);
        __ andptr(rscratch1, 0x1);
        __ subptr(rscratch1, 0x1);
        __ andptr(rax, rscratch1);
      }
    }
  %}

%}
 2921 
 2922 // Operands for bound floating pointer register arguments
 2923 operand rxmm0() %{
 2924   constraint(ALLOC_IN_RC(xmm0_reg));
 2925   match(VecX);
 2926   format%{%}
 2927   interface(REG_INTER);
 2928 %}
 2929 
 2930 //----------OPERANDS-----------------------------------------------------------
 2931 // Operand definitions must precede instruction definitions for correct parsing
 2932 // in the ADLC because operands constitute user defined types which are used in
 2933 // instruction definitions.
 2934 
 2935 // Vectors
 2936 
// Dummy generic vector class. Should be used for all vector operands.
// Replaced with vec[SDXYZ] during post-selection pass.
// All five widths match here so a single generic operand covers any
// vector until the post-selection pass narrows it to a concrete class.
operand vec() %{
  constraint(ALLOC_IN_RC(dynamic));
  match(VecX);
  match(VecY);
  match(VecZ);
  match(VecS);
  match(VecD);

  format %{ %}
  interface(REG_INTER);
%}
 2950 
// Dummy generic legacy vector class. Should be used for all legacy vector operands.
// Replaced with legVec[SDXYZ] during post-selection cleanup.
// Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
// runtime code generation via reg_class_dynamic.
// Same shape as 'vec' above, but resolved to the legacy register classes.
operand legVec() %{
  constraint(ALLOC_IN_RC(dynamic));
  match(VecX);
  match(VecY);
  match(VecZ);
  match(VecS);
  match(VecD);

  format %{ %}
  interface(REG_INTER);
%}
 2966 
// The concrete operand classes below pair each vector width (S/D/X/Y/Z)
// with either a platform register class (vec*) or the legacy register
// class (legVec*); one of them is substituted for the generic vec/legVec
// operands during the post-selection cleanup pass.

// Replaces vec during post-selection cleanup. See above.
operand vecS() %{
  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
  match(VecS);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces legVec during post-selection cleanup. See above.
operand legVecS() %{
  constraint(ALLOC_IN_RC(vectors_reg_legacy));
  match(VecS);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces vec during post-selection cleanup. See above.
operand vecD() %{
  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
  match(VecD);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces legVec during post-selection cleanup. See above.
operand legVecD() %{
  constraint(ALLOC_IN_RC(vectord_reg_legacy));
  match(VecD);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces vec during post-selection cleanup. See above.
operand vecX() %{
  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
  match(VecX);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces legVec during post-selection cleanup. See above.
operand legVecX() %{
  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
  match(VecX);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces vec during post-selection cleanup. See above.
operand vecY() %{
  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
  match(VecY);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces legVec during post-selection cleanup. See above.
operand legVecY() %{
  constraint(ALLOC_IN_RC(vectory_reg_legacy));
  match(VecY);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces vec during post-selection cleanup. See above.
operand vecZ() %{
  constraint(ALLOC_IN_RC(vectorz_reg));
  match(VecZ);

  format %{ %}
  interface(REG_INTER);
%}

// Replaces legVec during post-selection cleanup. See above.
operand legVecZ() %{
  constraint(ALLOC_IN_RC(vectorz_reg_legacy));
  match(VecZ);

  format %{ %}
  interface(REG_INTER);
%}
 3056 
 3057 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 3058 
 3059 // ============================================================================
 3060 
// Halt node: emits a stop (fatal trap with a reason message). Nothing is
// emitted when the Halt is known to be unreachable.
instruct ShouldNotReachHere() %{
  match(Halt);
  format %{ "stop\t# ShouldNotReachHere" %}
  ins_encode %{
    // Only emit code when this Halt can actually be reached.
    if (is_reachable()) {
      __ stop(_halt_reason);
    }
  %}
  ins_pipe(pipe_slow);
%}
 3071 
 3072 // ============================================================================
 3073 
// ---- Scalar float/double add ----
// Pure-SSE forms (UseAVX == 0) are destructive: dst is both source and
// destination. AVX forms are non-destructive three-operand. Immediate
// forms load the constant from the constant table.

// SSE: dst += src (register).
instruct addF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst src));

  format %{ "addss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst += memory operand.
instruct addF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst (LoadF src)));

  format %{ "addss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst += constant (from constant table).
instruct addF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst con));
  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 + src2.
instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src1 src2));

  format %{ "vaddss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 + memory operand.
instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "vaddss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src + constant (from constant table).
instruct addF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src con));

  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst += src (register).
instruct addD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst src));

  format %{ "addsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst += memory operand.
instruct addD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst (LoadD src)));

  format %{ "addsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst += constant (from constant table).
instruct addD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst con));
  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 + src2.
instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src1 src2));

  format %{ "vaddsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 + memory operand.
instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src1 (LoadD src2)));

  format %{ "vaddsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src + constant (from constant table).
instruct addD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src con));

  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
 3215 
// ---- Scalar float/double subtract ----
// Same structure as the add family: destructive SSE forms, non-destructive
// AVX forms, constant-table immediates.

// SSE: dst -= src (register).
instruct subF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst src));

  format %{ "subss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst -= memory operand.
instruct subF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst (LoadF src)));

  format %{ "subss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst -= constant (from constant table).
instruct subF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst con));
  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 - src2.
instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src1 src2));

  format %{ "vsubss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 - memory operand.
instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src1 (LoadF src2)));

  format %{ "vsubss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src - constant (from constant table).
instruct subF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src con));

  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst -= src (register).
instruct subD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst src));

  format %{ "subsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst -= memory operand.
instruct subD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst (LoadD src)));

  format %{ "subsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst -= constant (from constant table).
instruct subD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst con));
  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 - src2.
instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src1 src2));

  format %{ "vsubsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 - memory operand.
instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src1 (LoadD src2)));

  format %{ "vsubsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src - constant (from constant table).
instruct subD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src con));

  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
 3357 
// ---- Scalar float/double multiply ----
// Same structure as the add family: destructive SSE forms, non-destructive
// AVX forms, constant-table immediates.

// SSE: dst *= src (register).
instruct mulF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst src));

  format %{ "mulss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst *= memory operand.
instruct mulF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst (LoadF src)));

  format %{ "mulss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst *= constant (from constant table).
instruct mulF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst con));
  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 * src2.
instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src1 src2));

  format %{ "vmulss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 * memory operand.
instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "vmulss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src * constant (from constant table).
instruct mulF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src con));

  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst *= src (register).
instruct mulD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst src));

  format %{ "mulsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst *= memory operand.
instruct mulD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst (LoadD src)));

  format %{ "mulsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst *= constant (from constant table).
instruct mulD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst con));
  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 * src2.
instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src1 src2));

  format %{ "vmulsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 * memory operand.
instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src1 (LoadD src2)));

  format %{ "vmulsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src * constant (from constant table).
instruct mulD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src con));

  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
 3499 
// ---- Scalar float/double divide ----
// Same structure as the add family: destructive SSE forms, non-destructive
// AVX forms, constant-table immediates.

// SSE: dst /= src (register).
instruct divF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst src));

  format %{ "divss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst /= memory operand.
instruct divF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst (LoadF src)));

  format %{ "divss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE: dst /= constant (from constant table).
instruct divF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst con));
  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 / src2.
instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src1 src2));

  format %{ "vdivss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src1 / memory operand.
instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src1 (LoadF src2)));

  format %{ "vdivss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// AVX: dst = src / constant (from constant table).
instruct divF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src con));

  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst /= src (register).
instruct divD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst src));

  format %{ "divsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst /= memory operand.
instruct divD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst (LoadD src)));

  format %{ "divsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// SSE2: dst /= constant (from constant table).
instruct divD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst con));
  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
 3605 
 3606 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3607   predicate(UseAVX > 0);
 3608   match(Set dst (DivD src1 src2));
 3609 
 3610   format %{ "vdivsd  $dst, $src1, $src2" %}
 3611   ins_cost(150);
 3612   ins_encode %{
 3613     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3614   %}
 3615   ins_pipe(pipe_slow);
 3616 %}
 3617 
 3618 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3619   predicate(UseAVX > 0);
 3620   match(Set dst (DivD src1 (LoadD src2)));
 3621 
 3622   format %{ "vdivsd  $dst, $src1, $src2" %}
 3623   ins_cost(150);
 3624   ins_encode %{
 3625     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3626   %}
 3627   ins_pipe(pipe_slow);
 3628 %}
 3629 
 3630 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3631   predicate(UseAVX > 0);
 3632   match(Set dst (DivD src con));
 3633 
 3634   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3635   ins_cost(150);
 3636   ins_encode %{
 3637     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3638   %}
 3639   ins_pipe(pipe_slow);
 3640 %}
 3641 
 3642 instruct absF_reg(regF dst) %{
 3643   predicate((UseSSE>=1) && (UseAVX == 0));
 3644   match(Set dst (AbsF dst));
 3645   ins_cost(150);
 3646   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3647   ins_encode %{
 3648     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3649   %}
 3650   ins_pipe(pipe_slow);
 3651 %}
 3652 
 3653 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3654   predicate(UseAVX > 0);
 3655   match(Set dst (AbsF src));
 3656   ins_cost(150);
 3657   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3658   ins_encode %{
 3659     int vlen_enc = Assembler::AVX_128bit;
 3660     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3661               ExternalAddress(float_signmask()), vlen_enc);
 3662   %}
 3663   ins_pipe(pipe_slow);
 3664 %}
 3665 
 3666 instruct absD_reg(regD dst) %{
 3667   predicate((UseSSE>=2) && (UseAVX == 0));
 3668   match(Set dst (AbsD dst));
 3669   ins_cost(150);
 3670   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3671             "# abs double by sign masking" %}
 3672   ins_encode %{
 3673     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3674   %}
 3675   ins_pipe(pipe_slow);
 3676 %}
 3677 
 3678 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3679   predicate(UseAVX > 0);
 3680   match(Set dst (AbsD src));
 3681   ins_cost(150);
 3682   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3683             "# abs double by sign masking" %}
 3684   ins_encode %{
 3685     int vlen_enc = Assembler::AVX_128bit;
 3686     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3687               ExternalAddress(double_signmask()), vlen_enc);
 3688   %}
 3689   ins_pipe(pipe_slow);
 3690 %}
 3691 
 3692 instruct negF_reg(regF dst) %{
 3693   predicate((UseSSE>=1) && (UseAVX == 0));
 3694   match(Set dst (NegF dst));
 3695   ins_cost(150);
 3696   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3697   ins_encode %{
 3698     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3699   %}
 3700   ins_pipe(pipe_slow);
 3701 %}
 3702 
 3703 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3704   predicate(UseAVX > 0);
 3705   match(Set dst (NegF src));
 3706   ins_cost(150);
 3707   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3708   ins_encode %{
 3709     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3710                  ExternalAddress(float_signflip()));
 3711   %}
 3712   ins_pipe(pipe_slow);
 3713 %}
 3714 
 3715 instruct negD_reg(regD dst) %{
 3716   predicate((UseSSE>=2) && (UseAVX == 0));
 3717   match(Set dst (NegD dst));
 3718   ins_cost(150);
 3719   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3720             "# neg double by sign flipping" %}
 3721   ins_encode %{
 3722     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3723   %}
 3724   ins_pipe(pipe_slow);
 3725 %}
 3726 
 3727 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3728   predicate(UseAVX > 0);
 3729   match(Set dst (NegD src));
 3730   ins_cost(150);
 3731   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3732             "# neg double by sign flipping" %}
 3733   ins_encode %{
 3734     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3735                  ExternalAddress(double_signflip()));
 3736   %}
 3737   ins_pipe(pipe_slow);
 3738 %}
 3739 
// sqrtss instruction needs destination register to be pre initialized for best performance
// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
// (using dst as both source and destination avoids a partial-register dependency stall).
instruct sqrtF_reg(regF dst) %{
  predicate(UseSSE>=1);
  match(Set dst (SqrtF dst));
  format %{ "sqrtss  $dst, $dst" %}
  ins_encode %{
    __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// sqrtsd instruction needs destination register to be pre initialized for best performance
// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
instruct sqrtD_reg(regD dst) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD dst));
  format %{ "sqrtsd  $dst, $dst" %}
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}
 3763 
// ---------------------------------------- float <-> float16 conversions --------------------------

// Scalar float -> half-float; result lands in a GP register. Delegates to the
// flt_to_flt16 macro-assembler helper, which needs one XMM temporary.
instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
  effect(TEMP tmp);
  match(Set dst (ConvF2HF src));
  ins_cost(125);
  format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
  ins_encode %{
    __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Scalar float -> half-float stored directly to memory (as a StoreC). Uses a
// one-lane opmask (0x1) so only the low 16-bit element is written.
instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
  predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
  effect(TEMP ktmp, TEMP rtmp);
  match(Set mem (StoreC mem (ConvF2HF src)));
  format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    __ movl($rtmp$$Register, 0x1);
    __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
    __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
%}

// Vector float -> half-float, register to register. The 0x04 immediate selects
// the rounding behavior passed to vcvtps2ph.
instruct vconvF2HF(vec dst, vec src) %{
  match(Set dst (VectorCastF2HF src));
  format %{ "vector_conv_F2HF $dst $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Vector float -> half-float with the store folded into the conversion;
// only for stores of at least 16 bytes.
instruct vconvF2HF_mem_reg(memory mem, vec src) %{
  predicate(n->as_StoreVector()->memory_size() >= 16);
  match(Set mem (StoreVector mem (VectorCastF2HF src)));
  format %{ "vcvtps2ph $mem,$src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Scalar half-float (in a GP register) -> float, via macro-assembler helper.
instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
  match(Set dst (ConvHF2F src));
  format %{ "vcvtph2ps $dst,$src" %}
  ins_encode %{
    __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Vector half-float -> float with the load folded into the conversion.
instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
  match(Set dst (VectorCastHF2F (LoadVector mem)));
  format %{ "vcvtph2ps $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Vector half-float -> float, register to register.
instruct vconvHF2F(vec dst, vec src) %{
  match(Set dst (VectorCastHF2F src));
  ins_cost(125);
  format %{ "vector_conv_HF2F $dst,$src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 3838 
 3839 // ---------------------------------------- VectorReinterpret ------------------------------------
 3840 instruct reinterpret_mask(kReg dst) %{
 3841   predicate(n->bottom_type()->isa_vectmask() &&
 3842             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3843   match(Set dst (VectorReinterpret dst));
 3844   ins_cost(125);
 3845   format %{ "vector_reinterpret $dst\t!" %}
 3846   ins_encode %{
 3847     // empty
 3848   %}
 3849   ins_pipe( pipe_slow );
 3850 %}
 3851 
 3852 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3853   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3854             n->bottom_type()->isa_vectmask() &&
 3855             n->in(1)->bottom_type()->isa_vectmask() &&
 3856             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3857             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3858   match(Set dst (VectorReinterpret src));
 3859   effect(TEMP xtmp);
 3860   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3861   ins_encode %{
 3862      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3863      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3864      assert(src_sz == dst_sz , "src and dst size mismatch");
 3865      int vlen_enc = vector_length_encoding(src_sz);
 3866      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3867      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3868   %}
 3869   ins_pipe( pipe_slow );
 3870 %}
 3871 
 3872 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3873   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3874             n->bottom_type()->isa_vectmask() &&
 3875             n->in(1)->bottom_type()->isa_vectmask() &&
 3876             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3877              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3878             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3879   match(Set dst (VectorReinterpret src));
 3880   effect(TEMP xtmp);
 3881   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3882   ins_encode %{
 3883      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3884      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3885      assert(src_sz == dst_sz , "src and dst size mismatch");
 3886      int vlen_enc = vector_length_encoding(src_sz);
 3887      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3888      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3889   %}
 3890   ins_pipe( pipe_slow );
 3891 %}
 3892 
 3893 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3894   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3895             n->bottom_type()->isa_vectmask() &&
 3896             n->in(1)->bottom_type()->isa_vectmask() &&
 3897             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3898              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3899             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3900   match(Set dst (VectorReinterpret src));
 3901   effect(TEMP xtmp);
 3902   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3903   ins_encode %{
 3904      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3905      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3906      assert(src_sz == dst_sz , "src and dst size mismatch");
 3907      int vlen_enc = vector_length_encoding(src_sz);
 3908      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3909      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3910   %}
 3911   ins_pipe( pipe_slow );
 3912 %}
 3913 
 3914 instruct reinterpret(vec dst) %{
 3915   predicate(!n->bottom_type()->isa_vectmask() &&
 3916             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3917   match(Set dst (VectorReinterpret dst));
 3918   ins_cost(125);
 3919   format %{ "vector_reinterpret $dst\t!" %}
 3920   ins_encode %{
 3921     // empty
 3922   %}
 3923   ins_pipe( pipe_slow );
 3924 %}
 3925 
 3926 instruct reinterpret_expand(vec dst, vec src) %{
 3927   predicate(UseAVX == 0 &&
 3928             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3929   match(Set dst (VectorReinterpret src));
 3930   ins_cost(125);
 3931   effect(TEMP dst);
 3932   format %{ "vector_reinterpret_expand $dst,$src" %}
 3933   ins_encode %{
 3934     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3935     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3936 
 3937     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3938     if (src_vlen_in_bytes == 4) {
 3939       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3940     } else {
 3941       assert(src_vlen_in_bytes == 8, "");
 3942       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3943     }
 3944     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3945   %}
 3946   ins_pipe( pipe_slow );
 3947 %}
 3948 
 3949 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3950   predicate(UseAVX > 0 &&
 3951             !n->bottom_type()->isa_vectmask() &&
 3952             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3953             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3954   match(Set dst (VectorReinterpret src));
 3955   ins_cost(125);
 3956   format %{ "vector_reinterpret_expand $dst,$src" %}
 3957   ins_encode %{
 3958     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3959   %}
 3960   ins_pipe( pipe_slow );
 3961 %}
 3962 
 3963 
 3964 instruct vreinterpret_expand(legVec dst, vec src) %{
 3965   predicate(UseAVX > 0 &&
 3966             !n->bottom_type()->isa_vectmask() &&
 3967             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3968             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3969   match(Set dst (VectorReinterpret src));
 3970   ins_cost(125);
 3971   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3972   ins_encode %{
 3973     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3974       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3975       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3976       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3977       default: ShouldNotReachHere();
 3978     }
 3979   %}
 3980   ins_pipe( pipe_slow );
 3981 %}
 3982 
 3983 instruct reinterpret_shrink(vec dst, legVec src) %{
 3984   predicate(!n->bottom_type()->isa_vectmask() &&
 3985             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3986   match(Set dst (VectorReinterpret src));
 3987   ins_cost(125);
 3988   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3989   ins_encode %{
 3990     switch (Matcher::vector_length_in_bytes(this)) {
 3991       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3992       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3993       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3994       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3995       default: ShouldNotReachHere();
 3996     }
 3997   %}
 3998   ins_pipe( pipe_slow );
 3999 %}
 4000 
 4001 // ----------------------------------------------------------------------------------------------------
 4002 
 4003 #ifdef _LP64
// ---------------------------------------- RoundDoubleMode --------------------------------------
// Scalar/vector double rounding with an explicit rounding-mode immediate (rmode).

// Scalar roundsd. Without AVX, clear dst first when dst != src to break the
// false dependency on dst's previous upper bits.
instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
  match(Set dst (RoundDoubleMode src rmode));
  format %{ "roundsd $dst,$src" %}
  ins_cost(150);
  ins_encode %{
    assert(UseSSE >= 4, "required");
    if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    }
    __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
  %}
  ins_pipe(pipe_slow);
%}

// Scalar roundsd with the input taken from the constant table.
instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
  match(Set dst (RoundDoubleMode con rmode));
  format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
  %}
  ins_pipe(pipe_slow);
%}

// Packed double rounding for vectors shorter than 8 elements (AVX vroundpd).
instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
  predicate(Matcher::vector_length(n) < 8);
  match(Set dst (RoundDoubleModeV src rmode));
  format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Packed double rounding for 8-element (512-bit) vectors (AVX-512 vrndscalepd).
instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
  predicate(Matcher::vector_length(n) == 8);
  match(Set dst (RoundDoubleModeV src rmode));
  format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}

// As vroundD_reg, with the vector load folded into the instruction.
instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
  predicate(Matcher::vector_length(n) < 8);
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
  format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// As vround8D_reg, with the vector load folded into the instruction.
instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
  predicate(Matcher::vector_length(n) == 8);
  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
  format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}
 4074 #endif // _LP64
 4075 
// Thread.onSpinWait() intrinsic: emits a single pause instruction.
instruct onspinwait() %{
  match(OnSpinWait);
  ins_cost(200);

  format %{
    $$template
    $$emit$$"pause\t! membar_onspinwait"
  %}
  ins_encode %{
    __ pause();
  %}
  ins_pipe(pipe_slow);
%}

// a * b + c
// Fused multiply-add, double. c is both input and result (Set c ...).
instruct fmaD_reg(regD a, regD b, regD c) %{
  match(Set c (FmaD  c (Binary a b)));
  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
// Fused multiply-add, float. c is both input and result (Set c ...).
instruct fmaF_reg(regF a, regF b, regF c) %{
  match(Set c (FmaF  c (Binary a b)));
  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 4113 
 4114 // ====================VECTOR INSTRUCTIONS=====================================
 4115 
// Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
// They only exist so the register allocator can shuffle between the vec and
// legVec register classes; reaching the encoding at runtime is a bug.
instruct MoveVec2Leg(legVec dst, vec src) %{
  match(Set dst src);
  format %{ "" %}
  ins_encode %{
    ShouldNotReachHere();
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct MoveLeg2Vec(vec dst, legVec src) %{
  match(Set dst src);
  format %{ "" %}
  ins_encode %{
    ShouldNotReachHere();
  %}
  ins_pipe( fpu_reg_reg );
%}
 4134 
 4135 // ============================================================================
 4136 
// Load vectors generic operand pattern
instruct loadV(vec dst, memory mem) %{
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "load_vector $dst,$mem" %}
  ins_encode %{
    // Size-dispatching is done inside the load_vector macro-assembler helper.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
  %}
  ins_pipe( pipe_slow );
%}

// Store vectors generic operand pattern.
// Dispatches on the vector's byte length to the narrowest suitable store.
instruct storeV(memory mem, vec src) %{
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "store_vector $mem,$src\n\t" %}
  ins_encode %{
    switch (Matcher::vector_length_in_bytes(this, $src)) {
      case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
      case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
      case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
      case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
      case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
      default: ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}
 4166 
 4167 // ---------------------------------------- Gather ------------------------------------
 4168 
 4169 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4170 
 4171 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4172   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4173             Matcher::vector_length_in_bytes(n) <= 32);
 4174   match(Set dst (LoadVectorGather mem idx));
 4175   effect(TEMP dst, TEMP tmp, TEMP mask);
 4176   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4177   ins_encode %{
 4178     int vlen_enc = vector_length_encoding(this);
 4179     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4180     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4181     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4182     __ lea($tmp$$Register, $mem$$Address);
 4183     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4184   %}
 4185   ins_pipe( pipe_slow );
 4186 %}
 4187 
 4188 
 4189 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4190   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4191             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4192   match(Set dst (LoadVectorGather mem idx));
 4193   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4194   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 4195   ins_encode %{
 4196     int vlen_enc = vector_length_encoding(this);
 4197     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4198     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4199     __ lea($tmp$$Register, $mem$$Address);
 4200     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4201   %}
 4202   ins_pipe( pipe_slow );
 4203 %}
 4204 
 4205 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4206   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4207             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4208   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4209   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4210   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4211   ins_encode %{
 4212     assert(UseAVX > 2, "sanity");
 4213     int vlen_enc = vector_length_encoding(this);
 4214     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4215     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4216     // Note: Since gather instruction partially updates the opmask register used
 4217     // for predication hense moving mask operand to a temporary.
 4218     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4219     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4220     __ lea($tmp$$Register, $mem$$Address);
 4221     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4222   %}
 4223   ins_pipe( pipe_slow );
 4224 %}
 4225 
 4226 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4227   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4228   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4229   effect(TEMP tmp, TEMP rtmp);
 4230   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4231   ins_encode %{
 4232     int vlen_enc = vector_length_encoding(this);
 4233     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4234     __ lea($tmp$$Register, $mem$$Address);
 4235     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4236   %}
 4237   ins_pipe( pipe_slow );
 4238 %}
 4239 
 4240 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4241                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4242   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4243   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4244   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4245   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4246   ins_encode %{
 4247     int vlen_enc = vector_length_encoding(this);
 4248     int vector_len = Matcher::vector_length(this);
 4249     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4250     __ lea($tmp$$Register, $mem$$Address);
 4251     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4252     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4253                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4254   %}
 4255   ins_pipe( pipe_slow );
 4256 %}
 4257 
 4258 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4259   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4260   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4261   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4262   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4263   ins_encode %{
 4264     int vlen_enc = vector_length_encoding(this);
 4265     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4266     __ lea($tmp$$Register, $mem$$Address);
 4267     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4268   %}
 4269   ins_pipe( pipe_slow );
 4270 %}
 4271 
 4272 
 4273 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4274                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4275   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4276   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4277   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4278   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4279   ins_encode %{
 4280     int vlen_enc = vector_length_encoding(this);
 4281     int vector_len = Matcher::vector_length(this);
 4282     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4283     __ lea($tmp$$Register, $mem$$Address);
 4284     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4285     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4286                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4287   %}
 4288   ins_pipe( pipe_slow );
 4289 %}
 4290 
 4291 
 4292 #ifdef _LP64
// Masked gather of sub-word elements for vectors of at most 8 bytes on
// AVX512BW targets (opmask register predication), zero-offset form (immI_0).
instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // mask_idx starts at zero; presumably a running lane index the helper
    // uses to walk the mask bits -- TODO confirm in vgather8b_masked_offset.
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
    __ lea($tmp$$Register, $mem$$Address);
    // Copy the opmask into a GPR so mask bits can be tested with scalar code.
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
    // noreg offset: the matched offset operand is the constant zero.
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4308 
// Masked gather of sub-word elements for vectors larger than 8 bytes on
// AVX512BW targets (opmask register predication), zero-offset form (immI_0).
instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // Zero the running mask-lane index, materialize the base address, and
    // copy idx_base to a TEMP (presumably advanced by the helper -- TODO confirm).
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
    __ lea($tmp$$Register, $mem$$Address);
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    // Copy the opmask into a GPR so mask bits can be tested with scalar code.
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
    // noreg offset: the matched offset operand is the constant zero.
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4328 
// Masked gather of sub-word elements, <= 8 byte vectors, AVX512BW targets,
// with a runtime register offset added to each index.
instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // Zero the running mask-lane index and materialize the base address.
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
    __ lea($tmp$$Register, $mem$$Address);
    // Copy the opmask into a GPR so mask bits can be tested with scalar code.
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
                                $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4345 
// Masked gather of sub-word elements, > 8 byte vectors, AVX512BW targets,
// with a runtime register offset added to each index.
instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
                                             vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // Zero the running mask-lane index, materialize the base address, and
    // copy idx_base to a TEMP (presumably advanced by the helper -- TODO confirm).
    __ xorq($mask_idx$$Register, $mask_idx$$Register);
    __ lea($tmp$$Register, $mem$$Address);
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    // Copy the opmask into a GPR so mask bits can be tested with scalar code.
    __ kmovql($rtmp2$$Register, $mask$$KRegister);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4365 
// AVX2 fallback for masked sub-word gather, <= 8 byte vectors, zero-offset
// form. The mask lives in a vector register (no opmask available).
instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // Compress the vector mask to one bit per byte in a GPR.
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Each short covers two mask bytes; pext with 0x55555555 keeps every
      // other bit, yielding one mask bit per short element.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // Re-zero mask_idx for use as the running mask-lane index.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4385 
// AVX2 fallback for masked sub-word gather, > 8 byte vectors, zero-offset
// form. The mask lives in a vector register (no opmask available).
instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
                                         vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
  format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int vector_len = Matcher::vector_length(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    // Materialize base address and copy idx_base to a TEMP (presumably
    // advanced by the helper -- TODO confirm).
    __ lea($tmp$$Register, $mem$$Address);
    __ movptr($idx_base_temp$$Register, $idx_base$$Register);
    // Compress the vector mask to one bit per byte in a GPR.
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Each short covers two mask bytes; pext with 0x55555555 keeps every
      // other bit, yielding one mask bit per short element.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // Re-zero mask_idx for use as the running mask-lane index.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4409 
// AVX2 fallback for masked sub-word gather, <= 8 byte vectors, with a runtime
// register offset added to each index.
instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
  effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
  format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ lea($tmp$$Register, $mem$$Address);
    // Compress the vector mask to one bit per byte in a GPR.
    __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
    if (elem_bt == T_SHORT) {
      // Each short covers two mask bytes; pext with 0x55555555 keeps every
      // other bit, yielding one mask bit per short element.
      __ movl($mask_idx$$Register, 0x55555555);
      __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
    }
    // Re-zero mask_idx for use as the running mask-lane index.
    __ xorl($mask_idx$$Register, $mask_idx$$Register);
    __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
                                $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4430 
 4431 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4432                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4433   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4434   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4435   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4436   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4437   ins_encode %{
 4438     int vlen_enc = vector_length_encoding(this);
 4439     int vector_len = Matcher::vector_length(this);
 4440     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4441     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4442     __ lea($tmp$$Register, $mem$$Address);
 4443     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4444     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4445     if (elem_bt == T_SHORT) {
 4446       __ movl($mask_idx$$Register, 0x55555555);
 4447       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4448     }
 4449     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4450     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4451                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4452   %}
 4453   ins_pipe( pipe_slow );
 4454 %}
 4455 #endif
 4456 
 4457 // ====================Scatter=======================================
 4458 
 4459 // Scatter INT, LONG, FLOAT, DOUBLE
 4460 
// Unmasked scatter of INT/LONG/FLOAT/DOUBLE elements (AVX-512 only); an
// all-ones opmask is loaded since evscatter always takes a predicate.
instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
  predicate(UseAVX > 2);
  match(Set mem (StoreVectorScatter mem (Binary src idx)));
  effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);

    assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE

    // Load an all-ones predicate so every lane is stored.
    __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
    __ lea($tmp$$Register, $mem$$Address);
    __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4479 
// Masked scatter of INT/LONG/FLOAT/DOUBLE elements under an opmask predicate.
instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
  match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
  effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
    assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: the scatter instruction partially updates the opmask register used
    // for predication, hence the mask operand is moved to a temporary first.
    __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
    __ lea($tmp$$Register, $mem$$Address);
    __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4497 
 4498 // ====================REPLICATE=======================================
 4499 
 4500 // Replicate byte scalar to be vector
// Replicate a byte scalar from a GPR into every lane of the vector.
instruct vReplB_reg(vec dst, rRegI src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (Replicate src));
  format %{ "replicateB $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (UseAVX >= 2) {
      int vlen_enc = vector_length_encoding(this);
      if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
        assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
        // GPR-source broadcast is an EVEX-only form.
        __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
      } else {
        // AVX2 has no GPR-source broadcast: move to XMM first, then broadcast.
        __ movdl($dst$$XMMRegister, $src$$Register);
        __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      }
    } else {
       assert(UseAVX < 2, "");
      // SSE fallback: duplicate the low byte via unpack/shuffle steps.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 16) {
        assert(vlen == 16, "");
        // Fill the upper 8 bytes for a full 16-byte vector.
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
 4529 
// Replicate a byte loaded from memory into every lane (AVX2+ only, which
// provides a memory-source vpbroadcastb).
instruct ReplB_mem(vec dst, memory mem) %{
  predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (Replicate (LoadB mem)));
  format %{ "replicateB $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4540 
 4541 // ====================ReplicateS=======================================
 4542 
// Replicate a short scalar from a GPR into every lane of the vector.
instruct vReplS_reg(vec dst, rRegI src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
  match(Set dst (Replicate src));
  format %{ "replicateS $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (UseAVX >= 2) {
      if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
        assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
        // GPR-source broadcast is an EVEX-only form.
        __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
      } else {
        // AVX2 has no GPR-source broadcast: move to XMM first, then broadcast.
        __ movdl($dst$$XMMRegister, $src$$Register);
        __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      }
    } else {
      assert(UseAVX < 2, "");
      // SSE fallback: shuffle the low word across the low quadword.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
      if (vlen >= 8) {
        assert(vlen == 8, "");
        // Fill the upper 8 bytes for a full 16-byte vector.
        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
 4570 
 4571 #ifdef _LP64
// Replicate a half-float immediate into every lane via a GPR broadcast.
// NOTE(review): the AVX512-FP16 requirement is enforced only by the assert
// below, not by a predicate -- presumably immH operands only arise when the
// feature is available; verify against the matcher's immH support checks.
instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
  match(Set dst (Replicate con));
  effect(TEMP rtmp);
  format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
    // Load the 16-bit immediate bit pattern and broadcast it.
    __ movl($rtmp$$Register, $con$$constant);
    __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4585 
// Replicate a half-float held in an XMM register: extract the 16-bit value
// to a GPR with vmovw, then broadcast (requires AVX512-FP16).
instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
  predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
  match(Set dst (Replicate src));
  effect(TEMP rtmp);
  format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vmovw($rtmp$$Register, $src$$XMMRegister);
    __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4598 #endif
 4599 
// Replicate a short loaded from memory into every lane (AVX2+ only, which
// provides a memory-source vpbroadcastw).
instruct ReplS_mem(vec dst, memory mem) %{
  predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
  match(Set dst (Replicate (LoadS mem)));
  format %{ "replicateS $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4610 
 4611 // ====================ReplicateI=======================================
 4612 
// Replicate an int scalar from a GPR into every lane of the vector.
instruct ReplI_reg(vec dst, rRegI src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (Replicate src));
  format %{ "replicateI $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      // GPR-source broadcast is an EVEX-only form.
      __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      // AVX2: move to XMM first, then broadcast register-to-register.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // SSE fallback: shuffle the low dword across the vector.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4632 
// Replicate an int loaded from memory into every lane.
instruct ReplI_mem(vec dst, memory mem) %{
  predicate(Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (Replicate (LoadI mem)));
  format %{ "replicateI $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_avx2()) {
      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else if (VM_Version::supports_avx()) {
      // AVX1 has no integer broadcast; use the float broadcast on the same bits.
      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else {
      // SSE fallback: load the dword and shuffle it across the vector.
      __ movdl($dst$$XMMRegister, $mem$$Address);
      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4650 
// Replicate an integral (non-long) immediate by materializing the broadcast
// pattern in the constant table and loading it as a vector.
instruct ReplI_imm(vec dst, immI con) %{
  predicate(Matcher::is_non_long_integral_vector(n));
  match(Set dst (Replicate con));
  format %{ "replicateI $dst,$con" %}
  ins_encode %{
    // The replication count in the constant table depends on the widest load
    // available: 4 bytes (AVX broadcast), 8 (SSE3 movddup) or 16 (plain load),
    // divided by the element size.
    InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
                                                           (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
                                                                   type2aelembytes(Matcher::vector_element_basic_type(this))));
    BasicType bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 4665 
 4666 // Replicate scalar zero to be vector
// Replicate scalar zero to be vector: xor the destination with itself.
instruct ReplI_zero(vec dst, immI_0 zero) %{
  predicate(Matcher::is_non_long_integral_vector(n));
  match(Set dst (Replicate zero));
  format %{ "replicateI $dst,$zero" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
      // EVEX without AVX512VL: use the full-width encoded vpxor.
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // 128-bit xor suffices; it zero-extends into the upper lanes.
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
 4681 
// Replicate the all-ones pattern (integral -1) across the vector.
instruct ReplI_M1(vec dst, immI_M1 con) %{
  predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
  match(Set dst (Replicate con));
  format %{ "vallones $dst" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 4692 
 4693 // ====================ReplicateL=======================================
 4694 
 4695 #ifdef _LP64
 4696 // Replicate long (8 byte) scalar to be vector
// Replicate long (8 byte) scalar to be vector (64-bit build: a single GPR
// holds the whole long).
instruct ReplL_reg(vec dst, rRegL src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate src));
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    int vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      // GPR-source broadcast is an EVEX-only form.
      __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
    } else if (VM_Version::supports_avx2()) {
      // AVX2: move to XMM first, then broadcast register-to-register.
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // SSE fallback: duplicate the low quadword.
      __ movdq($dst$$XMMRegister, $src$$Register);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4716 #else // _LP64
 4717 // Replicate long (8 byte) scalar to be vector
// Replicate long (8 byte) scalar to be vector (32-bit build: the long lives
// in a GPR pair, so low and high halves are assembled via two movdl's).
instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
  predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      // Assemble the 64-bit value from the register pair, then duplicate it.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
      int vlen_enc = Assembler::AVX_256bit;
      // Assemble the value, then broadcast across the 256-bit vector.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // No broadcast available: duplicate in the low lane, then copy the
      // low 128 bits into the high half of the 256-bit vector.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4746 
// Replicate a long into a 512-bit vector on a 32-bit build (8 lanes).
instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
  predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "replicateL $dst,$src" %}
  ins_encode %{
    if (VM_Version::supports_avx512vl()) {
      // Assemble the value, duplicate to 128 bits, widen to 256, then to 512.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
      __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
    } else {
      int vlen_enc = Assembler::AVX_512bit;
      // Assemble the value and broadcast across the full 512-bit vector.
      __ movdl($dst$$XMMRegister, $src$$Register);
      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4770 #endif // _LP64
 4771 
// Replicate a long loaded from memory into every lane.
instruct ReplL_mem(vec dst, memory mem) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate (LoadL mem)));
  format %{ "replicateL $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_avx2()) {
      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else if (VM_Version::supports_sse3()) {
      // SSE3 movddup duplicates the quadword straight from memory.
      __ movddup($dst$$XMMRegister, $mem$$Address);
    } else {
      // SSE2 fallback: load then duplicate the low quadword.
      __ movq($dst$$XMMRegister, $mem$$Address);
      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4789 
 4790 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
instruct ReplL_imm(vec dst, immL con) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate con));
  format %{ "replicateL $dst,$con" %}
  ins_encode %{
    // One table entry suffices with SSE3 (movddup duplicates it); two otherwise.
    InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 4802 
// Replicate long zero: xor the destination with itself.
instruct ReplL_zero(vec dst, immL0 zero) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate zero));
  format %{ "replicateL $dst,$zero" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
      // EVEX without AVX512VL: use the full-width encoded vpxor.
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // 128-bit xor suffices; it zero-extends into the upper lanes.
      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
 4817 
// Replicate the all-ones pattern (long -1) across the vector.
instruct ReplL_M1(vec dst, immL_M1 con) %{
  predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (Replicate con));
  format %{ "vallones $dst" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 4828 
 4829 // ====================ReplicateF=======================================
 4830 
// Replicate a float held in an XMM register into every lane (AVX path).
instruct vReplF_reg(vec dst, vlRegF src) %{
  predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (Replicate src));
  format %{ "replicateF $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen <= 4) {
      // 128-bit vector: an in-lane permute of element 0 is sufficient.
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
    } else if (VM_Version::supports_avx2()) {
      __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 8, "sanity");
      // AVX1: permute within the low lane, then copy it into the high lane.
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4850 
// Replicate a float held in an XMM register (pre-AVX SSE path): shuffle the
// low dword across the vector.
instruct ReplF_reg(vec dst, vlRegF src) %{
  predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (Replicate src));
  format %{ "replicateF $dst,$src" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
 4860 
// Replicate a float loaded from memory into every lane (memory-source
// vbroadcastss exists from AVX1 on).
instruct ReplF_mem(vec dst, memory mem) %{
  predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (Replicate (LoadF mem)));
  format %{ "replicateF $dst,$mem" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 4871 
 4872 // Replicate float scalar immediate to be vector by loading from const table.
// Replicate float scalar immediate to be vector by loading from const table.
instruct ReplF_imm(vec dst, immF con) %{
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (Replicate con));
  format %{ "replicateF $dst,$con" %}
  ins_encode %{
    // Table entry count depends on the widest load available: 1 with an AVX
    // broadcast, 2 with SSE3 movddup, 4 with a plain 16-byte load.
    InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
                                                           VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 4885 
// Replicate float zero: xor the destination with itself.
instruct ReplF_zero(vec dst, immF0 zero) %{
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (Replicate zero));
  format %{ "replicateF $dst,$zero" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
      // EVEX without AVX512VL+DQ: use the full-width encoded integer vpxor.
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      // 128-bit FP xor suffices; it zero-extends into the upper lanes.
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
 4900 
 4901 // ====================ReplicateD=======================================
 4902 
// Replicate double (8 bytes) scalar to be vector
// Strategy by vector length: <= 2 doubles fits one movddup; AVX2 can
// broadcast register-to-register; AVX1 256-bit duplicates in the low lane
// and then copies it to the high lane.
instruct vReplD_reg(vec dst, vlRegD src) %{
  predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (Replicate src));
  format %{ "replicateD $dst,$src" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    int vlen_enc = vector_length_encoding(this);
    if (vlen <= 2) {
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
    } else if (VM_Version::supports_avx2()) {
      __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
    } else {
      assert(vlen == 4, "sanity"); // AVX1: only 256-bit (4 doubles) reaches here
      __ movddup($dst$$XMMRegister, $src$$XMMRegister);
      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4923 
// Replicate a double scalar without SSE3 (no movddup available).
// pshufd control 0x44 selects dwords {0,1,0,1}, duplicating the low
// 64-bit double into both halves of the XMM register.
instruct ReplD_reg(vec dst, vlRegD src) %{
  predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (Replicate src));
  format %{ "replicateD $dst,$src" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  %}
  ins_pipe( pipe_slow );
%}
 4933 
// Replicate a double loaded from memory: vbroadcastsd for 256/512-bit
// vectors (>= 4 doubles), movddup from memory for the 128-bit case.
instruct ReplD_mem(vec dst, memory mem) %{
  predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (Replicate (LoadD mem)));
  format %{ "replicateD $dst,$mem" %}
  ins_encode %{
    if (Matcher::vector_length(this) >= 4) {
      int vlen_enc = vector_length_encoding(this);
      __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
    } else {
      __ movddup($dst$$XMMRegister, $mem$$Address);
    }
  %}
  ins_pipe( pipe_slow );
%}
 4948 
// Replicate double (8 byte) scalar immediate to be vector by loading from const table.
instruct ReplD_imm(vec dst, immD con) %{
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (Replicate con));
  format %{ "replicateD $dst,$con" %}
  ins_encode %{
    // One table copy suffices with SSE3 (movddup can duplicate), two without
    // -- presumably mirroring load_constant_vector's needs; TODO(review):
    // confirm against vreplicate_imm's contract.
    InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
    int vlen = Matcher::vector_length_in_bytes(this);
    __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 4961 
// Replicate double 0.0 across the vector using a self-xor zeroing idiom
// (same strategy as ReplF_zero; any all-zero bit pattern is 0.0).
instruct ReplD_zero(vec dst, immD0 zero) %{
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (Replicate zero));
  format %{ "replicateD $dst,$zero" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    // EVEX without AVX512VL+DQ: fall back to integer vpxor with explicit
    // vector-length encoding; otherwise a plain xorps zeroes the register.
    if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
 4976 
 4977 // ====================VECTOR INSERT=======================================
 4978 
// Insert an integral scalar $val at constant lane $idx of a vector that is
// at most 128 bits wide ($dst is updated in place). Delegates lane-size
// dispatch to the MacroAssembler insert() helper.
instruct insert(vec dst, rRegI val, immU8 idx) %{
  predicate(Matcher::vector_length_in_bytes(n) < 32);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert(Matcher::vector_length_in_bytes(this) >= 8, "required");

    BasicType elem_bt = Matcher::vector_element_basic_type(this);

    assert(is_integral_type(elem_bt), "");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 4996 
 4997 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4998   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4999   match(Set dst (VectorInsert (Binary src val) idx));
 5000   effect(TEMP vtmp);
 5001   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5002   ins_encode %{
 5003     int vlen_enc = Assembler::AVX_256bit;
 5004     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 5005     int elem_per_lane = 16/type2aelembytes(elem_bt);
 5006     int log2epr = log2(elem_per_lane);
 5007 
 5008     assert(is_integral_type(elem_bt), "sanity");
 5009     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5010 
 5011     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 5012     uint y_idx = ($idx$$constant >> log2epr) & 1;
 5013     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5014     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 5015     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5016   %}
 5017   ins_pipe( pipe_slow );
 5018 %}
 5019 
// Insert an integral scalar $val at constant lane $idx of a 512-bit vector
// (AVX-512): extract the affected 128-bit lane (one of four), insert within
// it, then re-insert the lane into $dst.
instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length_in_bytes(n) == 64);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "sanity");

    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    int elem_per_lane = 16/type2aelembytes(elem_bt); // elements per 128-bit lane
    int log2epr = log2(elem_per_lane);

    assert(is_integral_type(elem_bt), "");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(log2epr); // index within the 128-bit lane
    uint y_idx = ($idx$$constant >> log2epr) & 3;        // which of the four 128-bit lanes
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
 5043 
 5044 #ifdef _LP64
// Insert a long scalar at constant lane $idx of a 2-element (128-bit) long
// vector in place, via SSE4 pinsrq.
instruct insert2L(vec dst, rRegL val, immU8 idx) %{
  predicate(Matcher::vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 5058 
 5059 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 5060   predicate(Matcher::vector_length(n) == 4);
 5061   match(Set dst (VectorInsert (Binary src val) idx));
 5062   effect(TEMP vtmp);
 5063   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5064   ins_encode %{
 5065     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 5066     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5067 
 5068     uint x_idx = $idx$$constant & right_n_bits(1);
 5069     uint y_idx = ($idx$$constant >> 1) & 1;
 5070     int vlen_enc = Assembler::AVX_256bit;
 5071     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5072     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 5073     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5074   %}
 5075   ins_pipe( pipe_slow );
 5076 %}
 5077 
// Insert a long scalar at constant lane $idx of an 8-element (512-bit) long
// vector: extract the affected 128-bit lane (one of four), pinsrq, re-insert.
instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n) == 8);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
  ins_encode %{
    assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(1); // long index within the 128-bit lane
    uint y_idx = ($idx$$constant >> 1) & 3;        // which of the four 128-bit lanes
    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}
 5095 #endif
 5096 
// Insert a float scalar at constant lane $idx of a vector of fewer than 8
// floats, in place. insertps takes the destination lane in bits 5:4 of its
// immediate, hence the x_idx << 4.
instruct insertF(vec dst, regF val, immU8 idx) %{
  predicate(Matcher::vector_length(n) < 8);
  match(Set dst (VectorInsert (Binary dst val) idx));
  format %{ "vector_insert $dst,$val,$idx" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");

    assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    uint x_idx = $idx$$constant & right_n_bits(2);
    __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
  %}
  ins_pipe( pipe_slow );
%}
 5112 
 5113 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 5114   predicate(Matcher::vector_length(n) >= 8);
 5115   match(Set dst (VectorInsert (Binary src val) idx));
 5116   effect(TEMP vtmp);
 5117   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5118   ins_encode %{
 5119     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5120     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5121 
 5122     int vlen = Matcher::vector_length(this);
 5123     uint x_idx = $idx$$constant & right_n_bits(2);
 5124     if (vlen == 8) {
 5125       uint y_idx = ($idx$$constant >> 2) & 1;
 5126       int vlen_enc = Assembler::AVX_256bit;
 5127       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5128       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5129       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5130     } else {
 5131       assert(vlen == 16, "sanity");
 5132       uint y_idx = ($idx$$constant >> 2) & 3;
 5133       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5134       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5135       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5136     }
 5137   %}
 5138   ins_pipe( pipe_slow );
 5139 %}
 5140 
 5141 #ifdef _LP64
// Insert a double scalar at constant lane $idx of a 2-element (128-bit)
// double vector in place. The double's bit pattern is moved to a GPR so
// pinsrq (an integer insert) can place it.
instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
  predicate(Matcher::vector_length(n) == 2);
  match(Set dst (VectorInsert (Binary dst val) idx));
  effect(TEMP tmp);
  format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "sanity");
    assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
    assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");

    __ movq($tmp$$Register, $val$$XMMRegister);
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 5157 
 5158 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5159   predicate(Matcher::vector_length(n) == 4);
 5160   match(Set dst (VectorInsert (Binary src val) idx));
 5161   effect(TEMP vtmp, TEMP tmp);
 5162   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5163   ins_encode %{
 5164     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5165     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5166 
 5167     uint x_idx = $idx$$constant & right_n_bits(1);
 5168     uint y_idx = ($idx$$constant >> 1) & 1;
 5169     int vlen_enc = Assembler::AVX_256bit;
 5170     __ movq($tmp$$Register, $val$$XMMRegister);
 5171     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5172     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5173     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5174   %}
 5175   ins_pipe( pipe_slow );
 5176 %}
 5177 
 5178 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 5179   predicate(Matcher::vector_length(n) == 8);
 5180   match(Set dst (VectorInsert (Binary src val) idx));
 5181   effect(TEMP tmp, TEMP vtmp);
 5182   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5183   ins_encode %{
 5184     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5185     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5186 
 5187     uint x_idx = $idx$$constant & right_n_bits(1);
 5188     uint y_idx = ($idx$$constant >> 1) & 3;
 5189     __ movq($tmp$$Register, $val$$XMMRegister);
 5190     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5191     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5192     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5193   %}
 5194   ins_pipe( pipe_slow );
 5195 %}
 5196 #endif
 5197 
 5198 // ====================REDUCTION ARITHMETIC=======================================
 5199 
 5200 // =======================Int Reduction==========================================
 5201 
// Reduce an int vector $src2 combined with scalar input $src1 into scalar
// $dst; the ideal opcode selects the operation (add/mul/and/or/xor/min/max).
// Code generation is delegated to MacroAssembler::reduceI.
instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5220 
 5221 // =======================Long Reduction==========================================
 5222 
 5223 #ifdef _LP64
// Long vector reduction without AVX512DQ. Operands are legVec -- NOTE(review):
// presumably to restrict allocation to the legacy XMM set since the emitted
// sequence cannot use the DQ-only EVEX forms; confirm against reduceL.
instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
  match(Set dst (AddReductionVL src1 src2));
  match(Set dst (MulReductionVL src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5242 
// Long vector reduction when AVX512DQ is available; identical encoding to
// reductionL but operands may use the full (vec) register class.
instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
  match(Set dst (AddReductionVL src1 src2));
  match(Set dst (MulReductionVL src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5261 #endif // _LP64
 5262 
 5263 // =======================Float Reduction==========================================
 5264 
// Strictly-ordered float add/mul reduction for vectors of up to 4 floats.
// $dst carries the incoming accumulator (it is both an input and the result),
// hence TEMP dst in the effect.
instruct reductionF128(regF dst, vec src, vec vtmp) %{
  predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5278 
// Strictly-ordered float add/mul reduction, 8-float (256-bit) vectors.
// Needs two vector temporaries; $dst is the incoming accumulator.
instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
  predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5292 
// Strictly-ordered float add/mul reduction, 16-float (512-bit) vectors.
// legVec operand classes are used at this width.
instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
  predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
  match(Set dst (AddReductionVF dst src));
  match(Set dst (MulReductionVF dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5306 
 5307 
instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
  // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // Note: src1 is matched but not passed to the helper -- the identity is
  // folded in by unordered_reduce_fp's 2-element path. No temporaries needed.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
  match(Set dst (AddReductionVF src1 src2));
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP dst);
  format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5324 
instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
  // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // 4-element variant: one vector temporary for the pairwise combine.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
  match(Set dst (AddReductionVF src1 src2));
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5341 
instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
  // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // 8-element (256-bit) variant: two vector temporaries.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
  match(Set dst (AddReductionVF src1 src2));
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5358 
instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // 16-element (512-bit) variant: legVec operand classes, two temporaries.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
  match(Set dst (AddReductionVF src1 src2));
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5375 
 5376 // =======================Double Reduction==========================================
 5377 
 5378 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5379   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5380   match(Set dst (AddReductionVD dst src));
 5381   match(Set dst (MulReductionVD dst src));
 5382   effect(TEMP dst, TEMP vtmp);
 5383   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5384   ins_encode %{
 5385     int opcode = this->ideal_Opcode();
 5386     int vlen = Matcher::vector_length(this, $src);
 5387     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5388 %}
 5389   ins_pipe( pipe_slow );
 5390 %}
 5391 
// Strictly-ordered double add/mul reduction, 4-double (256-bit) vectors.
instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
  predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
  match(Set dst (AddReductionVD dst src));
  match(Set dst (MulReductionVD dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5405 
// Strictly-ordered double add/mul reduction, 8-double (512-bit) vectors;
// legVec operand classes are used at this width.
instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
  predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
  match(Set dst (AddReductionVD dst src));
  match(Set dst (MulReductionVD dst src));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5419 
 5420 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5421   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5422   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5423   // src1 contains reduction identity
 5424   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5425   match(Set dst (AddReductionVD src1 src2));
 5426   match(Set dst (MulReductionVD src1 src2));
 5427   effect(TEMP dst);
 5428   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5429   ins_encode %{
 5430     int opcode = this->ideal_Opcode();
 5431     int vlen = Matcher::vector_length(this, $src2);
 5432     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5433 %}
 5434   ins_pipe( pipe_slow );
 5435 %}
 5436 
instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
  // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // 4-element (256-bit) variant: one vector temporary.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
  match(Set dst (AddReductionVD src1 src2));
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5453 
instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
  // src1 contains reduction identity
  // 8-element (512-bit) variant: legVec operand classes, two temporaries.
  predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
  match(Set dst (AddReductionVD src1 src2));
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5470 
 5471 // =======================Byte Reduction==========================================
 5472 
 5473 #ifdef _LP64
// Byte vector reduction without AVX512BW (legVec operands). MulReductionVI
// is intentionally absent here -- byte multiply reduction is handled by the
// dedicated mul_reductionB/mul_reduction64B rules below.
instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5491 
// Byte vector reduction when AVX512BW is available; same encoding as
// reductionB but operands may use the full (vec) register class.
instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5509 #endif
 5510 
 5511 // =======================Short Reduction==========================================
 5512 
// Short vector reduction: combine scalar $src1 with vector $src2 into $dst;
// the ideal opcode selects the operation (add/mul/and/or/xor/min/max).
instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
  match(Set dst (AddReductionVI src1 src2));
  match(Set dst (MulReductionVI src1 src2));
  match(Set dst (AndReductionV  src1 src2));
  match(Set dst ( OrReductionV  src1 src2));
  match(Set dst (XorReductionV  src1 src2));
  match(Set dst (MinReductionV  src1 src2));
  match(Set dst (MaxReductionV  src1 src2));
  effect(TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5531 
 5532 // =======================Mul Reduction==========================================
 5533 
// Byte multiply reduction for vectors of up to 32 bytes. Kept separate from
// reductionB (which omits MulReductionVI); delegates to mulreduceB.
instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
            Matcher::vector_length(n->in(2)) <= 32); // src2
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5547 
 5548 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5549   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5550             Matcher::vector_length(n->in(2)) == 64); // src2
 5551   match(Set dst (MulReductionVI src1 src2));
 5552   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5553   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5554   ins_encode %{
 5555     int opcode = this->ideal_Opcode();
 5556     int vlen = Matcher::vector_length(this, $src2);
 5557     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5558   %}
 5559   ins_pipe( pipe_slow );
 5560 %}
 5561 
 5562 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
// Float min/max reduction, 2-element vectors.  The predicate accepts the
// pattern only when the scalar input src1 is the identity element for the
// operation (+inf for a min reduction, -inf for a max reduction).  Because
// of that, src1 is matched but never encoded: reducing src2 alone already
// produces the correct result.
instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
                            legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
            Matcher::vector_length(n->in(2)) == 2);
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
  format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    // 'false': dst is output only, not merged as an accumulator input.
    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Float min/max reduction for vectors of 4 or more elements.  Same
// identity-input restriction as the 2-element rule; the longer vector
// needs one extra scratch register (xmm_0).
instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
                           legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
            Matcher::vector_length(n->in(2)) >= 4);
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
  format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5605 
// Accumulator ("_av") form of the 2-element float min/max reduction:
// dst is both the incoming scalar and the result, and there is no
// identity-value restriction on it.  The 'true' flag passed to
// reduceFloatMinMax selects this merge-with-dst behavior (the
// identity-input rules pass 'false').
instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
                               legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
            Matcher::vector_length(n->in(2)) == 2);
  match(Set dst (MinReductionV dst src));
  match(Set dst (MaxReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
  format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}


// Accumulator ("_av") float min/max reduction for vectors of 4 or more
// elements; needs the extra xmm_0 scratch register.
instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
                              legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
            Matcher::vector_length(n->in(2)) >= 4);
  match(Set dst (MinReductionV dst src));
  match(Set dst (MaxReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
  format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5644 
 5645 
//--------------------Min/Max Double Reduction --------------------
// Double min/max reduction, 2-element vectors.  As with the float rules,
// the predicate only accepts the pattern when src1 is the identity element
// (+inf for min, -inf for max), so src1 is matched but never encoded.
instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
                            rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
            Matcher::vector_length(n->in(2)) == 2);
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
  format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    // 'false': dst is output only, not merged as an accumulator input.
    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Double min/max reduction for vectors of 4 or more elements; same
// identity-input restriction, one extra scratch register (tmp5).
instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
                           legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
                           rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
            Matcher::vector_length(n->in(2)) >= 4);
  match(Set dst (MinReductionV src1 src2));
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
  format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src2);
    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5690 
 5691 
// Accumulator ("_av") form of the 2-element double min/max reduction:
// dst is both the incoming scalar and the result, with no identity-value
// restriction.  The 'true' flag to reduceDoubleMinMax selects this
// merge-with-dst behavior (the identity-input rules pass 'false').
instruct minmax_reduction2D_av(legRegD dst, legVec src,
                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
                               rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
            Matcher::vector_length(n->in(2)) == 2);
  match(Set dst (MinReductionV dst src));
  match(Set dst (MaxReductionV dst src));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
  format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Accumulator ("_av") double min/max reduction for vectors of 4 or more
// elements; needs the extra tmp5 scratch register.
instruct minmax_reductionD_av(legRegD dst, legVec src,
                              legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
                              rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
            Matcher::vector_length(n->in(2)) >= 4);
  match(Set dst (MinReductionV dst src));
  match(Set dst (MaxReductionV dst src));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
  format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "sanity");

    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this, $src);
    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 5731 
 5732 // ====================VECTOR ARITHMETIC=======================================
 5733 
 5734 // --------------------------------- ADD --------------------------------------
 5735 
 5736 // Bytes vector add
// SSE (UseAVX == 0) two-operand form: dst is both first input and result.
instruct vaddB(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVB dst src));
  format %{ "paddb   $dst,$src\t! add packedB" %}
  ins_encode %{
    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddB_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
  ins_encode %{
    // Encodes the AVX/EVEX vector-length prefix for this node's vector size.
    int vlen_enc = vector_length_encoding(this);
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with the second operand loaded straight from memory.  Only for
// vectors wider than 8 bytes (NOTE(review): presumably so the full-width
// vector load cannot over-read a packed 8-byte vector in memory — confirm).
instruct vaddB_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5769 
 5770 // Shorts/Chars vector add
// SSE two-operand form: dst is both first input and result (dst += src).
instruct vaddS(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVS dst src));
  format %{ "paddw   $dst,$src\t! add packedS" %}
  ins_encode %{
    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddS_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vaddS_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5803 
 5804 // Integers vector add
// SSE two-operand form: dst is both first input and result (dst += src).
instruct vaddI(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVI dst src));
  format %{ "paddd   $dst,$src\t! add packedI" %}
  ins_encode %{
    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddI_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVI src1 src2));
  format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}


// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vaddI_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVI src (LoadVector mem)));
  format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5838 
 5839 // Longs vector add
// SSE two-operand form: dst is both first input and result (dst += src).
instruct vaddL(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVL dst src));
  format %{ "paddq   $dst,$src\t! add packedL" %}
  ins_encode %{
    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddL_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVL src1 src2));
  format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vaddL_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVL src (LoadVector mem)));
  format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5872 
 5873 // Floats vector add
// SSE two-operand form: dst is both first input and result (dst += src).
instruct vaddF(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVF dst src));
  format %{ "addps   $dst,$src\t! add packedF" %}
  ins_encode %{
    __ addps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddF_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVF src1 src2));
  format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vaddF_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVF src (LoadVector mem)));
  format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5906 
 5907 // Doubles vector add
// SSE two-operand form: dst is both first input and result (dst += src).
instruct vaddD(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AddVD dst src));
  format %{ "addpd   $dst,$src\t! add packedD" %}
  ins_encode %{
    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vaddD_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddVD src1 src2));
  format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vaddD_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AddVD src (LoadVector mem)));
  format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5940 
 5941 // --------------------------------- SUB --------------------------------------
 5942 
 5943 // Bytes vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubB(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVB dst src));
  format %{ "psubb   $dst,$src\t! sub packedB" %}
  ins_encode %{
    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vsubB_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVB src1 src2));
  format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with the subtrahend loaded straight from memory.  Only for
// vectors wider than 8 bytes (NOTE(review): presumably so the full-width
// vector load cannot over-read a packed 8-byte vector in memory — confirm).
instruct vsubB_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVB src (LoadVector mem)));
  format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 5976 
 5977 // Shorts/Chars vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubS(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVS dst src));
  format %{ "psubw   $dst,$src\t! sub packedS" %}
  ins_encode %{
    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}


// AVX three-operand (non-destructive) form.
instruct vsubS_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vsubS_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6011 
 6012 // Integers vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubI(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVI dst src));
  format %{ "psubd   $dst,$src\t! sub packedI" %}
  ins_encode %{
    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vsubI_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVI src1 src2));
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vsubI_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVI src (LoadVector mem)));
  format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6045 
 6046 // Longs vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubL(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVL dst src));
  format %{ "psubq   $dst,$src\t! sub packedL" %}
  ins_encode %{
    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vsubL_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVL src1 src2));
  format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}


// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vsubL_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVL src (LoadVector mem)));
  format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6080 
 6081 // Floats vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubF(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVF dst src));
  format %{ "subps   $dst,$src\t! sub packedF" %}
  ins_encode %{
    __ subps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vsubF_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVF src1 src2));
  format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vsubF_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVF src (LoadVector mem)));
  format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6114 
 6115 // Doubles vector sub
// SSE two-operand form: dst is both first input and result (dst -= src).
instruct vsubD(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (SubVD dst src));
  format %{ "subpd   $dst,$src\t! sub packedD" %}
  ins_encode %{
    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vsubD_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubVD src1 src2));
  format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vsubD_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (SubVD src (LoadVector mem)));
  format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6148 
 6149 // --------------------------------- MUL --------------------------------------
 6150 
 6151 // Byte vector mul
// Byte multiply for vectors of at most 8 bytes.  x86 has no packed byte
// multiply, so both inputs are widened to 16-bit words, multiplied, each
// product truncated to its low byte, and the words repacked into bytes.
instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 8);
  match(Set dst (MulVB src1 src2));
  effect(TEMP dst, TEMP xtmp);
  format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    // Sign-extend each byte to a 16-bit word (pmovsxbw is SSE4.1).
    __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
    __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
    // 16-bit products; only the low byte of each word is wanted.
    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
    // Shift left then right by 8 to zero the high byte of every word.
    __ psllw($dst$$XMMRegister, 8);
    __ psrlw($dst$$XMMRegister, 8);
    // Repack the (now 0..255) words back into bytes.
    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 6168 
// Byte multiply, SSE (UseAVX == 0), vectors wider than 8 bytes.  With no
// packed byte multiply available and no room to widen in place, the odd-
// and even-indexed bytes are multiplied as two separate 16-bit passes and
// the results OR-combined.
instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
  predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (MulVB src1 src2));
  effect(TEMP dst, TEMP xtmp);
  format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    // Odd-index elements: shift them down into the low byte of each word,
    // word-multiply, then shift the products back up to the odd positions.
    __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
    __ psrlw($dst$$XMMRegister, 8);
    __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
    __ psrlw($xtmp$$XMMRegister, 8);
    __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    // Even-index elements: a plain word multiply leaves each even-byte
    // product in the low byte of its word; mask off the high byte.
    __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
    __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
    __ psllw($xtmp$$XMMRegister, 8);
    __ psrlw($xtmp$$XMMRegister, 8);
    // Combine the odd and even halves.
    __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 6193 
// Byte multiply, AVX, vectors wider than 8 bytes.  Same odd/even split as
// the SSE version above, but the three-operand forms avoid the extra moves.
instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
  predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (MulVB src1 src2));
  effect(TEMP xtmp1, TEMP xtmp2);
  format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    // Odd-index elements: bring each odd byte into the low byte of its
    // word, multiply, then shift the products back to the odd positions.
    __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
    __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
    __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
    __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
    // Even-index elements: multiply in place and mask each product down
    // to the low byte of its word.
    __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
    __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
    __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
    // Combine the odd and even halves into dst.
    __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6215 
 6216 // Shorts/Chars vector mul
// SSE two-operand form: dst is both first input and result (dst *= src).
instruct vmulS(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (MulVS dst src));
  format %{ "pmullw  $dst,$src\t! mul packedS" %}
  ins_encode %{
    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vmulS_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vmulS_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6249 
 6250 // Integers vector mul
// SSE two-operand form: dst is both first input and result (dst *= src).
// pmulld is an SSE4.1 instruction, hence the UseSSE check in the encoder.
instruct vmulI(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (MulVI dst src));
  format %{ "pmulld  $dst,$src\t! mul packedI" %}
  ins_encode %{
    assert(UseSSE > 3, "required");
    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand (non-destructive) form.
instruct vmulI_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulVI src1 src2));
  format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX form with a memory operand; vectors wider than 8 bytes only.
instruct vmulI_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (MulVI src (LoadVector mem)));
  format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6284 
 6285 // Longs vector mul
// Long vector multiply using the native AVX-512 packed 64-bit multiply
// (vpmullq).  Applicable for full 512-bit vectors with AVX512DQ, or for
// any vector length when the VL+DQ extensions are present.
// NOTE(review): the memory form below additionally requires a vector
// wider than 8 bytes in the vldq case — confirm the asymmetry is intended.
instruct evmulL_reg(vec dst, vec src1, vec src2) %{
  predicate((Matcher::vector_length_in_bytes(n) == 64 &&
             VM_Version::supports_avx512dq()) ||
            VM_Version::supports_avx512vldq());
  match(Set dst (MulVL src1 src2));
  // Explicit cost; long multiply is expensive relative to other vector ops.
  ins_cost(500);
  format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6300 
 6301 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6302   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6303              VM_Version::supports_avx512dq()) ||
 6304             (Matcher::vector_length_in_bytes(n) > 8 &&
 6305              VM_Version::supports_avx512vldq()));
 6306   match(Set dst (MulVL src (LoadVector mem)));
 6307   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6308   ins_cost(500);
 6309   ins_encode %{
 6310     assert(UseAVX > 2, "required");
 6311     int vlen_enc = vector_length_encoding(this);
 6312     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6313   %}
 6314   ins_pipe( pipe_slow );
 6315 %}
 6316 
 6317 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6318   predicate(UseAVX == 0);
 6319   match(Set dst (MulVL src1 src2));
 6320   ins_cost(500);
 6321   effect(TEMP dst, TEMP xtmp);
 6322   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6323   ins_encode %{
 6324     assert(VM_Version::supports_sse4_1(), "required");
 6325     // Get the lo-hi products, only the lower 32 bits is in concerns
 6326     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6327     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6328     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6329     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6330     __ psllq($dst$$XMMRegister, 32);
 6331     // Get the lo-lo products
 6332     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6333     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6334     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6335   %}
 6336   ins_pipe( pipe_slow );
 6337 %}
 6338 
 6339 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6340   predicate(UseAVX > 0 &&
 6341             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6342               !VM_Version::supports_avx512dq()) ||
 6343              (Matcher::vector_length_in_bytes(n) < 64 &&
 6344               !VM_Version::supports_avx512vldq())));
 6345   match(Set dst (MulVL src1 src2));
 6346   effect(TEMP xtmp1, TEMP xtmp2);
 6347   ins_cost(500);
 6348   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6349   ins_encode %{
 6350     int vlen_enc = vector_length_encoding(this);
 6351     // Get the lo-hi products, only the lower 32 bits is in concerns
 6352     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6353     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6354     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6355     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6356     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6357     // Get the lo-lo products
 6358     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6359     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6360   %}
 6361   ins_pipe( pipe_slow );
 6362 %}
 6363 
 6364 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6365   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6366   match(Set dst (MulVL src1 src2));
 6367   ins_cost(100);
 6368   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6369   ins_encode %{
 6370     int vlen_enc = vector_length_encoding(this);
 6371     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6372   %}
 6373   ins_pipe( pipe_slow );
 6374 %}
 6375 
 6376 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6377   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6378   match(Set dst (MulVL src1 src2));
 6379   ins_cost(100);
 6380   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6381   ins_encode %{
 6382     int vlen_enc = vector_length_encoding(this);
 6383     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6384   %}
 6385   ins_pipe( pipe_slow );
 6386 %}
 6387 
 6388 // Floats vector mul
// SSE-only packed float multiply (two-operand, destructive dst).
instruct vmulF(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (MulVF dst src));
  format %{ "mulps   $dst,$src\t! mul packedF" %}
  ins_encode %{
    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand packed float multiply.
instruct vmulF_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulVF src1 src2));
  format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX packed float multiply with the vector load folded into the multiply.
// Restricted to vectors wider than 8 bytes so the full-width read is safe.
instruct vmulF_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (MulVF src (LoadVector mem)));
  format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6421 
 6422 // Doubles vector mul
// SSE-only packed double multiply (two-operand, destructive dst).
instruct vmulD(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (MulVD dst src));
  format %{ "mulpd   $dst,$src\t! mul packedD" %}
  ins_encode %{
    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand packed double multiply.
instruct vmulD_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX packed double multiply with the vector load folded into the multiply.
instruct vmulD_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6455 
 6456 // --------------------------------- DIV --------------------------------------
 6457 
 6458 // Floats vector div
// SSE-only packed float divide (two-operand, destructive dst).
instruct vdivF(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (DivVF dst src));
  format %{ "divps   $dst,$src\t! div packedF" %}
  ins_encode %{
    __ divps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand packed float divide.
instruct vdivF_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivVF src1 src2));
  format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX packed float divide with the divisor loaded straight from memory.
instruct vdivF_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (DivVF src (LoadVector mem)));
  format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6491 
 6492 // Doubles vector div
// SSE-only packed double divide (two-operand, destructive dst).
instruct vdivD(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (DivVD dst src));
  format %{ "divpd   $dst,$src\t! div packedD" %}
  ins_encode %{
    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand packed double divide.
instruct vdivD_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivVD src1 src2));
  format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// AVX packed double divide with the divisor loaded straight from memory.
instruct vdivD_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (DivVD src (LoadVector mem)));
  format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6525 
 6526 // ------------------------------ MinMax ---------------------------------------
 6527 
 6528 // Byte, Short, Int vector Min/Max
// SSE min/max for byte/short/int elements.  One rule serves both MinV and
// MaxV; the encode block dispatches on the node's ideal opcode.
instruct minmax_reg_sse(vec dst, vec src) %{
  predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
            UseAVX == 0);
  match(Set dst (MinV dst src));
  match(Set dst (MaxV dst src));
  format %{ "vector_minmax  $dst,$src\t!  " %}
  ins_encode %{
    // pmins*/pmaxs* variants used by the macro assembler need SSE4.1.
    assert(UseSSE >= 4, "required");

    int opcode = this->ideal_Opcode();
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX min/max for byte/short/int elements, three-operand form.
instruct vminmax_reg(vec dst, vec src1, vec src2) %{
  predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
            UseAVX > 0);
  match(Set dst (MinV src1 src2));
  match(Set dst (MaxV src1 src2));
  format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);

    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6560 
 6561 // Long vector Min/Max
// SSE long min/max.  rxmm0 is pinned because the pminmax expansion for
// T_LONG uses blendvpd, whose implicit mask operand must live in xmm0.
// NOTE(review): the two match rules put dst on different sides (MinV dst src
// vs. MaxV src dst); MinV/MaxV are commutative so both orderings match —
// confirm this asymmetry is intentional.
instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
  predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
            UseAVX == 0);
  match(Set dst (MinV dst src));
  match(Set dst (MaxV src dst));
  effect(TEMP dst, TEMP tmp);
  format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");

    int opcode = this->ideal_Opcode();
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    assert(elem_bt == T_LONG, "sanity");

    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX long min/max without AVX512VL: legVec operands keep the allocator in
// the lower 16 XMM registers, which pre-VL encodings can address.
instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
            UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (MinV src1 src2));
  match(Set dst (MaxV src1 src2));
  effect(TEMP dst);
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    int opcode = this->ideal_Opcode();
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    assert(elem_bt == T_LONG, "sanity");

    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6598 
 6599 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6600   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6601             Matcher::vector_element_basic_type(n) == T_LONG);
 6602   match(Set dst (MinV src1 src2));
 6603   match(Set dst (MaxV src1 src2));
 6604   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6605   ins_encode %{
 6606     assert(UseAVX > 2, "required");
 6607 
 6608     int vlen_enc = vector_length_encoding(this);
 6609     int opcode = this->ideal_Opcode();
 6610     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6611     assert(elem_bt == T_LONG, "sanity");
 6612 
 6613     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6614   %}
 6615   ins_pipe( pipe_slow );
 6616 %}
 6617 
 6618 // Float/Double vector Min/Max
// AVX float/double min/max up to 32 bytes.  Three temps are needed because
// IEEE-correct min/max must special-case NaN and signed zero, which takes
// several blend/compare steps in the macro assembler.
instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
            is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
            UseAVX > 0);
  match(Set dst (MinV a b));
  match(Set dst (MaxV a b));
  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);

    __ vminmax_fp(opcode, elem_bt,
                  $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// EVEX float/double min/max for 512-bit vectors; uses an opmask register
// (ktmp) instead of a vector temp for the NaN/zero special-casing.
instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
  predicate(Matcher::vector_length_in_bytes(n) == 64 &&
            is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
  match(Set dst (MinV a b));
  match(Set dst (MaxV a b));
  effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);

    __ evminmax_fp(opcode, elem_bt,
                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6661 
 6662 // ------------------------------ Unsigned vector Min/Max ----------------------
 6663 
// Unsigned min/max, register form.  T_LONG is excluded unless AVX512VL is
// available because the unsigned quadword instructions are EVEX-only; the
// long fallback is vector_uminmaxq_reg below.
instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
  match(Set dst (UMinV a b));
  match(Set dst (UMaxV a b));
  format %{ "vector_uminmax $dst,$a,$b\t!" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    assert(is_integral_type(elem_bt), "");
    __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Unsigned min/max with the second operand folded from memory.
instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
  match(Set dst (UMinV a (LoadVector b)));
  match(Set dst (UMaxV a (LoadVector b)));
  format %{ "vector_uminmax $dst,$a,$b\t!" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    assert(is_integral_type(elem_bt), "");
    __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$Address, $b$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Unsigned long min/max fallback for pre-AVX512VL hardware; synthesized in
// the macro assembler using two vector temps.
instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (UMinV a b));
  match(Set dst (UMaxV a b));
  effect(TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_uminmaxq $dst,$a,$b\t! using xtmp1 and xtmp2 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Opmask-predicated unsigned min/max (merge-masking: lanes with a clear mask
// bit keep dst's old value, hence 'true' for merge).
instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (UMinV (Binary dst src2) mask));
  match(Set dst (UMaxV (Binary dst src2) mask));
  format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Opmask-predicated unsigned min/max with a memory second operand.
instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
  match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6735 
 6736 // --------------------------------- Signum/CopySign ---------------------------
 6737 
// Scalar float signum: dst = 0.0 if dst is 0.0/NaN-handled per signum_fp,
// else +/-1.0.  'zero' and 'one' supply the constants; flags are clobbered.
instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
  match(Set dst (SignumF dst (Binary zero one)));
  effect(KILL cr);
  format %{ "signumF $dst, $dst" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Scalar double signum; same shape as the float rule.
instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
  match(Set dst (SignumD dst (Binary zero one)));
  effect(KILL cr);
  format %{ "signumD $dst, $dst" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Vector signum on pre-AVX512VL hardware (<= 32-byte vectors); needs one
// vector temp for the blend-based expansion.
instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP dst, TEMP xtmp1);
  format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vec_enc = vector_length_encoding(this);
    __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
                         $xtmp1$$XMMRegister, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Vector signum using EVEX opmask-based expansion (AVX512VL, or 512-bit).
instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP dst, TEMP ktmp1);
  format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vec_enc = vector_length_encoding(this);
    __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
                          $ktmp1$$KRegister, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6789 
 6790 // ---------------------------------------
 6791 // For copySign use 0xE4 as writemask for vpternlog
 6792 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6793 // C (xmm2) is set to 0x7FFFFFFF
 6794 // Wherever xmm2 is 0, we want to pick from B (sign)
 6795 // Wherever xmm2 is 1, we want to pick from A (src)
 6796 //
 6797 // A B C Result
 6798 // 0 0 0 0
 6799 // 0 0 1 0
 6800 // 0 1 0 1
 6801 // 0 1 1 0
 6802 // 1 0 0 0
 6803 // 1 0 1 1
 6804 // 1 1 0 1
 6805 // 1 1 1 1
 6806 //
 6807 // Result going from high bit to low bit is 0x11100100 = 0xe4
 6808 // ---------------------------------------
 6809 
 6810 #ifdef _LP64
// CopySign for float: build the 0x7FFFFFFF magnitude mask in tmp1, then one
// vpternlogd with imm8 0xE4 selects magnitude bits from dst and the sign bit
// from src (truth table derivation in the comment block above).
instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
  match(Set dst (CopySignF dst src));
  effect(TEMP tmp1, TEMP tmp2);
  format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
  ins_encode %{
    __ movl($tmp2$$Register, 0x7FFFFFFF);
    __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
    __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
%}

// CopySign for double; same scheme with a 64-bit magnitude mask and
// vpternlogq.  The immD zero operand exists only to shape the match tree.
instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
  match(Set dst (CopySignD dst (Binary src zero)));
  ins_cost(100);
  effect(TEMP tmp1, TEMP tmp2);
  format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
  ins_encode %{
    __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
    __ movq($tmp1$$XMMRegister, $tmp2$$Register);
    __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe( pipe_slow );
%}
 6835 
 6836 #endif // _LP64
 6837 
 6838 //----------------------------- CompressBits/ExpandBits ------------------------
 6839 
// CompressBits (int): BMI2 pext — gather the src bits selected by mask into
// the low-order bits of dst.
instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
  predicate(n->bottom_type()->isa_int());
  match(Set dst (CompressBits src mask));
  format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
  ins_encode %{
    __ pextl($dst$$Register, $src$$Register, $mask$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// ExpandBits (int): BMI2 pdep — scatter the low-order src bits into the
// positions selected by mask.
instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
  predicate(n->bottom_type()->isa_int());
  match(Set dst (ExpandBits src mask));
  format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
  ins_encode %{
    __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// CompressBits with the mask read directly from memory.
instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
  predicate(n->bottom_type()->isa_int());
  match(Set dst (CompressBits src (LoadI mask)));
  format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
  ins_encode %{
    __ pextl($dst$$Register, $src$$Register, $mask$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// ExpandBits with the mask read directly from memory.
instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
  predicate(n->bottom_type()->isa_int());
  match(Set dst (ExpandBits src (LoadI mask)));
  format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
  ins_encode %{
    __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
  %}
  ins_pipe( pipe_slow );
%}
 6879 
 6880 // --------------------------------- Sqrt --------------------------------------
 6881 
// Packed float square root, register source.
instruct vsqrtF_reg(vec dst, vec src) %{
  match(Set dst (SqrtVF src));
  format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Packed float square root with the source folded from memory; restricted
// to vectors wider than 8 bytes so a full-width read is safe.
instruct vsqrtF_mem(vec dst, memory mem) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
  match(Set dst (SqrtVF (LoadVector mem)));
  format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Packed double square root, register source.
instruct vsqrtD_reg(vec dst, vec src) %{
  match(Set dst (SqrtVD src));
  format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Packed double square root with the source folded from memory.
instruct vsqrtD_mem(vec dst, memory mem) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
  match(Set dst (SqrtVD (LoadVector mem)));
  format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
  ins_encode %{
    assert(UseAVX > 0, "required");
    int vlen_enc = vector_length_encoding(this);
    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 6928 
 6929 // ------------------------------ Shift ---------------------------------------
 6930 
 6931 // Left and right shift count vectors are the same on x86
 6932 // (only lowest bits of xmm reg are used for count).
// Materialize a scalar shift count into an XMM register; one rule serves
// both left- and right-shift count nodes since x86 reads only the low bits.
instruct vshiftcnt(vec dst, rRegI cnt) %{
  match(Set dst (LShiftCntV cnt));
  match(Set dst (RShiftCntV cnt));
  format %{ "movdl    $dst,$cnt\t! load shift count" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $cnt$$Register);
  %}
  ins_pipe( pipe_slow );
%}
 6942 
 6943 // Byte vector shift
 6944 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6945   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6946   match(Set dst ( LShiftVB src shift));
 6947   match(Set dst ( RShiftVB src shift));
 6948   match(Set dst (URShiftVB src shift));
 6949   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6950   format %{"vector_byte_shift $dst,$src,$shift" %}
 6951   ins_encode %{
 6952     assert(UseSSE > 3, "required");
 6953     int opcode = this->ideal_Opcode();
 6954     bool sign = (opcode != Op_URShiftVB);
 6955     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6956     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6957     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6958     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6959     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6960   %}
 6961   ins_pipe( pipe_slow );
 6962 %}
 6963 
 6964 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6965   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6966             UseAVX <= 1);
 6967   match(Set dst ( LShiftVB src shift));
 6968   match(Set dst ( RShiftVB src shift));
 6969   match(Set dst (URShiftVB src shift));
 6970   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6971   format %{"vector_byte_shift $dst,$src,$shift" %}
 6972   ins_encode %{
 6973     assert(UseSSE > 3, "required");
 6974     int opcode = this->ideal_Opcode();
 6975     bool sign = (opcode != Op_URShiftVB);
 6976     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6977     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6978     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6979     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6980     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6981     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6982     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6983     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6984     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6985   %}
 6986   ins_pipe( pipe_slow );
 6987 %}
 6988 
 6989 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6990   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6991             UseAVX > 1);
 6992   match(Set dst ( LShiftVB src shift));
 6993   match(Set dst ( RShiftVB src shift));
 6994   match(Set dst (URShiftVB src shift));
 6995   effect(TEMP dst, TEMP tmp);
 6996   format %{"vector_byte_shift $dst,$src,$shift" %}
 6997   ins_encode %{
 6998     int opcode = this->ideal_Opcode();
 6999     bool sign = (opcode != Op_URShiftVB);
 7000     int vlen_enc = Assembler::AVX_256bit;
 7001     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7002     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7003     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7004     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 7005     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 7006   %}
 7007   ins_pipe( pipe_slow );
 7008 %}
 7009 
// 32-byte vector shift by a uniform count (non-variable). x86 has no byte
// shift instruction, so each 128-bit half is widened to words, shifted,
// masked back to byte range, and re-packed.
instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
  predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp);
  format %{"vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 1, "required");
    int opcode = this->ideal_Opcode();
    // Sign-extend when widening for arithmetic/left shifts; zero-extend for
    // logical right shift.
    bool sign = (opcode != Op_URShiftVB);
    int vlen_enc = Assembler::AVX_256bit;
    // Widen upper 16 bytes (into tmp) and lower 16 bytes (into dst) to words.
    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
    __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    // Clear the high byte of each word so the unsigned pack below cannot saturate.
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    // 256-bit vpackuswb works per 128-bit lane; vpermq(0xD8) restores element order.
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7034 
// 64-byte vector shift by a uniform count (non-variable), AVX-512 only.
// Same widen-shift-mask-pack scheme as the 32-byte case, done on two
// 256-bit halves widened to 512 bits each.
instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
  predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVB src shift));
  match(Set dst  (RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp1, TEMP tmp2);
  format %{"vector_byte_shift $dst,$src,$shift" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int opcode = this->ideal_Opcode();
    // Sign-extend bytes to words except for logical (unsigned) right shift.
    bool sign = (opcode != Op_URShiftVB);
    int vlen_enc = Assembler::AVX_512bit;
    // Widen upper 32 bytes into tmp1, lower 32 bytes into tmp2.
    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
    __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
    __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    // Materialize the 0x00FF word mask and broadcast it to the full 512 bits.
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
    // Pack interleaves per 128-bit lane; permute qwords to restore element order.
    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7062 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts short values into int with
// sign extension before a shift. But char vectors are fine since chars are
// unsigned values.
 7067 // Shorts/Chars vector left shift
 7068 instruct vshiftS(vec dst, vec src, vec shift) %{
 7069   predicate(!n->as_ShiftV()->is_var_shift());
 7070   match(Set dst ( LShiftVS src shift));
 7071   match(Set dst ( RShiftVS src shift));
 7072   match(Set dst (URShiftVS src shift));
 7073   effect(TEMP dst, USE src, USE shift);
 7074   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 7075   ins_encode %{
 7076     int opcode = this->ideal_Opcode();
 7077     if (UseAVX > 0) {
 7078       int vlen_enc = vector_length_encoding(this);
 7079       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7080     } else {
 7081       int vlen = Matcher::vector_length(this);
 7082       if (vlen == 2) {
 7083         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7084         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7085       } else if (vlen == 4) {
 7086         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7087         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7088       } else {
 7089         assert (vlen == 8, "sanity");
 7090         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7091         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7092       }
 7093     }
 7094   %}
 7095   ins_pipe( pipe_slow );
 7096 %}
 7097 
 7098 // Integers vector left shift
// Int vector shift by a uniform count held in an XMM register.
// AVX: three-operand form. SSE: copy src to dst, then shift in place.
instruct vshiftI(vec dst, vec src, vec shift) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      int vlen = Matcher::vector_length(this);
      if (vlen == 2) {
        // 2 ints = 8 bytes: move 64 bits.
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      } else {
        assert(vlen == 4, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
 7125 
 7126 // Integers vector left constant shift
// Int vector shift by an immediate (compile-time constant) count.
instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  match(Set dst (RShiftVI src (RShiftCntV shift)));
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vector_len = vector_length_encoding(this);
      __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
    } else {
      // SSE has no non-destructive form: copy src to dst, then shift in place.
      int vlen = Matcher::vector_length(this);
      if (vlen == 2) {
        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
      } else {
        assert(vlen == 4, "sanity");
        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
 7151 
 7152 // Longs vector shift
// Long vector left / logical-right shift by a uniform count.
// Arithmetic right shift of longs is handled by separate rules below.
instruct vshiftL(vec dst, vec src, vec shift) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVL src shift));
  match(Set dst (URShiftVL src shift));
  effect(TEMP dst, USE src, USE shift);
  format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    } else {
      // SSE: only the 2-long (128-bit) case; copy then shift in place.
      assert(Matcher::vector_length(this) == 2, "");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7172 
 7173 // Longs vector constant shift
// Long vector left / logical-right shift by an immediate constant count.
instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
  match(Set dst (LShiftVL src (LShiftCntV shift)));
  match(Set dst (URShiftVL src (RShiftCntV shift)));
  format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    if (UseAVX > 0) {
      int vector_len = vector_length_encoding(this);
      __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
    } else {
      // SSE: only the 2-long case; copy src to dst, then shift in place.
      assert(Matcher::vector_length(this) == 2, "");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7191 
 7192 // -------------------ArithmeticRightShift -----------------------------------
 7193 // Long vector arithmetic right shift
// Long vector arithmetic right shift for pre-AVX-512 targets, which lack a
// native 64-bit arithmetic shift. Emulated from a logical shift with a
// sign-bit correction: (x >>> s) XOR m, then subtract m, where
// m = sign_mask >>> s. For negative inputs this re-creates the shifted-in
// sign bits.
instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
  predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP tmp);
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
      // tmp = long sign mask shifted by the same amount.
      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
      // XOR + SUB restores sign extension for negative elements.
      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
    } else {
      assert(vlen == 4, "sanity");
      assert(UseAVX > 1, "required");
      int vlen_enc = Assembler::AVX_256bit;
      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7222 
// Long vector arithmetic right shift on AVX-512: single native evpsraq.
instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
  predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
  match(Set dst (RShiftVL src shift));
  format %{ "vshiftq $dst,$src,$shift" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7233 
 7234 // ------------------- Variable Shift -----------------------------
 7235 // Byte variable shift
// Byte vector shift with per-element (variable) counts, <= 8 elements,
// for targets without AVX-512BW: widen to words, variable-shift, pack back.
instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(Matcher::vector_length(n) <= 8 &&
            n->as_ShiftV()->is_var_shift() &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Word-sized variable shift of the widened bytes, then narrow back.
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
 7255 
// 16-byte vector variable shift without AVX-512BW: process the two 8-byte
// halves separately as words and merge with an unsigned pack.
instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_length(n) == 16 &&
            n->as_ShiftV()->is_var_shift() &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Shift lower half and get word result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);

    // Shift upper half and get word result in vtmp1
    // (vpshufd 0xE moves the high 8 bytes of src/shift into the low half).
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);

    // Merge and down convert the two word results to byte in dst
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
 7283 
// 32-byte vector variable shift without AVX-512BW: apply the 16-byte scheme
// to each 128-bit half of the source, then re-join with vinserti128.
instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
  predicate(Matcher::vector_length(n) == 32 &&
            n->as_ShiftV()->is_var_shift() &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_128bit;
    // Process lower 128 bits and get result in dst
    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Process higher 128 bits and get result in vtmp3
    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);

    // Merge the two results in dst
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
 7319 
// Byte vector variable shift, <= 32 elements, with AVX-512BW: delegated to
// the evarshiftb macro in a single step.
instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(Matcher::vector_length(n) <= 32 &&
            n->as_ShiftV()->is_var_shift() &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 7338 
// 64-byte vector variable shift with AVX-512BW: shift the low and high
// 256-bit halves separately at AVX_256bit width, then re-join.
instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_length(n) == 64 &&
            n->as_ShiftV()->is_var_shift() &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = Assembler::AVX_256bit;
    // Low 256 bits into dst.
    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
    // High 256 bits into vtmp1, then insert as the upper half of dst.
    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
 7361 
 7362 // Short variable shift
// Short vector variable shift, <= 8 elements, without AVX-512BW:
// widen shorts to ints (sign- or zero-extended per opcode), do a variable
// dword shift, mask to 16 bits, and pack back down.
instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(Matcher::vector_length(n) <= 8 &&
            n->as_ShiftV()->is_var_shift() &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    // Zero-extend only for logical right shift; otherwise sign-extend.
    bool sign = (opcode != Op_URShiftVS);
    int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
    // Shift counts are always zero-extended.
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    // Mask to the low 16 bits of each int so vpackusdw cannot saturate.
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
 7387 
// 16-short vector variable shift without AVX-512BW: widen each 128-bit half
// to ints, variable-shift, mask, pack, and fix lane order with vpermq.
instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
  predicate(Matcher::vector_length(n) == 16 &&
            n->as_ShiftV()->is_var_shift() &&
            !VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    // Zero-extend only for logical right shift; otherwise sign-extend.
    bool sign = (opcode != Op_URShiftVS);
    int vlen_enc = Assembler::AVX_256bit;
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);

    // Shift upper half, with result in dst using vtmp1 as TEMP
    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);

    // Merge lower and upper half result into dst
    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    // 256-bit pack interleaves per 128-bit lane; vpermq(0xD8) restores order.
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7423 
// Short vector variable shift with AVX-512BW: native per-element word shift.
instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
  predicate(n->as_ShiftV()->is_var_shift() &&
            VM_Version::supports_avx512bw());
  match(Set dst ( LShiftVS src shift));
  match(Set dst ( RShiftVS src shift));
  match(Set dst (URShiftVS src shift));
  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      // Without AVX-512VL the 128/256-bit EVEX forms are unavailable,
      // so force the full 512-bit encoding.
      vlen_enc = Assembler::AVX_512bit;
    }
    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7443 
// Integer variable shift: native per-element dword shift (AVX2 vpsllvd family
// via the varshiftd macro).
instruct vshiftI_var(vec dst, vec src, vec shift) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVI src shift));
  match(Set dst ( RShiftVI src shift));
  match(Set dst (URShiftVI src shift));
  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7460 
// Long variable shift (left / logical right); arithmetic right shift of
// longs is handled by the rules below.
instruct vshiftL_var(vec dst, vec src, vec shift) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst ( LShiftVL src shift));
  match(Set dst (URShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7476 
// Long variable arithmetic right shift on AVX2 (no native vpsravq):
// the varshiftq macro emulates it using vtmp as scratch.
instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
  predicate(Matcher::vector_length(n) <= 4 &&
            n->as_ShiftV()->is_var_shift() &&
            UseAVX == 2);
  match(Set dst (RShiftVL src shift));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
                 $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 7493 
 7494 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7495   predicate(n->as_ShiftV()->is_var_shift() &&
 7496             UseAVX > 2);
 7497   match(Set dst (RShiftVL src shift));
 7498   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 7499   ins_encode %{
 7500     int opcode = this->ideal_Opcode();
 7501     int vlen_enc = vector_length_encoding(this);
 7502     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7503   %}
 7504   ins_pipe( pipe_slow );
 7505 %}
 7506 
 7507 // --------------------------------- AND --------------------------------------
 7508 
// Bitwise AND of vectors, SSE form (destructive: dst &= src).
instruct vand(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise AND of vectors, AVX three-operand register form.
instruct vand_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise AND with one operand loaded directly from memory
// (only for vectors wider than 8 bytes).
instruct vand_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7541 
 7542 // --------------------------------- OR ---------------------------------------
 7543 
// Bitwise OR of vectors, SSE form (destructive: dst |= src).
instruct vor(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise OR of vectors, AVX three-operand register form.
instruct vor_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise OR with one operand loaded directly from memory
// (only for vectors wider than 8 bytes).
instruct vor_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7576 
 7577 // --------------------------------- XOR --------------------------------------
 7578 
// Bitwise XOR of vectors, SSE form (destructive: dst ^= src).
instruct vxor(vec dst, vec src) %{
  predicate(UseAVX == 0);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise XOR of vectors, AVX three-operand register form.
instruct vxor_reg(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// Bitwise XOR with one operand loaded directly from memory
// (only for vectors wider than 8 bytes).
instruct vxor_mem(vec dst, vec src, memory mem) %{
  predicate((UseAVX > 0) &&
            (Matcher::vector_length_in_bytes(n->in(1)) > 8));
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7611 
 7612 // --------------------------------- VectorCast --------------------------------------
 7613 
// Cast a byte vector to a wider element type; the B2D case without
// AVX-512VL is handled by vcastBtoD below.
instruct vcastBtoX(vec dst, vec src) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
  match(Set dst (VectorCastB2X src));
  format %{ "vector_cast_b2x $dst,$src\t!" %}
  ins_encode %{
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this);
    __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7625 
// Byte-to-double cast without AVX-512VL; uses legVec operands to restrict
// allocation to the legacy (non-extended) XMM register set.
instruct vcastBtoD(legVec dst, legVec src) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorCastB2X src));
  format %{ "vector_cast_b2x $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7636 
// Narrow a short vector (<= 8 elements) to bytes on pre-AVX512VLBW targets:
// mask each short to its low byte, then pack.
instruct castStoX(vec dst, vec src) %{
  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
            Matcher::vector_length(n->in(1)) <= 8 && // src
            Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    // Clear high bytes so the unsigned pack below cannot saturate.
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
 7651 
// Narrow a 16-element short vector to bytes on pre-AVX512VLBW targets:
// mask, split the 256-bit value into halves, and pack the halves together.
instruct vcastStoX(vec dst, vec src, vec vtmp) %{
  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
            Matcher::vector_length(n->in(1)) == 16 && // src
            Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP dst, TEMP vtmp);
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}
 7669 
// Cast a short vector to another element type using direct EVEX/AVX
// convert instructions; covers narrowing to byte (AVX-512) and all
// widening conversions.
instruct vcastStoX_evex(vec dst, vec src) %{
  predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
            (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
  match(Set dst (VectorCastS2X src));
  format %{ "vector_cast_s2x $dst,$src\t!" %}
  ins_encode %{
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int src_vlen_enc = vector_length_encoding(this, $src);
    int vlen_enc = vector_length_encoding(this);
    switch (to_elem_bt) {
      case T_BYTE:
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX-512VL only the 512-bit EVEX encoding exists.
          vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_INT:
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_FLOAT:
        // Sign-extend to int, then convert int -> float.
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;
      case T_LONG:
        __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_DOUBLE: {
        // Intermediate int vector is half the width of the double result.
        int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
        break;
      }
      default:
        ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}
 7708 
// Narrow an int vector (<= 16 bytes of source) to short or byte on
// pre-AVX-512 targets: mask away the upper bits, then pack down once
// (to short) or twice (to byte).
instruct castItoX(vec dst, vec src) %{
  predicate(UseAVX <= 2 &&
            (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
            (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this, $src);

    if (to_elem_bt == T_BYTE) {
      // Keep only the low byte of each int, then pack int->short->byte.
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    } else {
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
      // Keep only the low 16 bits of each int, then pack int->short.
      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7733 
// Narrowing int -> byte/short cast for 32-byte (256-bit) sources on
// AVX1/AVX2. The 256-bit pack instructions operate within 128-bit lanes,
// so the upper 128-bit half is extracted first and supplied as the second
// pack operand, which interleaves the halves into the correct lane order.
// A temporary vector register is needed to hold the masked source.
instruct vcastItoX(vec dst, vec src, vec vtmp) %{
  predicate(UseAVX <= 2 &&
            (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
            (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
  effect(TEMP dst, TEMP vtmp);
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this, $src);

    if (to_elem_bt == T_BYTE) {
      // Mask each int lane to its low byte, split the 256-bit value into
      // halves, and pack int->short->byte.
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    } else {
      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
      // Mask each int lane to its low 16 bits, split into halves, pack int->short.
      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7761 
// Int -> {byte, short, float, long, double} cast using AVX-512 (EVEX)
// narrowing moves, or any-AVX widening/conversion instructions when the
// destination is at least as wide as the source. Narrowing to byte/short
// uses evpmovdb/evpmovdw; widening/converting uses sign-extend or
// int->fp conversion instructions.
instruct vcastItoX_evex(vec dst, vec src) %{
  predicate(UseAVX > 2 ||
            (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
  match(Set dst (VectorCastI2X src));
  format %{ "vector_cast_i2x $dst,$src\t!" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    switch (dst_elem_bt) {
      case T_BYTE:
        // Without AVX512VL, EVEX narrowing moves are only available at 512-bit
        // length; the upper source bits are don't-cares for the narrow result.
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_SHORT:
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
        break;
      case T_FLOAT:
        // Same lane count: signed int -> float conversion.
        __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      case T_LONG:
        // Sign-extend each int lane to a long lane.
        __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      case T_DOUBLE:
        // Signed int -> double conversion (reads the low half of the source lanes).
        __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
        break;
      default:
        ShouldNotReachHere();
    }
  %}
  ins_pipe( pipe_slow );
%}
 7801 
// Narrowing long -> byte/short cast on AVX1/AVX2 (no EVEX narrowing moves).
// First compresses the low 32 bits of each long lane into contiguous int
// lanes via shuffles, then masks and packs down to the target width, same as
// the int->byte/short paths above.
instruct vcastLtoBS(vec dst, vec src) %{
  predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
            UseAVX <= 2);
  match(Set dst (VectorCastL2X src));
  format %{ "vector_cast_l2x  $dst,$src" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    int vlen = Matcher::vector_length_in_bytes(this, $src);
    BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
    AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
                                                      : ExternalAddress(vector_int_to_short_mask());
    if (vlen <= 16) {
      // <= 2 longs: shuffle imm 8 (0b00001000) gathers the low dwords of each
      // long into the bottom of the register.
      __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    } else {
      assert(vlen <= 32, "required");
      // 4 longs: in-lane dword shuffle, then a cross-lane qword permute to
      // bring both lanes' low dwords into the low 128 bits.
      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
      __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    }
    if (to_elem_bt == T_BYTE) {
      // One more pack step to go from short lanes down to byte lanes.
      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7831 
// Long -> {byte, short, int, float, double} cast using EVEX narrowing moves
// when AVX-512 is available, with AVX1/AVX2 shuffle fallbacks for the
// long->int case. The float/double paths require AVX512DQ (vcvtqq2ps/pd).
instruct vcastLtoX_evex(vec dst, vec src) %{
  predicate(UseAVX > 2 ||
            (Matcher::vector_element_basic_type(n) == T_INT ||
             Matcher::vector_element_basic_type(n) == T_FLOAT ||
             Matcher::vector_element_basic_type(n) == T_DOUBLE));
  match(Set dst (VectorCastL2X src));
  format %{ "vector_cast_l2x  $dst,$src\t!" %}
  ins_encode %{
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int vlen = Matcher::vector_length_in_bytes(this, $src);
    int vlen_enc = vector_length_encoding(this, $src);
    switch (to_elem_bt) {
      case T_BYTE:
        // Without AVX512VL the EVEX narrowing move only exists at 512-bit length.
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_SHORT:
        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_INT:
        if (vlen == 8) {
          // Single long: its low dword is already in place; just move if needed.
          if ($dst$$XMMRegister != $src$$XMMRegister) {
            __ movflt($dst$$XMMRegister, $src$$XMMRegister);
          }
        } else if (vlen == 16) {
          // Two longs: gather their low dwords with a shuffle (imm 8).
          __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
        } else if (vlen == 32) {
          if (UseAVX > 2) {
            if (!VM_Version::supports_avx512vl()) {
              vlen_enc = Assembler::AVX_512bit;
            }
            __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
          } else {
            // AVX2 fallback: in-lane shuffle then cross-lane permute.
            __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
            __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
          }
        } else { // vlen == 64
          __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        }
        break;
      case T_FLOAT:
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
        __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;
      case T_DOUBLE:
        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
        __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
        break;

      default: assert(false, "%s", type2name(to_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}
 7891 
// Widening float -> double cast: a single packed conversion instruction.
// The length encoding is taken from the (wider) destination vector.
instruct vcastFtoD_reg(vec dst, vec src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorCastF2X src));
  format %{ "vector_cast_f2d  $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7902 
 7903 
// Float -> integral (element size <= 4 bytes) cast on pre-AVX512VL hardware
// for sub-512-bit vectors. Delegates to a macro-assembler helper that handles
// NaN and out-of-range inputs per Java semantics, using four vector temps.
// Kills flags (KILL cr).
instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
  match(Set dst (VectorCastF2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
  format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register needed to load greater than
    // 32 bit addresses for register indirect addressing mode since stub constants
    // are part of code cache and there is a cap of 2G on ReservedCodeCacheSize currently.
    // However, targets are free to increase this limit, but having a large code cache size
    // greater than 2G looks unreasonable in practical scenario, on the hind side with given
    // cap we save a temporary register allocation which in limiting case can prevent
    // spilling in high register pressure blocks.
    __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                          $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
                          ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7926 
// Float -> integral cast on AVX-512 (AVX512VL, or 512-bit vectors).
// Uses EVEX helpers with opmask temporaries. The float->long path uses
// the destination's (wider) length encoding and the double signflip
// constant; all other target types use the source's length encoding.
// Kills flags (KILL cr).
instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
            is_integral_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (VectorCastF2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
  ins_encode %{
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    if (to_elem_bt == T_LONG) {
      // Widening: size the operation by the destination vector.
      int vlen_enc = vector_length_encoding(this);
      __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                             $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
                             ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
    } else {
      // Same-size or narrowing: size the operation by the source vector.
      int vlen_enc = vector_length_encoding(this, $src);
      __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                             $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
                             ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 7949 
// Narrowing double -> float cast: a single packed conversion instruction.
// The length encoding is taken from the (wider) source vector.
instruct vcastDtoF_reg(vec dst, vec src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorCastD2X src));
  format %{ "vector_cast_d2x  $dst,$src\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7960 
// Double -> integral cast on pre-AVX512VL hardware for sub-512-bit vectors.
// Delegates to a macro-assembler helper (five vector temps) that handles NaN
// and out-of-range inputs per Java semantics. Kills flags (KILL cr).
instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
            is_integral_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (VectorCastD2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
  format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                          $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
                          ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7976 
// Double -> integral cast on AVX-512 (AVX512VL, or 512-bit vectors).
// Uses an EVEX helper with opmask temporaries; the signflip constant differs
// depending on whether AVX512DQ conversion instructions are available.
// Kills flags (KILL cr).
instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
            is_integral_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (VectorCastD2X src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
                              ExternalAddress(vector_float_signflip());
    __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                           $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 7993 
// Unsigned (zero-extending) widening cast from byte/short/int vector
// elements; all source/destination type combinations are dispatched inside
// the vector_unsigned_cast macro-assembler helper.
instruct vucast(vec dst, vec src) %{
  match(Set dst (VectorUCastB2X src));
  match(Set dst (VectorUCastS2X src));
  match(Set dst (VectorUCastI2X src));
  format %{ "vector_ucast $dst,$src\t!" %}
  ins_encode %{
    assert(UseAVX > 0, "required");

    BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
    BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    int vlen_enc = vector_length_encoding(this);
    __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
  %}
  ins_pipe( pipe_slow );
%}
 8009 
 8010 #ifdef _LP64
// Math.round on a float vector (RoundVF -> int lanes) for pre-AVX512VL,
// sub-512-bit vectors. The helper temporarily installs a custom MXCSR value
// loaded from the constant table (0x3FBF on E-core-optimized targets,
// 0x3F80 otherwise) and uses a GPR plus four vector temps. Kills flags.
instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() &&
            Matcher::vector_length_in_bytes(n) < 64 &&
            Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
  format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
    __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
                              ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
                              $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8027 
// Math.round on a float vector (RoundVF -> int lanes) for AVX-512 targets
// (AVX512VL, or 512-bit vectors). Same custom-MXCSR approach as the AVX
// variant, but uses two opmask temporaries instead of extra vector temps.
// Kills flags.
instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  predicate((VM_Version::supports_avx512vl() ||
             Matcher::vector_length_in_bytes(n) == 64) &&
             Matcher::vector_element_basic_type(n) == T_INT);
  match(Set dst (RoundVF src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
    __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8044 
// Math.round on a double vector (RoundVD -> long lanes). EVEX-only path
// using the double sign-flip stub constant and the same custom-MXCSR
// technique as the float rounding rules above. Kills flags.
instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (RoundVD src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
  format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
    __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
                                ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8059 
 8060 #endif // _LP64
 8061 
 8062 // --------------------------------- VectorMaskCmp --------------------------------------
 8063 
// Float/double vector compare producing a vector mask result (not an opmask),
// for 8..32-byte operands. Emits vcmpps/vcmppd, which write all-ones or
// all-zeros per lane.
instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    // Translate the BoolTest condition constant into an FP comparison predicate.
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    } else {
      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8082 
// 512-bit float/double vector compare that must deliver a vector (all-bits
// per lane) result rather than an opmask: compare into a temporary opmask,
// then expand it by a masked load of the all-bits-set constant into dst
// (merge=false zeroes the unselected lanes).
instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
            n->bottom_type()->isa_vectmask() == nullptr &&
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP ktmp);
  format %{ "vector_compare $dst,$src1,$src2,$cond" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    KRegister mask = k0; // The comparison itself is not being masked.
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
    } else {
      __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
      __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8104 
// Float/double vector compare producing an opmask register result directly
// (node's bottom type is a vectmask). One evcmpps/evcmppd instruction.
instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
  predicate(n->bottom_type()->isa_vectmask() &&
            is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
    KRegister mask = k0; // The comparison itself is not being masked.
    if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
      __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    } else {
      __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8123 
// Signed integral vector compare for conditions that map directly onto a
// single SSE/AVX compare instruction (eq/lt/gt) — no negation step needed,
// so no temporary register is required (xnoreg is passed).
instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
            !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
            (n->in(2)->get_int() == BoolTest::eq ||
             n->in(2)->get_int() == BoolTest::lt ||
             n->in(2)->get_int() == BoolTest::gt)); // cond
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8143 
// Signed integral vector compare for conditions with no direct instruction
// (ne/le/ge): the helper computes the complementary compare and negates the
// result, which needs a vector temporary.
instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
            !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
            (n->in(2)->get_int() == BoolTest::ne ||
             n->in(2)->get_int() == BoolTest::le ||
             n->in(2)->get_int() == BoolTest::ge)); // cond
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP dst, TEMP xtmp);
  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8164 
// Unsigned integral vector compare on hardware without unsigned compare
// instructions: flip the sign bit of both operands (xor with a broadcast
// "high bit set" constant), which maps unsigned ordering onto signed
// ordering, then use the signed compare helper.
instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr &&
            Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
            Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP dst, TEMP xtmp);
  format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
  ins_encode %{
    InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));

    // Broadcast the 64-bit flip constant across the vector.
    if (vlen_enc == Assembler::AVX_128bit) {
      __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
    } else {
      __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
    }
    // Flip the sign bit of each lane of both operands, then compare signed.
    __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
    __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
    __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8191 
// 512-bit int/long vector compare that must deliver a vector (all-bits per
// lane) result: compare into a temporary opmask via evpcmpd/evpcmpq (signed
// or unsigned selected by the predicate's signedness flag), then expand the
// opmask by a masked load of the all-bits-set constant.
instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
  predicate((n->bottom_type()->isa_vectmask() == nullptr &&
             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(TEMP ktmp);
  format %{ "vector_compare $dst,$src1,$src2,$cond" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
    KRegister mask = k0; // The comparison itself is not being masked.
    bool merge = false;  // Zero lanes where the opmask bit is clear.
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    switch (src1_elem_bt) {
      case T_INT: {
        __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
        break;
      }
      case T_LONG: {
        __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}
 8225 
 8226 
// Integral vector compare producing an opmask register result directly
// (node's bottom type is a vectmask). Dispatches on element type to the
// matching EVEX compare; signed vs. unsigned is selected by the predicate's
// signedness flag.
instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
  predicate(n->bottom_type()->isa_vectmask() &&
            is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");

    int vlen_enc = vector_length_encoding(this, $src1);
    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
    bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Comparison result goes straight into the destination opmask; k0 means
    // the compare itself is unmasked.
    switch (src1_elem_bt) {
      case T_BYTE: {
        __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt));
    }
  %}
  ins_pipe( pipe_slow );
%}
 8264 
 8265 // Extract
 8266 
// Extract a byte/short/int lane from a <= 16-byte vector into a GPR at a
// constant index. (ExtractB is 64-bit only.)
instruct extractI(rRegI dst, legVec src, immU8 idx) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
  match(Set dst (ExtractI src idx));
  match(Set dst (ExtractS src idx));
#ifdef _LP64
  match(Set dst (ExtractB src idx));
#endif
  format %{ "extractI $dst,$src,$idx\t!" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
    __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8283 
// Extract a byte/short/int lane from a 32- or 64-byte vector: first isolate
// the 128-bit lane that holds the element (get_lane, using a vector temp),
// then extract from that lane into the GPR.
instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
            Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
  match(Set dst (ExtractI src idx));
  match(Set dst (ExtractS src idx));
#ifdef _LP64
  match(Set dst (ExtractB src idx));
#endif
  effect(TEMP vtmp);
  format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
    XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8303 
 8304 #ifdef _LP64
// Extract a long lane from a <= 2-element vector into a GPR (64-bit only;
// requires SSE4 for the underlying pextrq).
instruct extractL(rRegL dst, legVec src, immU8 idx) %{
  predicate(Matcher::vector_length(n->in(1)) <= 2); // src
  match(Set dst (ExtractL src idx));
  format %{ "extractL $dst,$src,$idx\t!" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8317 
// Extract a long lane from a 4- or 8-element vector: isolate the containing
// 128-bit lane first (needs a vector temp), then extract into the GPR.
instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n->in(1)) == 4 || // src
            Matcher::vector_length(n->in(1)) == 8);  // src
  match(Set dst (ExtractL src idx));
  effect(TEMP vtmp);
  format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8332 #endif
 8333 
// Extract a float element from a vector of at most 4 floats. Unlike the
// int/long variants the destination is an XMM register; $dst is also a TEMP
// because get_elem may shuffle into it before the final value is produced.
instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n->in(1)) <= 4);
  match(Set dst (ExtractF src idx));
  effect(TEMP dst, TEMP vtmp);
  format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8346 
// Extract a float element from an 8- or 16-float vector (256/512-bit):
// isolate the containing 128-bit lane in $vtmp, then extract from that lane.
instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
            Matcher::vector_length(n->in(1)/*src*/) == 16);
  match(Set dst (ExtractF src idx));
  effect(TEMP vtmp);
  format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8361 
// Extract a double element from a 2-double (128-bit) vector directly.
instruct extractD(legRegD dst, legVec src, immU8 idx) %{
  predicate(Matcher::vector_length(n->in(1)) == 2); // src
  match(Set dst (ExtractD src idx));
  format %{ "extractD $dst,$src,$idx\t!" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8373 
// Extract a double element from a 4- or 8-double vector (256/512-bit):
// isolate the containing 128-bit lane in $vtmp, then extract from that lane.
instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
  predicate(Matcher::vector_length(n->in(1)) == 4 || // src
            Matcher::vector_length(n->in(1)) == 8);  // src
  match(Set dst (ExtractD src idx));
  effect(TEMP vtmp);
  format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");

    XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
    __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}
 8388 
 8389 // --------------------------------- Vector Blend --------------------------------------
 8390 
// Pre-AVX vector blend. SSE4.1 PBLENDVB takes its selector implicitly in
// xmm0, so $tmp is pinned to xmm0 (rxmm0) and the mask is copied there
// first unless the allocator already placed it in xmm0.
instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
  predicate(UseAVX == 0);
  match(Set dst (VectorBlend (Binary dst src) mask));
  format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
  effect(TEMP tmp);
  ins_encode %{
    assert(UseSSE >= 4, "required");

    if ($mask$$XMMRegister != $tmp$$XMMRegister) {
      __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
    }
    __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
  %}
  ins_pipe( pipe_slow );
%}
 8406 
// AVX blend for integral element types (<= 256-bit, non-opmask selector),
// using the explicit three-operand VPBLENDVB. Disabled on E-core-optimized
// targets, where the vblendvp emulation below is preferred.
instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
            Matcher::vector_length_in_bytes(n) <= 32 &&
            is_integral_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8420 
// AVX blend for floating-point element types (<= 256-bit, non-opmask
// selector) via VBLENDVPS. Counterpart of vblendvpI above.
instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
  predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
            Matcher::vector_length_in_bytes(n) <= 32 &&
            !is_integral_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8434 
// Blend emulation chosen when EnableX86ECoreOpts is set (any element type,
// <= 256-bit, non-opmask selector). Computes the select bitwise:
//   dst = (~mask & src1) | (mask & src2)
// which assumes mask lanes are all-ones or all-zeros.
instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
  predicate(UseAVX > 0 && EnableX86ECoreOpts &&
            n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
            Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
  effect(TEMP vtmp, TEMP dst);
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);  // vtmp = ~mask & src1
    __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);  // dst  =  mask & src2
    __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);  // combine
  %}
  ins_pipe( pipe_slow );
%}
 8450 
 8451 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8452   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8453             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8454   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8455   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8456   effect(TEMP ktmp);
 8457   ins_encode %{
 8458      int vlen_enc = Assembler::AVX_512bit;
 8459      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8460     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8461     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8462   %}
 8463   ins_pipe( pipe_slow );
 8464 %}
 8465 
 8466 
 8467 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8468   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8469             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8470              VM_Version::supports_avx512bw()));
 8471   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8472   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 8473   ins_encode %{
 8474     int vlen_enc = vector_length_encoding(this);
 8475     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8476     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8477   %}
 8478   ins_pipe( pipe_slow );
 8479 %}
 8480 
 8481 // --------------------------------- ABS --------------------------------------
 8482 // a = |a|
// Byte-wise absolute value: PABSB for <= 16 elements (128-bit), otherwise
// the VEX/EVEX form sized by the vector length encoding.
instruct vabsB_reg(vec dst, vec src) %{
  match(Set dst (AbsVB  src));
  format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 16) {
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8497 
// Short-wise absolute value: PABSW for <= 8 elements (128-bit), otherwise
// the VEX/EVEX form sized by the vector length encoding.
instruct vabsS_reg(vec dst, vec src) %{
  match(Set dst (AbsVS  src));
  format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 8) {
      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8512 
// Int-wise absolute value: PABSD for <= 4 elements (128-bit), otherwise
// the VEX/EVEX form sized by the vector length encoding.
instruct vabsI_reg(vec dst, vec src) %{
  match(Set dst (AbsVI  src));
  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
  ins_encode %{
    uint vlen = Matcher::vector_length(this);
    if (vlen <= 4) {
      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8527 
// Long-wise absolute value. EVPABSQ is AVX-512-only, hence the UseAVX > 2
// assert; without AVX512VL the sub-512-bit encodings are unavailable, so the
// operation is widened to the full 512-bit form (upper lanes are don't-care).
instruct vabsL_reg(vec dst, vec src) %{
  match(Set dst (AbsVL  src));
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8541 
 8542 // --------------------------------- ABSNEG --------------------------------------
 8543 
// Combined abs/neg for packed floats (the opcode distinguishes AbsVF from
// NegVF inside the macro assembler, which applies a sign-bit mask). The
// 4-element case is excluded: it is handled by the 1-operand vabsneg4F rule.
instruct vabsnegF(vec dst, vec src) %{
  predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
  match(Set dst (AbsVF src));
  match(Set dst (NegVF src));
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
    } else {
      // Only 8- and 16-element shapes remain (4 is matched elsewhere).
      assert(vlen == 8 || vlen == 16, "required");
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8563 
// Abs/neg for exactly 4 packed floats, written as an in-place (dst == src)
// single-operand form.
instruct vabsneg4F(vec dst) %{
  predicate(Matcher::vector_length(n) == 4);
  match(Set dst (AbsVF dst));
  match(Set dst (NegVF dst));
  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8576 
// Combined abs/neg for packed doubles; 2-element vectors use the legacy
// SSE2 form, larger vectors the VEX/EVEX form sized by vector length.
instruct vabsnegD(vec dst, vec src) %{
  match(Set dst (AbsVD  src));
  match(Set dst (NegVD  src));
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    uint vlen = Matcher::vector_length(this);
    if (vlen == 2) {
      assert(UseSSE >= 2, "required");
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vlen_enc = vector_length_encoding(this);
      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8594 
 8595 //------------------------------------- VectorTest --------------------------------------------
 8596 
 8597 #ifdef _LP64
// VectorTest (sets condition flags) for sub-16-byte vectors; the macro
// assembler needs a scratch vector register for this narrow case.
instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP vtmp);
  format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $src1);
    int vlen = Matcher::vector_length_in_bytes(this, $src1);
    __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 8610 
// VectorTest for vectors of 16 bytes or more; no scratch register needed
// (xnoreg is passed in place of the temp).
instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
  match(Set cr (VectorTest src1 src2));
  format %{ "vptest_ge16  $src1, $src2\n\t" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $src1);
    int vlen = Matcher::vector_length_in_bytes(this, $src1);
    __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
  %}
  ins_pipe( pipe_slow );
%}
 8622 
// "All lanes true" test of an opmask with <= 8 live bits (or exactly 8
// without AVX512DQ's 8-bit kortest). Copies the mask into a GPR, clears
// bits beyond the live lane count, and compares against the all-ones value
// so ZF reflects the alltrue (BoolTest::overflow) condition.
instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
  predicate((Matcher::vector_length(n->in(1)) < 8 ||
             (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP tmp);
  format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this, $src1);
    __ kmovwl($tmp$$Register, $src1$$KRegister);
    __ andl($tmp$$Register, (1 << masklen) - 1);  // keep only live mask bits
    __ cmpl($tmp$$Register, (1 << masklen) - 1);  // equal iff all live bits set
  %}
  ins_pipe( pipe_slow );
%}
 8638 
// "Any lane true" test of an opmask with <= 8 live bits. The ANDL both
// masks off dead bits and sets ZF, so ZF==0 means at least one live bit is
// set (BoolTest::ne condition).
instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
  predicate((Matcher::vector_length(n->in(1)) < 8 ||
             (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP tmp);
  format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this, $src1);
    __ kmovwl($tmp$$Register, $src1$$KRegister);
    __ andl($tmp$$Register, (1 << masklen) - 1);
  %}
  ins_pipe( pipe_slow );
%}
 8653 
// Opmask test for >= 16 live bits (or 8 with AVX512DQ). KORTEST of src1
// with itself sets ZF/CF from the mask contents; note $src2 is not read by
// the encoding — presumably the matched trees always have src2 == src1 or
// encode the condition via flags only (NOTE(review): confirm against users
// of VectorTest).
instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
  predicate(Matcher::vector_length(n->in(1)) >= 16 ||
            (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
  match(Set cr (VectorTest src1 src2));
  format %{ "ktest_ge8  $src1, $src2\n\t" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this, $src1);
    __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8665 #endif
 8666 
 8667 //------------------------------------- LoadMask --------------------------------------------
 8668 
// VectorLoadMask producing a vector-register mask (no opmask type) on
// targets without AVX512VL+BW. Delegates to the macro assembler; the final
// 'true' argument presumably selects the sign-extending/"one_hot" variant —
// see load_vector_mask in the macro assembler for its exact meaning.
instruct loadMask(legVec dst, legVec src) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
  match(Set dst (VectorLoadMask src));
  effect(TEMP dst);
  format %{ "vector_loadmask_byte $dst, $src\n\t" %}
  ins_encode %{
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
  %}
  ins_pipe( pipe_slow );
%}
 8681 
// VectorLoadMask into an opmask register on AVX-512 targets lacking VL+BW:
// the conversion is done at the fixed 512-bit width, using $xtmp as scratch.
instruct loadMask64(kReg dst, vec src, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
  match(Set dst (VectorLoadMask src));
  effect(TEMP xtmp);
  format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
  ins_encode %{
    __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
                        true, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}
 8693 
// VectorLoadMask into an opmask register with full AVX512VL+BW support:
// sized by the input's own vector length encoding ('false' flag variant of
// load_vector_mask — see macro assembler for its meaning).
instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
  match(Set dst (VectorLoadMask src));
  effect(TEMP xtmp);
  format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(in(1));
    __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
                        false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8706 
 8707 //------------------------------------- StoreMask --------------------------------------------
 8708 
// VectorStoreMask with byte elements (size == 1): normalizing the mask is
// just a byte-wise abs (turns 0xFF lanes into 0x01).
instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
  predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  ins_encode %{
    int vlen = Matcher::vector_length(this);
    if (vlen <= 16 && UseAVX <= 2) {
      assert(UseSSE >= 3, "required");
      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      assert(UseAVX > 0, "required");
      int src_vlen_enc = vector_length_encoding(this, $src);
      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8726 
// VectorStoreMask with short elements (size == 2): narrow each 16-bit mask
// lane to one byte of 0/1. SSE path: abs then pack against zero; AVX path:
// fold the upper 128-bit half down with vpacksswb, then byte-abs.
instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
  predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorStoreMask src size));
  effect(TEMP_DEF dst, TEMP xtmp);
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    int vlen = Matcher::vector_length(this);
    if (vlen <= 8) {
      assert(UseSSE >= 3, "required");
      __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);     // zero for packing
      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);      // 0xFFFF -> 0x0001
      __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);  // words -> bytes
    } else {
      assert(UseAVX > 0, "required");
      __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
      __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8749 
// VectorStoreMask with int elements (size == 4) on pre-AVX-512: narrow each
// 32-bit mask lane to one 0/1 byte via successive packs.
instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
  predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  effect(TEMP_DEF dst, TEMP xtmp);
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    int vlen = Matcher::vector_length(this);
    if (vlen <= 4) {
      assert(UseSSE >= 3, "required");
      __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);     // zero for packing
      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);      // 0xFFFFFFFF -> 1
      __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);  // dwords -> words
      __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);  // words -> bytes
    } else {
      assert(UseAVX > 0, "required");
      __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
      // Fold the upper 128-bit half into the lower before packing down.
      __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
      __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
      __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
      __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
  %}
  ins_pipe( pipe_slow );
%}
 8775 
// VectorStoreMask with long elements (size == 8), 2 lanes, pre-AVX-512:
// pshufd compresses the two 64-bit lanes' low dwords together, then the
// abs+pack sequence narrows them to 0/1 bytes.
instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
  predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
  match(Set dst (VectorStoreMask src size));
  effect(TEMP_DEF dst, TEMP xtmp);
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  ins_encode %{
    assert(UseSSE >= 3, "required");
    __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);  // gather low dwords
    __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
    __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
    __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8791 
// VectorStoreMask with long elements (size == 8), 4 lanes (256-bit source),
// pre-AVX-512: shuffle/blend the four lanes' low dwords into one 128-bit
// register, then pack and abs down to 0/1 bytes.
instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
  predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
  effect(TEMP_DEF dst, TEMP vtmp);
  ins_encode %{
    int vlen_enc = Assembler::AVX_128bit;
    // Collect the low dword of each 64-bit lane within each 128-bit half.
    __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
    // Merge the upper half's results into the lower 128 bits.
    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
    __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8809 
// AVX-512 VectorStoreMask for int elements, vector-register mask input:
// truncate dwords to bytes with EVPMOVDB, then normalize to 0/1 via byte
// abs. Without AVX512VL the source must be encoded at 512-bit width.
instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  ins_encode %{
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      src_vlen_enc = Assembler::AVX_512bit;
    }
    __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8825 
// AVX-512 VectorStoreMask for long elements, vector-register mask input:
// same shape as the 4-byte variant above but truncating qwords with
// EVPMOVQB.
instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorStoreMask src size));
  format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
  ins_encode %{
    int src_vlen_enc = vector_length_encoding(this, $src);
    int dst_vlen_enc = vector_length_encoding(this);
    if (!VM_Version::supports_avx512vl()) {
      src_vlen_enc = Assembler::AVX_512bit;
    }
    __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8841 
// VectorStoreMask from an opmask register on targets without AVX512VL+BW:
// expand the opmask into an int vector via a masked load of per-lane
// constants, then truncate dwords to bytes. Only the 64-byte shape occurs
// here, as the assert documents.
instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
  match(Set dst (VectorStoreMask mask size));
  effect(TEMP_DEF dst);
  format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
  ins_encode %{
    assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
    __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
                 false, Assembler::AVX_512bit, noreg);
    __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
  %}
  ins_pipe( pipe_slow );
%}
 8855 
// VectorStoreMask from an opmask register with AVX512VL+BW: EVPMOVM2B
// expands mask bits to all-ones/all-zeros bytes, and the byte abs
// normalizes 0xFF to 0x01.
instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
  match(Set dst (VectorStoreMask mask size));
  effect(TEMP_DEF dst);
  format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
  ins_encode %{
    int dst_vlen_enc = vector_length_encoding(this);
    __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8868 
// Opmask-to-opmask cast: a pure type-system reinterpretation, so the
// encoding is empty and the cost is zero.
instruct vmaskcast_evex(kReg dst) %{
  match(Set dst (VectorMaskCast dst));
  ins_cost(0);
  format %{ "vector_mask_cast $dst" %}
  ins_encode %{
    // empty
  %}
  ins_pipe(empty);
%}
 8878 
// Vector mask cast where source and destination have the same byte length:
// no bits move, so this is a free reinterpretation.
instruct vmaskcast(vec dst) %{
  predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast dst));
  ins_cost(0);
  format %{ "vector_mask_cast $dst" %}
  ins_encode %{
    // empty
  %}
  ins_pipe(empty);
%}
 8889 
// Vector mask cast between different byte lengths: the lanes must be
// widened/narrowed, delegated to the macro assembler's vector_mask_cast.
instruct vmaskcast_avx(vec dst, vec src) %{
  predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast src));
  format %{ "vector_mask_cast $dst, $src" %}
  ins_encode %{
    int vlen = Matcher::vector_length(this);
    BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
    BasicType dst_bt = Matcher::vector_element_basic_type(this);
    __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
  %}
  ins_pipe(pipe_slow);
%}
 8902 
 8903 //-------------------------------- Load Iota Indices ----------------------------------
 8904 
// Load the iota constant (0, 1, 2, ...) for this vector shape from the
// constant area; only matched for the zero-offset form (immI_0 src).
instruct loadIotaIndices(vec dst, immI_0 src) %{
  match(Set dst (VectorLoadConst src));
  format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
     BasicType bt = Matcher::vector_element_basic_type(this);
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
  %}
  ins_pipe( pipe_slow );
%}
 8915 
 8916 #ifdef _LP64
// PopulateIndex with an int start value: dst[i] = src1 + i. Broadcast the
// scalar, load the iota constant, add. Only step == 1 is supported, as the
// assert on $src2 documents.
instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
  match(Set dst (PopulateIndex src1 src2));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
  ins_encode %{
     assert($src2$$constant == 1, "required");
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
     int vlen_enc = vector_length_encoding(this);
     BasicType elem_bt = Matcher::vector_element_basic_type(this);
     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8932 
// PopulateIndex with a long start value: identical sequence to the int
// variant above; only the register class of $src1 differs.
instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
  match(Set dst (PopulateIndex src1 src2));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
  ins_encode %{
     assert($src2$$constant == 1, "required");
     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
     int vlen_enc = vector_length_encoding(this);
     BasicType elem_bt = Matcher::vector_element_basic_type(this);
     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 8948 #endif
 8949 //-------------------------------- Rearrange ----------------------------------
 8950 
 8951 // LoadShuffle/Rearrange for Byte
// Byte rearrange for < 32 elements: a single in-place PSHUFB, since all
// elements live in one 128-bit lane.
instruct rearrangeB(vec dst, vec shuffle) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            Matcher::vector_length(n) < 32);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 8963 
// Byte rearrange for 32 elements without AVX512_VBMI: VPSHUFB cannot cross
// 128-bit lanes, so shuffle both the original and a lane-swapped copy, then
// blend depending on which lane each shuffle index points into.
instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");
    // Swap src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle swapped src to get entries from other 128 bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle original src to get entries from self 128 bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
  %}
  ins_pipe( pipe_slow );
%}
 8985 
 8986 
// Byte rearrange for > 32 elements without AVX512_VBMI: cross-lane byte
// permutation must be emulated; the whole sequence lives in the macro
// assembler's rearrange_bytes, which needs three vector temps, an opmask
// temp and a GPR temp.
instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
                       $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
                       $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9001 
// Byte rearrange for >= 32 elements with AVX512_VBMI: VPERMB handles
// cross-lane byte permutation natively in one instruction.
instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9013 
 9014 // LoadShuffle/Rearrange for Short
 9015 
// Load a short-element shuffle on targets without AVX512BW: expands each
// 16-bit element index into a pair of byte indices usable by pshufb/vpshufb.
instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
            !VM_Version::supports_avx512bw());
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
  ins_encode %{
    // Create a byte shuffle mask from short shuffle mask
    // only byte shuffle instruction available on these platforms
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    if (UseAVX == 0) {
      assert(vlen_in_bytes <= 16, "required");
      // Multiply each shuffle by two to get byte index
      __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
      __ psllw($vtmp$$XMMRegister, 1);

      // Duplicate to create 2 copies of byte index
      __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
      __ psllw($dst$$XMMRegister, 8);
      __ por($dst$$XMMRegister, $vtmp$$XMMRegister);

      // Add one to get alternate byte index
      // (vector_short_shufflemask supplies the per-byte 0/1 increments)
      __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
      __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
    } else {
      // AVX1 handles only <= 16-byte vectors here; AVX2+ covers the rest.
      assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
      int vlen_enc = vector_length_encoding(this);
      // Multiply each shuffle by two to get byte index
      __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);

      // Duplicate to create 2 copies of byte index
      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
      __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);

      // Add one to get alternate byte index
      __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
    }
  %}
  ins_pipe( pipe_slow );
%}
 9056 
// In-place short rearrange for <= 8 elements without AVX512BW; the shuffle
// operand is expected to hold byte indices (see loadShuffleS above).
instruct rearrangeS(vec dst, vec shuffle) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
            Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 9068 
// 16-element short rearrange on AVX2 without AVX512BW. vpshufb cannot cross
// 128-bit lanes, so shuffle both the original and a lane-swapped copy, then
// blend the two results based on which lane each index targets.
instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
            Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");
    // Swap src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle swapped src to get entries from other 128 bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle original src to get entries from self 128 bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
    // Perform the blend
    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
  %}
  ins_pipe( pipe_slow );
%}
 9090 
// Short rearrange on AVX512BW targets via a single word permute (vpermw).
instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
            VM_Version::supports_avx512bw());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    // Without AVX512VL the EVEX encoding is only valid at 512-bit width,
    // so widen the operation; upper lanes are don't-care for the result.
    if (!VM_Version::supports_avx512vl()) {
      vlen_enc = Assembler::AVX_512bit;
    }
    __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9105 
 9106 // LoadShuffle/Rearrange for Integer and Float
 9107 
// Load an int/float shuffle on pure SSE (UseAVX == 0): expands each 32-bit
// element index into four consecutive byte indices usable by pshufb.
instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            Matcher::vector_length(n) == 4 && UseAVX == 0);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");

    // Create a byte shuffle mask from int shuffle mask
    // only byte shuffle instruction available on these platforms

    // Duplicate and multiply each shuffle by 4
    // (pshuflw/pshufhw 0xA0 replicates each int index into both word halves)
    __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
    __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
    __ psllw($vtmp$$XMMRegister, 2);

    // Duplicate again to create 4 copies of byte index
    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
    __ psllw($dst$$XMMRegister, 8);
    __ por($vtmp$$XMMRegister, $dst$$XMMRegister);

    // Add 3,2,1,0 to get alternate byte index
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 9137 
// In-place int/float rearrange on pure SSE; the shuffle operand holds
// byte indices (see loadShuffleI above).
instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX == 0);
  match(Set dst (VectorRearrange dst shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
  ins_encode %{
    assert(UseSSE >= 4, "required");
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 9149 
// Int/float rearrange on any AVX level; element-type-aware dispatch is
// handled inside MacroAssembler::vector_rearrange_int_float.
instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            UseAVX > 0);
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9162 
 9163 // LoadShuffle/Rearrange for Long and Double
 9164 
// Load a long/double shuffle on AVX2-class targets without AVX512VL:
// expands each 64-bit element index into a pair of 32-bit (double word)
// indices usable by vpermd (see rearrangeL below).
instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP vtmp);
  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from long shuffle mask
    // only double word shuffle instruction available on these platforms

    // Multiply each shuffle by two to get double word index
    __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);

    // Duplicate each double word shuffle
    __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
    __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);

    // Add one to get alternate double word index
    __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
  %}
  ins_pipe( pipe_slow );
%}
 9190 
// Long/double rearrange on AVX2-class targets without AVX512VL, using a
// double-word permute on the expanded indices produced by loadShuffleL.
instruct rearrangeL(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX >= 2, "required");

    int vlen_enc = vector_length_encoding(this);
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9204 
// Long/double rearrange via native quad-word permute (vpermq) on AVX-512
// targets (8-element vectors, or any length with AVX512VL).
instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
  predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
            (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
  match(Set dst (VectorRearrange src shuffle));
  format %{ "vector_rearrange $dst, $shuffle, $src" %}
  ins_encode %{
    assert(UseAVX > 2, "required");

    int vlen_enc = vector_length_encoding(this);
    // Three-operand vpermq is not available at 128-bit width; widen to
    // 256-bit (upper lanes are don't-care for the 128-bit result).
    if (vlen_enc == Assembler::AVX_128bit) {
      vlen_enc = Assembler::AVX_256bit;
    }
    __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9221 
 9222 // --------------------------------- FMA --------------------------------------
 9223 // a * b + c
 9224 
// Packed-float fused multiply-add, all-register form: c = a * b + c.
instruct vfmaF_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9236 
// Packed-float fused multiply-add with the multiplier loaded from memory:
// c = a * [b] + c. Restricted to vectors wider than 8 bytes.
instruct vfmaF_mem(vec a, memory b, vec c) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9249 
// Packed-double fused multiply-add, all-register form: c = a * b + c.
instruct vfmaD_reg(vec a, vec b, vec c) %{
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9261 
// Packed-double fused multiply-add with the multiplier loaded from memory:
// c = a * [b] + c. Restricted to vectors wider than 8 bytes.
instruct vfmaD_mem(vec a, memory b, vec c) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
  ins_cost(150);
  ins_encode %{
    assert(UseFMA, "not enabled");
    int vlen_enc = vector_length_encoding(this);
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9274 
 9275 // --------------------------------- Vector Multiply Add --------------------------------------
 9276 
// Multiply-add of short pairs into ints (pmaddwd), destructive SSE form.
instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
  predicate(UseAVX == 0);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 9286 
// Multiply-add of short pairs into ints (vpmaddwd), non-destructive AVX form.
instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9297 
 9298 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9299 
// Fuses (MulAddVS2VI + AddVI) into a single VNNI dot-product instruction
// (evpdpwssd) when AVX512_VNNI is available; low ins_cost favors this
// pattern over the separate multiply-add and add rules.
instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
  predicate(VM_Version::supports_avx512_vnni());
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
  ins_encode %{
    assert(UseAVX > 2, "required");
    int vlen_enc = vector_length_encoding(this);
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
  ins_cost(10);
%}
 9312 
 9313 // --------------------------------- PopCount --------------------------------------
 9314 
 9315 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9316   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9317   match(Set dst (PopCountVI src));
 9318   match(Set dst (PopCountVL src));
 9319   format %{ "vector_popcount_integral $dst, $src" %}
 9320   ins_encode %{
 9321     int opcode = this->ideal_Opcode();
 9322     int vlen_enc = vector_length_encoding(this, $src);
 9323     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9324     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9325   %}
 9326   ins_pipe( pipe_slow );
 9327 %}
 9328 
// Masked vector population count: lanes where the mask is clear keep the
// original src value (dst is pre-loaded with src, then merge-masked).
instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
  predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
  match(Set dst (PopCountVI src mask));
  match(Set dst (PopCountVL src mask));
  format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    // Copy src first so merge-masking preserves unselected lanes.
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9342 
 9343 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9344   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9345   match(Set dst (PopCountVI src));
 9346   match(Set dst (PopCountVL src));
 9347   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9348   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9349   ins_encode %{
 9350     int opcode = this->ideal_Opcode();
 9351     int vlen_enc = vector_length_encoding(this, $src);
 9352     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9353     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9354                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9355   %}
 9356   ins_pipe( pipe_slow );
 9357 %}
 9358 
 9359 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9360 
// Count trailing zeros per element for non-subword types on EVEX targets
// (gated by is_clz_non_subword_predicate_evex); unused temp slots of the
// shared emitter are passed as xnoreg/k0.
instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountTrailingZerosV src));
  effect(TEMP dst, TEMP xtmp, TEMP rtmp);
  ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                        xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9376 
// Count trailing zeros per short element on AVX512CD targets; needs more
// vector temps than the non-subword variant above.
instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
            VM_Version::supports_avx512cd() &&
            (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
  match(Set dst (CountTrailingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
  ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                        $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9393 
// Count trailing zeros per byte element on AVX512VL+BW targets; the byte
// case needs the full temp set including an opmask register.
instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
  match(Set dst (CountTrailingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
  ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
                                        $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9409 
// Count trailing zeros per element, AVX fallback for sub-64-byte vectors on
// targets without AVX512VL.
instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
  match(Set dst (CountTrailingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9423 
 9424 
 9425 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9426 
// Three-input bitwise ternary logic (vpternlogd): dst receives the boolean
// function selected by the 8-bit immediate applied to (dst, src2, src3).
instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
  match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
  effect(TEMP dst);
  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9437 
// Ternary logic with the third operand folded from memory; restricted to
// vectors wider than 8 bytes.
instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
  predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
  match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
  effect(TEMP dst);
  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
  ins_encode %{
    int vector_len = vector_length_encoding(this);
    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9449 
 9450 // --------------------------------- Rotation Operations ----------------------------------
// Vector rotate left/right by an immediate shift count; direction is
// recovered from the ideal opcode inside the emitter.
instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
  match(Set dst (RotateLeftV src shift));
  match(Set dst (RotateRightV src shift));
  format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
  ins_encode %{
    int opcode      = this->ideal_Opcode();
    int vector_len  = vector_length_encoding(this);
    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
    __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9463 
// Vector rotate left/right with per-element variable shift counts;
// direction is recovered from the ideal opcode inside the emitter.
instruct vprorate(vec dst, vec src, vec shift) %{
  match(Set dst (RotateLeftV src shift));
  match(Set dst (RotateRightV src shift));
  format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
  ins_encode %{
    int opcode      = this->ideal_Opcode();
    int vector_len  = vector_length_encoding(this);
    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
    __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9476 
 9477 // ---------------------------------- Masked Operations ------------------------------------
// Masked vector load using an XMM-register mask (AVX vmaskmov family) when
// the mask is not a predicate-register vectmask.
instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
  predicate(!n->in(3)->bottom_type()->isa_vectmask());
  match(Set dst (LoadVectorMasked mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
  ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
    int vlen_enc = vector_length_encoding(this);
    __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9489 
 9490 
// Masked vector load using an AVX-512 opmask (k) register when the mask is
// a predicate-register vectmask.
instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
  predicate(n->in(3)->bottom_type()->isa_vectmask());
  match(Set dst (LoadVectorMasked mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
  ins_encode %{
    BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
    int vector_len = vector_length_encoding(this);
    // false => load direction (register <- memory) in evmovdqu.
    __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9502 
// Masked vector store using an XMM-register mask. Length/type are taken
// from the source value node, since "this" is the memory (store) node.
instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
  predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
  match(Set mem (StoreVectorMasked mem (Binary src mask)));
  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
  ins_encode %{
    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    int vlen_enc = vector_length_encoding(src_node);
    BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
    __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9515 
// Masked vector store using an AVX-512 opmask (k) register. Length/type are
// taken from the source value node, since "this" is the store node.
instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
  predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
  match(Set mem (StoreVectorMasked mem (Binary src mask)));
  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
  ins_encode %{
    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
    int vlen_enc = vector_length_encoding(src_node);
    // true => store direction (memory <- register) in evmovdqu.
    __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9528 
 9529 #ifdef _LP64
// Debug-only runtime check: stops the VM if any of the mask bits of a
// vector memory address are set (i.e. the access is misaligned).
instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
  match(Set addr (VerifyVectorAlignment addr mask));
  effect(KILL cr);
  format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
  ins_encode %{
    Label Lskip;
    // check if masked bits of addr are zero
    __ testq($addr$$Register, $mask$$constant);
    __ jccb(Assembler::equal, Lskip);
    __ stop("verify_vector_alignment found a misaligned vector memory access");
    __ bind(Lskip);
  %}
  ins_pipe(pipe_slow);
%}
 9544 
// Masked vector compare: returns -1 if src1 and src2 are equal in every
// lane selected by mask, otherwise the index of the first mismatching lane.
instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
  effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
  ins_encode %{
    assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
    assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");

    Label DONE;
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // ktmp2 = ~mask: lanes outside the mask count as "equal" below.
    __ knotql($ktmp2$$KRegister, $mask$$KRegister);
    // Preload the all-equal result.
    __ mov64($dst$$Register, -1L);
    // ktmp1 = per-lane equality of src1/src2, restricted to mask.
    __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
    // kortest sets CF when (ktmp2 | ktmp1) is all ones, i.e. every masked
    // lane compared equal -> keep -1 and exit.
    __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
    __ jccb(Assembler::carrySet, DONE);
    // Otherwise: first zero bit of ktmp1 is the first mismatching lane.
    __ kmovql($dst$$Register, $ktmp1$$KRegister);
    __ notq($dst$$Register);
    __ tzcntq($dst$$Register, $dst$$Register);
    __ bind(DONE);
  %}
  ins_pipe( pipe_slow );
%}
 9569 
 9570 
// Generate an opmask with the low $len bits set, from a runtime length.
instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
  match(Set dst (VectorMaskGen len));
  effect(TEMP temp, KILL cr);
  format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
  ins_encode %{
    __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
  %}
  ins_pipe( pipe_slow );
%}
 9580 
// Generate an opmask with the low $len bits set, from a compile-time
// constant length.
instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
  match(Set dst (VectorMaskGen len));
  format %{ "vector_mask_gen $len \t! vector mask generator" %}
  effect(TEMP temp);
  ins_encode %{
    // NOTE(review): assumes 1 <= $len <= 64; $len == 0 would shift by 64,
    // which is undefined in C++ — presumably ruled out by the matcher.
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 -$len$$constant)));
    __ kmovql($dst$$KRegister, $temp$$Register);
  %}
  ins_pipe( pipe_slow );
%}
 9591 
// Convert a predicate-register vector mask into a long bitmask in a GPR.
instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask());
  match(Set dst (VectorMaskToLong mask));
  effect(TEMP dst, KILL cr);
  format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int mask_size = mask_len * type2aelembytes(mbt);
    int vlen_enc = vector_length_encoding(this, $mask);
    // dst doubles as the scratch register for the shared emitter.
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
                             $dst$$Register, mask_len, mask_size, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9608 
// Convert a boolean-vector mask (held in an XMM register, not a predicate
// register) into a long bitmask in a GPR.
instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskToLong mask));
  format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $dst$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9624 
// Same as vmask_tolong_bool, but matches through an intervening
// VectorStoreMask so the store-mask conversion is folded away.
instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
  format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $dst$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9640 
// Count the number of set lanes in a predicate-register vector mask.
instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask());
  match(Set dst (VectorMaskTrueCount mask));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int mask_size = mask_len * type2aelembytes(mbt);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
                             $tmp$$Register, mask_len, mask_size, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9657 
// Count the number of set lanes in a boolean-vector mask held in an XMM
// register (no predicate register available).
instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskTrueCount mask));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9673 
// AVX variant that matches the VectorStoreMask feeding the true-count
// directly, so the intermediate boolean-vector store is folded away.
// Predicate checks the grandchild (the mask before VectorStoreMask) is not
// a vect-mask type.
instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9689 
// Index of the first or last set lane of an opmask-register mask (both
// ideal nodes share one rule; ideal_Opcode() distinguishes them in the
// encode block). Result is an int lane index in $dst.
instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask());
  match(Set dst (VectorMaskFirstTrue mask));
  match(Set dst (VectorMaskLastTrue mask));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    // Size in bytes of the mask's boolean-vector representation.
    int mask_size = mask_len * type2aelembytes(mbt);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
                             $tmp$$Register, mask_len, mask_size, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9707 
// First/last set-lane index for a mask held as a boolean vector in an XMM
// register (non-vect-mask input). $xtmp is XMM scratch, $tmp is GPR scratch.
instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskFirstTrue mask));
  match(Set dst (VectorMaskLastTrue mask));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9724 
// AVX variant folding the intermediate VectorStoreMask: matches
// FirstTrue/LastTrue applied directly to (VectorStoreMask mask size).
instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
  match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
  match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9741 
 9742 // --------------------------------- Compress/Expand Operations ---------------------------
 9743 #ifdef _LP64
// AVX2 fallback for vector compress/expand: used when AVX512VL is absent
// and the vector fits in <= 32 bytes. Needs a permutation table vector,
// an XMM temp, and two GPR temps; emitted by the shared
// vector_compress_expand_avx2 macro-assembler routine.
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (CompressV src mask));
  match(Set dst (ExpandV src mask));
  effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType bt  = Matcher::vector_element_basic_type(this);
    __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
                                   $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9759 #endif
 9760 
// EVEX compress/expand with a true opmask: available with AVX512VL or for
// full 64-byte vectors. No temps needed — the hardware instruction does
// the work directly.
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
  match(Set dst (CompressV src mask));
  match(Set dst (ExpandV src mask));
  format %{ "vector_compress_expand $dst, $src, $mask" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vector_len = vector_length_encoding(this);
    BasicType bt  = Matcher::vector_element_basic_type(this);
    // NOTE(review): the 'false' argument presumably selects non-merge
    // (zero) masking in vector_compress_expand — confirm against the
    // macro-assembler signature.
    __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
 9774 
// Compress a mask itself (CompressM): packs the set bits of the source
// opmask into the low positions of the destination opmask. The assert in
// the encode block documents that CompressM is only formed for true
// vect-mask inputs.
instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
  match(Set dst (CompressM mask));
  effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
  format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
  ins_encode %{
    assert(this->in(1)->bottom_type()->isa_vectmask(), "");
    int mask_len = Matcher::vector_length(this);
    __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}
 9786 
 9787 #endif // _LP64
 9788 
 9789 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9790 
// Reverse the bits within each lane (ReverseV) without GFNI support;
// falls back to the table/shift sequence in vector_reverse_bit, which
// needs two XMM temps and a GPR temp.
instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
  predicate(!VM_Version::supports_gfni());
  match(Set dst (ReverseV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                          $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9804 
// Bit reversal using GFNI: a single affine transform against the constant
// matrix 0x8040201008040201 reverses the bits of each byte; the helper
// handles the per-element-type byte reordering.
instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
  predicate(VM_Version::supports_gfni());
  match(Set dst (ReverseV src));
  effect(TEMP dst, TEMP xtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt  = Matcher::vector_element_basic_type(this);
    // Bit-reversal matrix constant for VGF2P8AFFINEQB, materialized in the
    // constant table.
    InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
    __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
                               $xtmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
 9819 
// Reverse the bytes within each lane (ReverseBytesV) when AVX512BW is
// available or the vector is under 64 bytes — the plain shuffle-based
// path with no temps beyond dst.
instruct vreverse_byte_reg(vec dst, vec src) %{
  predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
  match(Set dst (ReverseBytesV src));
  effect(TEMP dst);
  format %{ "vector_reverse_byte $dst, $src" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9832 
// Byte reversal for 64-byte vectors without AVX512BW (no 512-bit byte
// shuffle available), using the temp-heavy vector_reverse_byte64 sequence.
instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
  predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
  match(Set dst (ReverseBytesV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
  ins_encode %{
    int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                             $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9846 
 9847 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9848 
// Count leading zeros per lane for int/long element types on EVEX targets
// (the is_clz_non_subword_predicate_evex helper gates on element type and
// vector size). No temps are needed: the xnoreg/k0/noreg arguments tell the
// helper its scratch inputs are unused on this path.
instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountLeadingZerosV src));
  format %{ "vector_count_leading_zeros $dst, $src" %}
  ins_encode %{
     int vlen_enc = vector_length_encoding(this, $src);
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                        xnoreg, xnoreg, k0, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9862 
// Predicated (masked) variant of the int/long EVEX CLZ rule. The source is
// first copied into dst so that lanes left untouched by the masked
// operation retain the pre-existing values.
instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
  predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
                                              Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (CountLeadingZerosV src mask));
  format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    // Pre-copy src so masked-off lanes keep their original values.
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
                                       xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9877 
// CLZ for short elements on EVEX targets with AVX512CD (and VL or full
// 64-byte vectors). Subword lanes need two XMM temps for widening/
// narrowing around the hardware lzcnt.
instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
            VM_Version::supports_avx512cd() &&
            (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9893 
// CLZ for byte elements with AVX512VL+BW: the most temp-hungry variant —
// three XMM temps, an opmask temp, and a pointer-sized GPR temp for the
// helper's table lookups.
instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
                                       $rtmp$$Register, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9908 
// AVX (non-EVEX) CLZ for int elements on sub-64-byte vectors; the int path
// of vector_count_leading_zeros_avx needs no GPR temp (noreg).
instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9923 
// AVX (non-EVEX) CLZ for all non-int element types on sub-64-byte vectors;
// unlike the int rule above, this path also needs a pointer GPR temp.
instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
  predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
            !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
  match(Set dst (CountLeadingZerosV src));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                      $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9938 
 9939 // ---------------------------------- Vector Masked Operations ------------------------------------
 9940 
// Masked vector add, register-register form, for all element types:
// lanes selected by $mask get dst+src2; the 'true' argument to
// evmasked_op presumably selects merge masking so other lanes keep dst —
// NOTE(review): confirm against evmasked_op's signature.
instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (AddVB (Binary dst src2) mask));
  match(Set dst (AddVS (Binary dst src2) mask));
  match(Set dst (AddVI (Binary dst src2) mask));
  match(Set dst (AddVL (Binary dst src2) mask));
  match(Set dst (AddVF (Binary dst src2) mask));
  match(Set dst (AddVD (Binary dst src2) mask));
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9958 
// Memory-operand form of vadd_reg_masked: folds the LoadVector of the
// second operand into the masked add.
instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9976 
// Masked vector XOR, register-register form (XorV covers all integral
// element types at the ideal level).
instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (XorV (Binary dst src2) mask));
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
 9989 
// Memory-operand form of vxor_reg_masked: second operand loaded as part of
// the masked XOR.
instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
  format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10002 
// Masked vector OR, register-register form.
instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (OrV (Binary dst src2) mask));
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10015 
// Memory-operand form of vor_reg_masked.
instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
  format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10028 
// Masked vector AND, register-register form.
instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (AndV (Binary dst src2) mask));
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10041 
// Memory-operand form of vand_reg_masked.
instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
  format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10054 
// Masked vector subtract, register-register form, for all element types.
instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst src2) mask));
  match(Set dst (SubVS (Binary dst src2) mask));
  match(Set dst (SubVI (Binary dst src2) mask));
  match(Set dst (SubVL (Binary dst src2) mask));
  match(Set dst (SubVF (Binary dst src2) mask));
  match(Set dst (SubVD (Binary dst src2) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10072 
// Memory-operand form of vsub_reg_masked.
instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10090 
// Masked vector multiply, register-register form. Note: no MulVB entry —
// byte multiplication has no direct masked instruction here.
instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst src2) mask));
  match(Set dst (MulVI (Binary dst src2) mask));
  match(Set dst (MulVL (Binary dst src2) mask));
  match(Set dst (MulVF (Binary dst src2) mask));
  match(Set dst (MulVD (Binary dst src2) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10107 
// Memory-operand form of vmul_reg_masked.
instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10124 
// Masked square root (unary, in place): SqrtVF/SqrtVD with dst as both
// source and destination, so $dst is passed for all three XMM slots of the
// shared binary-shaped evmasked_op helper.
instruct vsqrt_reg_masked(vec dst, kReg mask) %{
  match(Set dst (SqrtVF dst mask));
  match(Set dst (SqrtVD dst mask));
  format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10138 
// Masked vector divide, register-register form (floating-point only:
// DivVF/DivVD).
instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst src2) mask));
  match(Set dst (DivVD (Binary dst src2) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10152 
// Memory-operand form of vdiv_reg_masked.
instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10166 
10167 
// Masked rotate left/right by an 8-bit immediate count (both rotate
// directions share the rule; ideal_Opcode() disambiguates).
instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst shift) mask));
  match(Set dst (RotateRightV (Binary dst shift) mask));
  format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10181 
// Masked rotate left/right with a per-lane count vector in $src2.
instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst src2) mask));
  match(Set dst (RotateRightV (Binary dst src2) mask));
  format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10195 
// Masked left shift by immediate: the LShiftCntV wrapper marks a scalar
// shift count broadcast; here the count is a compile-time immI8 constant.
instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
  format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10210 
// Masked left shift with a uniform (non-variable) count held in a vector
// register; the trailing 'false' tells evmasked_op this is not a per-lane
// variable shift.
instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (LShiftVS (Binary dst src2) mask));
  match(Set dst (LShiftVI (Binary dst src2) mask));
  match(Set dst (LShiftVL (Binary dst src2) mask));
  format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}
10226 
// Masked left shift with per-lane variable counts (is_var_shift); the
// trailing 'true' selects the variable-shift encoding in evmasked_op.
instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (LShiftVS (Binary dst src2) mask));
  match(Set dst (LShiftVI (Binary dst src2) mask));
  match(Set dst (LShiftVL (Binary dst src2) mask));
  format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}
10242 
// Masked arithmetic right shift by immediate (RShiftV* = signed shift).
instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10257 
// Masked arithmetic right shift with a uniform register count (trailing
// 'false' = not a per-lane variable shift).
instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (RShiftVS (Binary dst src2) mask));
  match(Set dst (RShiftVI (Binary dst src2) mask));
  match(Set dst (RShiftVL (Binary dst src2) mask));
  format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}
10273 
// Masked arithmetic right shift with per-lane variable counts (trailing
// 'true' = variable-shift encoding).
instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (RShiftVS (Binary dst src2) mask));
  match(Set dst (RShiftVI (Binary dst src2) mask));
  match(Set dst (RShiftVL (Binary dst src2) mask));
  format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}
10289 
// Masked logical (unsigned) right shift by an 8-bit immediate count
// for short/int/long vectors; the count comes in via RShiftCntV.
instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    // Immediate-shift overload of evmasked_op; merge-with-dst == true.
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10304 
// Masked logical (unsigned) right shift with a broadcast (uniform) count in src2.
// Guarded by !is_var_shift(); the per-lane variant is vurshiftv_reg_masked.
instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(!n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    // Trailing bools: merge-with-dst == true, is_varshift == false (mirrors predicate).
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
  %}
  ins_pipe( pipe_slow );
%}
10320 
// Masked logical (unsigned) right shift with variable (per-lane) counts in src2.
// Guarded by is_var_shift().
instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
  predicate(n->as_ShiftV()->is_var_shift());
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    // Trailing bools: merge-with-dst == true, is_varshift == true (mirrors predicate).
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
  %}
  ins_pipe( pipe_slow );
%}
10336 
// Masked element-wise vector max, register-register form: dst = max(dst, src2)
// under mask.
instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10349 
// Masked element-wise vector max with the second operand loaded from memory
// (folds the LoadVector into the instruction).
instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10362 
// Masked element-wise vector min, register-register form: dst = min(dst, src2)
// under mask.
instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10375 
// Masked element-wise vector min with the second operand loaded from memory
// (folds the LoadVector into the instruction).
instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10388 
// Masked vector rearrange (permute dst lanes by the shuffle indices in src2).
// Note: unlike the arithmetic masked rules above, the merge flag passed to
// evmasked_op is false here.
instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10401 
// Masked element-wise absolute value for byte/short/int/long vectors.
// Unary op: dst is passed as both source operands of the evmasked_op helper.
instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10417 
// Masked fused multiply-add for float/double vectors: dst = dst*src2 + src3
// under mask. Requires the UseFMA flag (asserted in the encoding).
instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10432 
// Masked fused multiply-add with the addend (src3) folded from memory.
instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10447 
// Masked vector compare producing an opmask register (EVEX only).
// $cond is C2's BoolTest predicate constant; $mask is applied as the
// write-mask of the compare, so lanes cleared in $mask stay 0 in $dst.
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Dispatch on the element type: integral types translate the BoolTest
    // condition to an integer ComparisonPredicate (signed vs unsigned chosen
    // from the condition itself) and use evpcmp{b,w,d,q}; floating-point
    // types use the FP predicate encodings with evcmpps/evcmppd.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}
10497 
// MaskAll: broadcast an int condition into all lanes of an opmask register,
// for mask lengths of at most 32.
instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}
10508 
10509 #ifdef _LP64
// Mask NOT via XOR with MaskAll(-1), for mask lengths < 8 (needs AVX512DQ).
// Uses a GPR and a spare opmask as scratch; dst is TEMP_DEF so it may not
// alias the inputs during expansion.
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}
10521 
// Mask NOT via XOR with MaskAll(-1), for lengths where a plain knot suffices:
// exactly 8 with AVX512DQ, exactly 16 always, or > 16 with AVX512BW.
instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
10534 
// Convert a long bitmask to a boolean mask vector (non-opmask layout),
// AVX path for mask lengths <= 8.
// NOTE(review): xtmp is declared as TEMP but xnoreg is passed to the helper —
// presumably the vector scratch is only needed for lengths > 8; confirm
// against vector_long_to_maskvec before relying on xtmp here.
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
10548 
10549 
// Convert a long bitmask to a boolean mask vector (non-opmask layout),
// AVX path for mask lengths in (8, 32]; uses xtmp1 as vector scratch and
// kills flags.
instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}
10564 
// Convert a long bitmask directly into an opmask register (EVEX path):
// a single kmov from the GPR.
instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t!" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
10574 #endif
10575 
// Bitwise AND/OR/XOR of two opmask registers.
// Both inputs are asserted to have the same mask type; masklen is widened to
// 16 when < 16 without AVX512DQ, since the byte-granular k-instructions
// (kandb etc.) require DQ.
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
10592 
// Masked three-input ternary logic (vpternlog): $func is the 8-bit truth
// table selecting the boolean function of (dst, src2, src3).
instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10604 
// Masked ternary logic with the third operand folded from memory.
instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10616 
// CastVV on an opmask register: a zero-size, zero-cost no-op that only
// informs the register allocator/type system; emits no code.
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}
10627 
// CastVV on a vector register: zero-size no-op (no code emitted).
instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}
10638 
// CastVV on a legacy vector register class: zero-size no-op (no code emitted).
instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}
10649 
// IsInfiniteF: classify a scalar float with vfpclassss and move the resulting
// opmask bit into a GPR. Imm8 0x18 selects the infinity classes (+inf | -inf),
// matching the IsInfiniteF contract.
instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
10661 
// IsInfiniteD: same as the float variant but classifies a scalar double
// with vfpclasssd (imm8 0x18 = +inf | -inf classes).
instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
10673 
// Signed saturating add/sub for subword (byte/short) vectors, reg-reg form.
// Subword signed saturation maps directly onto padds/psubs-style instructions,
// so no temps are needed (final bool false == signed).
instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10689 
// Unsigned saturating add/sub for subword (byte/short) vectors, reg-reg form
// (final bool true == unsigned).
instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10705 
// Signed saturating add/sub for int/long vectors, EVEX path (512-bit vectors,
// or any length with AVX512VL). No hardware saturating instruction exists for
// dword/qword, so the helper synthesizes it using two vector and two opmask
// temporaries.
instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
  format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                                        $src1$$XMMRegister, $src2$$XMMRegister,
                                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
                                        $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10725 
// Signed saturating add/sub for int/long vectors, AVX fallback path
// (<= 256-bit vectors without AVX512VL); needs four vector temporaries
// since no opmask registers are available.
instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingAddV src1 src2));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
  format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
                                       $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
                                       $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10744 
// Unsigned saturating add for int/long vectors, EVEX path.
instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingAddV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
  format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10761 
// Unsigned saturating add for int/long vectors, AVX fallback path
// (<= 256-bit, no AVX512VL).
instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingAddV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
  format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                             $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10778 
// Unsigned saturating subtract for int/long vectors, EVEX path; only one
// opmask scratch is needed (no vector temps).
instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP ktmp);
  format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
                                              $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10795 
// Unsigned saturating subtract for int/long vectors, AVX fallback path
// (<= 256-bit, no AVX512VL); two vector temps replace the opmask scratch.
instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
%{
  predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
            Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
  match(Set dst (SaturatingSubV src1 src2));
  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
                                             $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10812 
// Signed saturating add/sub for subword vectors with src2 folded from memory.
instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 (LoadVector src2)));
  match(Set dst (SaturatingSubV src1 (LoadVector src2)));
  format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10828 
// Unsigned saturating add/sub for subword vectors with src2 folded from memory.
instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
%{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV src1 (LoadVector src2)));
  match(Set dst (SaturatingSubV src1 (LoadVector src2)));
  format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
                            $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10844 
// Mask-predicated signed saturating add/sub for subword vectors, reg-reg form.
// Helper bools: is_unsigned == false, merge == true.
instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst src) mask));
  match(Set dst (SaturatingSubV (Binary dst src) mask));
  format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10859 
// Mask-predicated unsigned saturating add/sub for subword vectors, reg-reg form.
// Helper bools: is_unsigned == true, merge == true.
instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst src) mask));
  match(Set dst (SaturatingSubV (Binary dst src) mask));
  format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10874 
// Mask-predicated signed saturating add/sub, src folded from memory.
instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
  match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
  format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10889 
// Mask-predicated unsigned saturating add/sub, src folded from memory.
instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
  predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
            n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
  match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
  match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
  format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
    __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
                              $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
10904 
// SelectFromTwoVector: pick lanes from src1/src2 according to the index
// vector; the index register doubles as the destination (in-place update).
instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
%{
  match(Set index (SelectFromTwoVector (Binary index src1) src2));
  format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
  %}
  ins_pipe(pipe_slow);
%}
10916 
// Bitwise reinterpretation of a 16-bit short held in a GPR as a half-float in an
// XMM register: a raw 16-bit move (vmovw), no value conversion is performed.
instruct reinterpretS2HF(regF dst, rRegI src)
%{
  match(Set dst (ReinterpretS2HF src));
  format %{ "vmovw $dst, $src" %}
  ins_encode %{
    __ vmovw($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe(pipe_slow);
%}
10926 
// Fused pattern: float -> half-float conversion whose short result is immediately
// reinterpreted back as a half-float register value. Matching the composed tree
// lets a single vcvtps2ph do the job, skipping the intermediate GPR round-trip
// that separate ConvF2HF + ReinterpretS2HF rules would produce.
instruct convF2HFAndS2HF(regF dst, regF src)
%{
  match(Set dst (ReinterpretS2HF (ConvF2HF src)));
  format %{ "convF2HFAndS2HF $dst, $src" %}
  ins_encode %{
    // 0x04 is the rounding-control immediate for vcvtps2ph — NOTE(review):
    // per Intel SDM this selects MXCSR-based rounding; confirm intended mode.
    __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
  %}
  ins_pipe(pipe_slow);
%}
10936 
// Fused pattern: half-float reinterpreted to short and then widened to float.
// Matching the composed tree lets a single vcvtph2ps perform the widening
// directly between XMM registers, avoiding a GPR round-trip.
instruct convHF2SAndHF2F(regF dst, regF src)
%{
  match(Set dst (ConvHF2F (ReinterpretHF2S src)));
  format %{ "convHF2SAndHF2F $dst, $src" %}
  ins_encode %{
    __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
  %}
  ins_pipe(pipe_slow);
%}
10946 
// Bitwise reinterpretation of a half-float in an XMM register as a 16-bit short in
// a GPR: inverse of reinterpretS2HF above, again a raw vmovw with no conversion.
instruct reinterpretHF2S(rRegI dst, regF src)
%{
  match(Set dst (ReinterpretHF2S src));
  format %{ "vmovw $dst, $src" %}
  ins_encode %{
    __ vmovw($dst$$Register, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}
10956 
// Scalar half-precision (FP16) square root via vsqrtsh (AVX512-FP16 scalar sqrt).
instruct scalar_sqrt_HF_reg(regF dst, regF src)
%{
  match(Set dst (SqrtHF src));
  format %{ "scalar_sqrt_fp16 $dst, $src" %}
  ins_encode %{
    __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}
10966 
// Scalar half-precision (FP16) binary arithmetic: one pattern covers add, div,
// max, min, mul and sub; the encoder dispatches on the matched ideal opcode.
instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
%{
  match(Set dst (AddHF src1 src2));
  match(Set dst (DivHF src1 src2));
  match(Set dst (MaxHF src1 src2));
  match(Set dst (MinHF src1 src2));
  match(Set dst (MulHF src1 src2));
  match(Set dst (SubHF src1 src2));
  format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
  ins_encode %{
    // Macro-assembler helper selects the concrete FP16 scalar instruction
    // (vaddsh/vdivsh/...) from the ideal opcode.
    int opcode = this->ideal_Opcode();
    __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}
10982 
// Scalar half-precision fused multiply-add: dst = dst * src1 + src2, mapped onto
// vfmadd132sh whose operand form (dst = dst*op3 + op2) matches this ideal tree
// with dst serving as both multiplicand and destination.
instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
%{
  match(Set dst (FmaHF  src2 (Binary dst src1)));
  effect(DEF dst);
  format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
  ins_encode %{
    __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}