1 //
    2 // Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved.
    3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4 //
    5 // This code is free software; you can redistribute it and/or modify it
    6 // under the terms of the GNU General Public License version 2 only, as
    7 // published by the Free Software Foundation.
    8 //
    9 // This code is distributed in the hope that it will be useful, but WITHOUT
   10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   12 // version 2 for more details (a copy is included in the LICENSE file that
   13 // accompanied this code).
   14 //
   15 // You should have received a copy of the GNU General Public License version
   16 // 2 along with this work; if not, write to the Free Software Foundation,
   17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   18 //
   19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   20 // or visit www.oracle.com if you need additional information or have any
   21 // questions.
   22 //
   23 //
   24 
   25 // X86 Common Architecture Description File
   26 
   27 //----------REGISTER DEFINITION BLOCK------------------------------------------
   28 // This information is used by the matcher and the register allocator to
   29 // describe individual registers and classes of registers within the target
   30 // architecture.
   31 
   32 register %{
   33 //----------Architecture Description Register Definitions----------------------
   34 // General Registers
   35 // "reg_def"  name ( register save type, C convention save type,
   36 //                   ideal register type, encoding );
   37 // Register Save Types:
   38 //
   39 // NS  = No-Save:       The register allocator assumes that these registers
   40 //                      can be used without saving upon entry to the method, &
   41 //                      that they do not need to be saved at call sites.
   42 //
   43 // SOC = Save-On-Call:  The register allocator assumes that these registers
   44 //                      can be used without saving upon entry to the method,
   45 //                      but that they must be saved at call sites.
   46 //
   47 // SOE = Save-On-Entry: The register allocator assumes that these registers
   48 //                      must be saved before using them upon entry to the
   49 //                      method, but they do not need to be saved at call
   50 //                      sites.
   51 //
   52 // AS  = Always-Save:   The register allocator assumes that these registers
   53 //                      must be saved before using them upon entry to the
   54 //                      method, & that they must be saved at call sites.
   55 //
   56 // Ideal Register Type is used to determine how to save & restore a
   57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
   58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
   59 //
   60 // The encoding number is the actual bit-pattern placed into the opcodes.
   61 
   62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
   63 // Word a in each register holds a Float, words ab hold a Double.
   64 // The whole registers are used in SSE4.2 version intrinsics,
   65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
   66 // UseXMMForArrayCopy and UseSuperword flags).
   67 // For pre EVEX enabled architectures:
   68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
   69 // For EVEX enabled architectures:
   70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
   71 //
   72 // Linux ABI:   No register preserved across function calls
   73 //              XMM0-XMM7 might hold parameters
   74 // Windows ABI: XMM6-XMM15 preserved across function calls
   75 //              XMM0-XMM3 might hold parameters
   76 
   77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
   78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
   79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
   80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
   81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
   82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
   83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
   84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
   85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
   86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
   87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
   88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
   89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
   90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
   91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
   92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
   93 
   94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
   95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
   96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
   97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
   98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
   99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
  103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
  104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
  105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
  106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
  107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
  108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
  109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
  110 
  111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
  120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
  121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
  122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
  123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
  124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
  125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
  126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
  127 
  128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
  129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
  130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
  131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
  132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
  133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
  134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
  135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
  136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
  137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
  138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
  139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
  140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
  141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
  142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
  143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
  144 
  145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
  146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
  147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
  148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
  149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
  150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
  151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
  152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
  153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
  154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
  155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
  156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
  157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
  158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
  159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
  160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
  161 
  162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
  163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
  164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
  165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
  166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
  167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
  168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
  169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
  170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
  171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
  172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
  173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
  174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
  175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
  176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
  177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
  178 
  179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
  180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
  181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
  182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
  183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
  184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
  185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
  186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
  187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
  188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
  189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
  190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
  191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
  192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
  193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
  194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
  195 
  196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
  197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
  198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
  199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
  200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
  201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
  202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
  203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
  204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
  205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
  206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
  207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
  208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
  209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
  210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
  211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
  212 
  213 #ifdef _LP64
  214 
  215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
  216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
  217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
  218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
  219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
  220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
  221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
  222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
  223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
  224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
  225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
  226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
  227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
  228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
  229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
  230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
  231 
  232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
  233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
  234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
  235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
  236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
  237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
  238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
  239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
  240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
  241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
  242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
  243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
  244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
  245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
  246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
  247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
  248 
  249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
  250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
  251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
  252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
  253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
  254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
  255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
  256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
  257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
  258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
  259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
  260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
  261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
  262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
  263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
  264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
  265 
  266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
  267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
  268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
  269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
  270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
  271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
  272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
  273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
  274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
  275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
  276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
  277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
  278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
  279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
  280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
  281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
  282 
  283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
  284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
  285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
  286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
  287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
  288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
  289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
  290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
  291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
  292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
  293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
  294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
  295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
  296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
  297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
  298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
  299 
  300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
  301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
  302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
  303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
  304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
  305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
  306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
  307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
  308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
  309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
  310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
  311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
  312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
  313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
  314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
  315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
  316 
  317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
  318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
  319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
  320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
  321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
  322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
  323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
  324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
  325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
  326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
  327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
  328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
  329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
  330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
  331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
  332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
  333 
  334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
  335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
  336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
  337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
  338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
  339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
  340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
  341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
  342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
  343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
  344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
  345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
  346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
  347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
  348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
  349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
  350 
  351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
  352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
  353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
  354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
  355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
  356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
  357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
  358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
  359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
  360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
  361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
  362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
  363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
  364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
  365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
  366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
  367 
  368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
  369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
  370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
  371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
  372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
  373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
  374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
  375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
  376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
  377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
  378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
  379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
  380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
  381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
  382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
  383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
  384 
  385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
  386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
  387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
  388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
  389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
  390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
  391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
  392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
  393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
  394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
  395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
  396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
  397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
  398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
  399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
  400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
  401 
  402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
  403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
  404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
  405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
  406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
  407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
  408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
  409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
  410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
  411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
  412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
  413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
  414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
  415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
  416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
  417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
  418 
  419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
  420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
  421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
  422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
  423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
  424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
  425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
  426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
  427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
  428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
  429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
  430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
  431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
  432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
  433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
  434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
  435 
  436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
  437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
  438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
  439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
  440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
  441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
  442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
  443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
  444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
  445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
  446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
  447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
  448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
  449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
  450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
  451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
  452 
  453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
  454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
  455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
  456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
  457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
  458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
  459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
  460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
  461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
  462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
  463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
  464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
  465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
  466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
  467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
  468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
  469 
  470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
  471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
  472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
  473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
  474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
  475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
  476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
  477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
  478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
  479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
  480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
  481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
  482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
  483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
  484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
  485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
  486 
  487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
  488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
  489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
  490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
  491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
  492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
  493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
  494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
  495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
  496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
  497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
  498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
  499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
  500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
  501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
  502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
  503 
  504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
  505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
  506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
  507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
  508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
  509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
  510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
  511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
  512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
  513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
  514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
  515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
  516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
  517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
  518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
  519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
  520 
  521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
  522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
  523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
  524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
  525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
  526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
  527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
  528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
  529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
  530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
  531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
  532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
  533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
  534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
  535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
  536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
  537 
  538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
  539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
  540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
  541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
  542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
  543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
  544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
  545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
  546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
  547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
  548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
  549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
  550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
  551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
  552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
  553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
  554 
  555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
  556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
  557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
  558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
  559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
  560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
  561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
  562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
  563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
  564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
  565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
  566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
  567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
  568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
  569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
  570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
  571 
  572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
  573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
  574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
  575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
  576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
  577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
  578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
  579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
  580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
  581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
  582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
  583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
  584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
  585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
  586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
  587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
  588 
  589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
  590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
  591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
  592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
  593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
  594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
  595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
  596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
  597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
  598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
  599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
  600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
  601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
  602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
  603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
  604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
  605 
  606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
  607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
  608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
  609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
  610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
  611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
  612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
  613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
  614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
  615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
  616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
  617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
  618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
  619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
  620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
  621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
  622 
  623 #endif // _LP64
  624 
  625 #ifdef _LP64
  626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
  627 #else
  628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
  629 #endif // _LP64
  630 
  631 // AVX3 Mask Registers.
  632 reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
  633 reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());
  634 
  635 reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
  636 reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());
  637 
  638 reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
  639 reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());
  640 
  641 reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
  642 reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());
  643 
  644 reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
  645 reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());
  646 
  647 reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
  648 reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());
  649 
  650 reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
  651 reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
  652 
  653 
  654 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
  655                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
  656                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
  657                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
  658                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
  659                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
  660                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
  661                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
  662 #ifdef _LP64
  663                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
  664                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
  665                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
  666                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
  667                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
  668                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
  669                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
  670                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
  671                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
  672                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
  673                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
  674                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
  675                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
  676                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
  677                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
  678                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
  679                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
  680                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
  681                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
  682                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
  683                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
  684                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
  685                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
  686                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
  687 #endif
  688                       );
  689 
  690 alloc_class chunk2(K7, K7_H,
  691                    K6, K6_H,
  692                    K5, K5_H,
  693                    K4, K4_H,
  694                    K3, K3_H,
  695                    K2, K2_H,
  696                    K1, K1_H);
  697 
  698 reg_class  vectmask_reg(K1, K1_H,
  699                         K2, K2_H,
  700                         K3, K3_H,
  701                         K4, K4_H,
  702                         K5, K5_H,
  703                         K6, K6_H,
  704                         K7, K7_H);
  705 
  706 reg_class vectmask_reg_K1(K1, K1_H);
  707 reg_class vectmask_reg_K2(K2, K2_H);
  708 reg_class vectmask_reg_K3(K3, K3_H);
  709 reg_class vectmask_reg_K4(K4, K4_H);
  710 reg_class vectmask_reg_K5(K5, K5_H);
  711 reg_class vectmask_reg_K6(K6, K6_H);
  712 reg_class vectmask_reg_K7(K7, K7_H);
  713 
  714 // flags allocation class should be last.
  715 alloc_class chunk3(RFLAGS);
  716 
  717 
  718 // Singleton class for condition codes
  719 reg_class int_flags(RFLAGS);
  720 
  721 // Class for pre evex float registers
  722 reg_class float_reg_legacy(XMM0,
  723                     XMM1,
  724                     XMM2,
  725                     XMM3,
  726                     XMM4,
  727                     XMM5,
  728                     XMM6,
  729                     XMM7
  730 #ifdef _LP64
  731                    ,XMM8,
  732                     XMM9,
  733                     XMM10,
  734                     XMM11,
  735                     XMM12,
  736                     XMM13,
  737                     XMM14,
  738                     XMM15
  739 #endif
  740                     );
  741 
  742 // Class for evex float registers
  743 reg_class float_reg_evex(XMM0,
  744                     XMM1,
  745                     XMM2,
  746                     XMM3,
  747                     XMM4,
  748                     XMM5,
  749                     XMM6,
  750                     XMM7
  751 #ifdef _LP64
  752                    ,XMM8,
  753                     XMM9,
  754                     XMM10,
  755                     XMM11,
  756                     XMM12,
  757                     XMM13,
  758                     XMM14,
  759                     XMM15,
  760                     XMM16,
  761                     XMM17,
  762                     XMM18,
  763                     XMM19,
  764                     XMM20,
  765                     XMM21,
  766                     XMM22,
  767                     XMM23,
  768                     XMM24,
  769                     XMM25,
  770                     XMM26,
  771                     XMM27,
  772                     XMM28,
  773                     XMM29,
  774                     XMM30,
  775                     XMM31
  776 #endif
  777                     );
  778 
  779 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
  780 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
  781 
  782 // Class for pre evex double registers
  783 reg_class double_reg_legacy(XMM0,  XMM0b,
  784                      XMM1,  XMM1b,
  785                      XMM2,  XMM2b,
  786                      XMM3,  XMM3b,
  787                      XMM4,  XMM4b,
  788                      XMM5,  XMM5b,
  789                      XMM6,  XMM6b,
  790                      XMM7,  XMM7b
  791 #ifdef _LP64
  792                     ,XMM8,  XMM8b,
  793                      XMM9,  XMM9b,
  794                      XMM10, XMM10b,
  795                      XMM11, XMM11b,
  796                      XMM12, XMM12b,
  797                      XMM13, XMM13b,
  798                      XMM14, XMM14b,
  799                      XMM15, XMM15b
  800 #endif
  801                      );
  802 
  803 // Class for evex double registers
  804 reg_class double_reg_evex(XMM0,  XMM0b,
  805                      XMM1,  XMM1b,
  806                      XMM2,  XMM2b,
  807                      XMM3,  XMM3b,
  808                      XMM4,  XMM4b,
  809                      XMM5,  XMM5b,
  810                      XMM6,  XMM6b,
  811                      XMM7,  XMM7b
  812 #ifdef _LP64
  813                     ,XMM8,  XMM8b,
  814                      XMM9,  XMM9b,
  815                      XMM10, XMM10b,
  816                      XMM11, XMM11b,
  817                      XMM12, XMM12b,
  818                      XMM13, XMM13b,
  819                      XMM14, XMM14b,
  820                      XMM15, XMM15b,
  821                      XMM16, XMM16b,
  822                      XMM17, XMM17b,
  823                      XMM18, XMM18b,
  824                      XMM19, XMM19b,
  825                      XMM20, XMM20b,
  826                      XMM21, XMM21b,
  827                      XMM22, XMM22b,
  828                      XMM23, XMM23b,
  829                      XMM24, XMM24b,
  830                      XMM25, XMM25b,
  831                      XMM26, XMM26b,
  832                      XMM27, XMM27b,
  833                      XMM28, XMM28b,
  834                      XMM29, XMM29b,
  835                      XMM30, XMM30b,
  836                      XMM31, XMM31b
  837 #endif
  838                      );
  839 
  840 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
  841 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
  842 
  843 // Class for pre evex 32bit vector registers
  844 reg_class vectors_reg_legacy(XMM0,
  845                       XMM1,
  846                       XMM2,
  847                       XMM3,
  848                       XMM4,
  849                       XMM5,
  850                       XMM6,
  851                       XMM7
  852 #ifdef _LP64
  853                      ,XMM8,
  854                       XMM9,
  855                       XMM10,
  856                       XMM11,
  857                       XMM12,
  858                       XMM13,
  859                       XMM14,
  860                       XMM15
  861 #endif
  862                       );
  863 
  864 // Class for evex 32bit vector registers
  865 reg_class vectors_reg_evex(XMM0,
  866                       XMM1,
  867                       XMM2,
  868                       XMM3,
  869                       XMM4,
  870                       XMM5,
  871                       XMM6,
  872                       XMM7
  873 #ifdef _LP64
  874                      ,XMM8,
  875                       XMM9,
  876                       XMM10,
  877                       XMM11,
  878                       XMM12,
  879                       XMM13,
  880                       XMM14,
  881                       XMM15,
  882                       XMM16,
  883                       XMM17,
  884                       XMM18,
  885                       XMM19,
  886                       XMM20,
  887                       XMM21,
  888                       XMM22,
  889                       XMM23,
  890                       XMM24,
  891                       XMM25,
  892                       XMM26,
  893                       XMM27,
  894                       XMM28,
  895                       XMM29,
  896                       XMM30,
  897                       XMM31
  898 #endif
  899                       );
  900 
  901 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
  902 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  903 
  904 // Class for all 64bit vector registers
  905 reg_class vectord_reg_legacy(XMM0,  XMM0b,
  906                       XMM1,  XMM1b,
  907                       XMM2,  XMM2b,
  908                       XMM3,  XMM3b,
  909                       XMM4,  XMM4b,
  910                       XMM5,  XMM5b,
  911                       XMM6,  XMM6b,
  912                       XMM7,  XMM7b
  913 #ifdef _LP64
  914                      ,XMM8,  XMM8b,
  915                       XMM9,  XMM9b,
  916                       XMM10, XMM10b,
  917                       XMM11, XMM11b,
  918                       XMM12, XMM12b,
  919                       XMM13, XMM13b,
  920                       XMM14, XMM14b,
  921                       XMM15, XMM15b
  922 #endif
  923                       );
  924 
  925 // Class for all 64bit vector registers
  926 reg_class vectord_reg_evex(XMM0,  XMM0b,
  927                       XMM1,  XMM1b,
  928                       XMM2,  XMM2b,
  929                       XMM3,  XMM3b,
  930                       XMM4,  XMM4b,
  931                       XMM5,  XMM5b,
  932                       XMM6,  XMM6b,
  933                       XMM7,  XMM7b
  934 #ifdef _LP64
  935                      ,XMM8,  XMM8b,
  936                       XMM9,  XMM9b,
  937                       XMM10, XMM10b,
  938                       XMM11, XMM11b,
  939                       XMM12, XMM12b,
  940                       XMM13, XMM13b,
  941                       XMM14, XMM14b,
  942                       XMM15, XMM15b,
  943                       XMM16, XMM16b,
  944                       XMM17, XMM17b,
  945                       XMM18, XMM18b,
  946                       XMM19, XMM19b,
  947                       XMM20, XMM20b,
  948                       XMM21, XMM21b,
  949                       XMM22, XMM22b,
  950                       XMM23, XMM23b,
  951                       XMM24, XMM24b,
  952                       XMM25, XMM25b,
  953                       XMM26, XMM26b,
  954                       XMM27, XMM27b,
  955                       XMM28, XMM28b,
  956                       XMM29, XMM29b,
  957                       XMM30, XMM30b,
  958                       XMM31, XMM31b
  959 #endif
  960                       );
  961 
  962 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
  963 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(C2_MacroAssembler *masm);
 1191   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // three 5 byte instructions plus one move for unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // The deopt handler is emitted as a push of the current pc followed by
    // a jump to the deopt blob (see emit_deopt_handler below), so size it
    // as a push plus a NativeJump.
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
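// Illustrative mapping (from the switch above):
//   vector_length_encoding(4/8/16) == Assembler::AVX_128bit
//   vector_length_encoding(32)     == Assembler::AVX_256bit
//   vector_length_encoding(64)     == Assembler::AVX_512bit
// The Node and operand overloads simply look up the vector length in bytes
// of the definition and delegate to the byte-size version.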
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
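// These platform-dependent flags are tested like any other node flag, e.g.
// MachNode::compute_padding() below checks
// (flags() & Node::PD::Flag_intel_jcc_erratum) to decide whether erratum
// padding is needed.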
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
 1307 // Emit exception handler code.
 1308 // Stuff framesize into a register and call a VM stub routine.
 1309 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   address base = __ start_a_stub(size_exception_handler());
 1314   if (base == nullptr) {
 1315     ciEnv::current()->record_failure("CodeCache is full");
 1316     return 0;  // CodeBuffer::expand failed
 1317   }
 1318   int offset = __ offset();
 1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1321   __ end_a_stub();
 1322   return offset;
 1323 }
 1324 
 1325 // Emit deopt handler code.
 1326 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1327 
 1328   // Note that the code buffer's insts_mark is always relative to insts.
 1329   // That's why we must use the macroassembler to generate a handler.
 1330   address base = __ start_a_stub(size_deopt_handler());
 1331   if (base == nullptr) {
 1332     ciEnv::current()->record_failure("CodeCache is full");
 1333     return 0;  // CodeBuffer::expand failed
 1334   }
 1335   int offset = __ offset();
 1336 
 1337 #ifdef _LP64
 1338   address the_pc = (address) __ pc();
 1339   Label next;
 1340   // push a "the_pc" on the stack without destroying any registers
 1341   // as they all may be live.
 1342 
 1343   // push address of "next"
 1344   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1345   __ bind(next);
 1346   // adjust it so it matches "the_pc"
 1347   __ subptr(Address(rsp, 0), __ offset() - offset);
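  // At this point [rsp] holds the address of "next"; subtracting the number
  // of bytes emitted since "offset" (the 5 byte call above) rewinds the saved
  // value so that it equals the_pc.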
 1348 #else
 1349   InternalAddress here(__ pc());
 1350   __ pushptr(here.addr(), noreg);
 1351 #endif
 1352 
 1353   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1354   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1355   __ end_a_stub();
 1356   return offset;
 1357 }
 1358 
 1359 static Assembler::Width widthForType(BasicType bt) {
 1360   if (bt == T_BYTE) {
 1361     return Assembler::B;
 1362   } else if (bt == T_SHORT) {
 1363     return Assembler::W;
 1364   } else if (bt == T_INT) {
 1365     return Assembler::D;
 1366   } else {
 1367     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1368     return Assembler::Q;
 1369   }
 1370 }
 1371 
 1372 //=============================================================================
 1373 
 1374   // Float masks come from different places depending on platform.
 1375 #ifdef _LP64
 1376   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1377   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1378   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1379   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1380 #else
 1381   static address float_signmask()  { return (address)float_signmask_pool; }
 1382   static address float_signflip()  { return (address)float_signflip_pool; }
 1383   static address double_signmask() { return (address)double_signmask_pool; }
 1384   static address double_signflip() { return (address)double_signflip_pool; }
 1385 #endif
 1386   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1387   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1388   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1389   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1390   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1391   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1392   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1393   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1394   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1395   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1396   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1397   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1398   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1399   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1400   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1401 
 1402 //=============================================================================
 1403 bool Matcher::match_rule_supported(int opcode) {
 1404   if (!has_match_rule(opcode)) {
 1405     return false; // no match rule present
 1406   }
 1407   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1408   switch (opcode) {
 1409     case Op_AbsVL:
 1410     case Op_StoreVectorScatter:
 1411       if (UseAVX < 3) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountI:
 1416     case Op_PopCountL:
 1417       if (!UsePopCountInstruction) {
 1418         return false;
 1419       }
 1420       break;
 1421     case Op_PopCountVI:
 1422       if (UseAVX < 2) {
 1423         return false;
 1424       }
 1425       break;
 1426     case Op_CompressV:
 1427     case Op_ExpandV:
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
 1444       if (VM_Version::supports_avx512dq() == false) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
 1525       if (VM_Version::supports_on_spin_wait() == false) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
 1567       if (VM_Version::supports_avx() == false) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572     case Op_LoadVectorGatherMasked:
 1573       if (UseAVX < 2) {
 1574         return false;
 1575       }
 1576       break;
 1577     case Op_FmaF:
 1578     case Op_FmaD:
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
 1593       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
 1613       if (UseAVX < 3 || !is_LP64)  {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_SqrtF:
 1664       if (UseSSE < 1) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtD:
 1669 #ifdef _LP64
 1670       if (UseSSE < 2) {
 1671         return false;
 1672       }
 1673 #else
 1674       // x86_32.ad has a special match rule for SqrtD.
 1675       // Together with common x86 rules, this handles all UseSSE cases.
 1676 #endif
 1677       break;
 1678     case Op_ConvF2HF:
 1679     case Op_ConvHF2F:
 1680       if (!VM_Version::supports_float16()) {
 1681         return false;
 1682       }
 1683       break;
 1684     case Op_VectorCastF2HF:
 1685     case Op_VectorCastHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1687         return false;
 1688       }
 1689       break;
 1690   }
 1691   return true;  // Match rules are supported by default.
 1692 }
 1693 
 1694 //------------------------------------------------------------------------
 1695 
 1696 static inline bool is_pop_count_instr_target(BasicType bt) {
 1697   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1698          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1699 }
 1700 
 1701 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1702   return match_rule_supported_vector(opcode, vlen, bt);
 1703 }
 1704 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics, guarded by vector length (vlen) and element type (bt).
 1707 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1708   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1709   if (!match_rule_supported(opcode)) {
 1710     return false;
 1711   }
 1712   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1713   //   * SSE2 supports 128bit vectors for all types;
 1714   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1715   //   * AVX2 supports 256bit vectors for all types;
 1716   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1717   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1718   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1719   // And MaxVectorSize is taken into account as well.
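  // For example, a 256bit (8 element) INT vector requires at least AVX2,
  // while a 512bit (64 element) BYTE vector additionally requires AVX512BW.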
 1720   if (!vector_size_supported(bt, vlen)) {
 1721     return false;
 1722   }
 1723   // Special cases which require vector length follow:
 1724   //   * implementation limitations
 1725   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1726   //   * 128bit vroundpd instruction is present only in AVX1
 1727   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1728   switch (opcode) {
 1729     case Op_AbsVF:
 1730     case Op_NegVF:
 1731       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1732         return false; // 512bit vandps and vxorps are not available
 1733       }
 1734       break;
 1735     case Op_AbsVD:
 1736     case Op_NegVD:
 1737       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1738         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1739       }
 1740       break;
 1741     case Op_RotateRightV:
 1742     case Op_RotateLeftV:
 1743       if (bt != T_INT && bt != T_LONG) {
 1744         return false;
 1745       } // fallthrough
 1746     case Op_MacroLogicV:
 1747       if (!VM_Version::supports_evex() ||
 1748           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1749         return false;
 1750       }
 1751       break;
 1752     case Op_ClearArray:
 1753     case Op_VectorMaskGen:
 1754     case Op_VectorCmpMasked:
 1755       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1756         return false;
 1757       }
 1758       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1759         return false;
 1760       }
 1761       break;
 1762     case Op_LoadVectorMasked:
 1763     case Op_StoreVectorMasked:
 1764       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1765         return false;
 1766       }
 1767       break;
 1768     case Op_UMinV:
 1769     case Op_UMaxV:
 1770       if (UseAVX == 0) {
 1771         return false;
 1772       }
 1773       break;
 1774     case Op_MaxV:
 1775     case Op_MinV:
 1776       if (UseSSE < 4 && is_integral_type(bt)) {
 1777         return false;
 1778       }
 1779       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1780           // Float/Double intrinsics are enabled for AVX family currently.
 1781           if (UseAVX == 0) {
 1782             return false;
 1783           }
 1784           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1785             return false;
 1786           }
 1787       }
 1788       break;
 1789     case Op_CallLeafVector:
 1790       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1791         return false;
 1792       }
 1793       break;
 1794     case Op_AddReductionVI:
 1795       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1796         return false;
 1797       }
 1798       // fallthrough
 1799     case Op_AndReductionV:
 1800     case Op_OrReductionV:
 1801     case Op_XorReductionV:
 1802       if (is_subword_type(bt) && (UseSSE < 4)) {
 1803         return false;
 1804       }
 1805 #ifndef _LP64
 1806       if (bt == T_BYTE || bt == T_LONG) {
 1807         return false;
 1808       }
 1809 #endif
 1810       break;
 1811 #ifndef _LP64
 1812     case Op_VectorInsert:
 1813       if (bt == T_LONG || bt == T_DOUBLE) {
 1814         return false;
 1815       }
 1816       break;
 1817 #endif
 1818     case Op_MinReductionV:
 1819     case Op_MaxReductionV:
 1820       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1821         return false;
 1822       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1823         return false;
 1824       }
 1825       // Float/Double intrinsics enabled for AVX family.
 1826       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1827         return false;
 1828       }
 1829       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1830         return false;
 1831       }
 1832 #ifndef _LP64
 1833       if (bt == T_BYTE || bt == T_LONG) {
 1834         return false;
 1835       }
 1836 #endif
 1837       break;
 1838     case Op_VectorTest:
 1839       if (UseSSE < 4) {
 1840         return false; // Implementation limitation
 1841       } else if (size_in_bits < 32) {
 1842         return false; // Implementation limitation
 1843       }
 1844       break;
 1845     case Op_VectorLoadShuffle:
 1846     case Op_VectorRearrange:
      if (vlen == 2) {
 1848         return false; // Implementation limitation due to how shuffle is loaded
 1849       } else if (size_in_bits == 256 && UseAVX < 2) {
 1850         return false; // Implementation limitation
 1851       }
 1852       break;
 1853     case Op_VectorLoadMask:
 1854     case Op_VectorMaskCast:
 1855       if (size_in_bits == 256 && UseAVX < 2) {
 1856         return false; // Implementation limitation
 1857       }
 1858       // fallthrough
 1859     case Op_VectorStoreMask:
 1860       if (vlen == 2) {
 1861         return false; // Implementation limitation
 1862       }
 1863       break;
 1864     case Op_PopulateIndex:
 1865       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1866         return false;
 1867       }
 1868       break;
 1869     case Op_VectorCastB2X:
 1870     case Op_VectorCastS2X:
 1871     case Op_VectorCastI2X:
 1872       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1873         return false;
 1874       }
 1875       break;
 1876     case Op_VectorCastL2X:
 1877       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1878         return false;
 1879       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1880         return false;
 1881       }
 1882       break;
 1883     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1887         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1888         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1889           return false;
 1890         }
 1891       }
 1892       // fallthrough
 1893     case Op_VectorCastD2X:
 1894       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1895         return false;
 1896       }
 1897       break;
 1898     case Op_VectorCastF2HF:
 1899     case Op_VectorCastHF2F:
 1900       if (!VM_Version::supports_f16c() &&
 1901          ((!VM_Version::supports_evex() ||
 1902          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1903         return false;
 1904       }
 1905       break;
 1906     case Op_RoundVD:
 1907       if (!VM_Version::supports_avx512dq()) {
 1908         return false;
 1909       }
 1910       break;
 1911     case Op_MulReductionVI:
 1912       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1913         return false;
 1914       }
 1915       break;
 1916     case Op_LoadVectorGatherMasked:
 1917       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1918         return false;
 1919       }
 1920       if (is_subword_type(bt) &&
 1921          (!is_LP64                                                ||
 1922          (size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1923          (size_in_bits < 64)                                      ||
 1924          (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1925         return false;
 1926       }
 1927       break;
 1928     case Op_StoreVectorScatterMasked:
 1929     case Op_StoreVectorScatter:
 1930       if (is_subword_type(bt)) {
 1931         return false;
 1932       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1933         return false;
 1934       }
 1935       // fallthrough
 1936     case Op_LoadVectorGather:
 1937       if (!is_subword_type(bt) && size_in_bits == 64) {
 1938         return false;
 1939       }
 1940       if (is_subword_type(bt) && size_in_bits < 64) {
 1941         return false;
 1942       }
 1943       break;
 1944     case Op_SaturatingAddV:
 1945     case Op_SaturatingSubV:
 1946       if (UseAVX < 1) {
 1947         return false; // Implementation limitation
 1948       }
 1949       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1950         return false;
 1951       }
 1952       break;
 1953     case Op_SelectFromTwoVector:
 1954        if (size_in_bits < 128 || (size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1955          return false;
 1956        }
 1957        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1958          return false;
 1959        }
 1960        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1961          return false;
 1962        }
 1963        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1964          return false;
 1965        }
 1966        break;
 1967     case Op_MaskAll:
 1968       if (!VM_Version::supports_evex()) {
 1969         return false;
 1970       }
 1971       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1972         return false;
 1973       }
 1974       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1975         return false;
 1976       }
 1977       break;
 1978     case Op_VectorMaskCmp:
 1979       if (vlen < 2 || size_in_bits < 32) {
 1980         return false;
 1981       }
 1982       break;
 1983     case Op_CompressM:
 1984       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1985         return false;
 1986       }
 1987       break;
 1988     case Op_CompressV:
 1989     case Op_ExpandV:
 1990       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1991         return false;
 1992       }
 1993       if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
 1994         return false;
 1995       }
      if (size_in_bits < 128) {
 1997         return false;
 1998       }
 1999     case Op_VectorLongToMask:
 2000       if (UseAVX < 1 || !is_LP64) {
 2001         return false;
 2002       }
 2003       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 2004         return false;
 2005       }
 2006       break;
 2007     case Op_SignumVD:
 2008     case Op_SignumVF:
 2009       if (UseAVX < 1) {
 2010         return false;
 2011       }
 2012       break;
 2013     case Op_PopCountVI:
 2014     case Op_PopCountVL: {
 2015         if (!is_pop_count_instr_target(bt) &&
 2016             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 2017           return false;
 2018         }
 2019       }
 2020       break;
 2021     case Op_ReverseV:
 2022     case Op_ReverseBytesV:
 2023       if (UseAVX < 2) {
 2024         return false;
 2025       }
 2026       break;
 2027     case Op_CountTrailingZerosV:
 2028     case Op_CountLeadingZerosV:
 2029       if (UseAVX < 2) {
 2030         return false;
 2031       }
 2032       break;
 2033   }
  return true;  // Match rules are supported by default.
 2035 }
 2036 
 2037 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC based match_rule_supported routine checks for the existence of a
  // pattern based on the IR opcode. Most unary/binary/ternary masked operations
  // share the IR nodes of their non-masked counterparts, with the mask edge
  // being the differentiator.
 2041   // This routine does a strict check on the existence of masked operation patterns
 2042   // by returning a default false value for all the other opcodes apart from the
 2043   // ones whose masked instruction patterns are defined in this file.
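  // For example, masked Op_AddVB/Op_AddVS are only supported when AVX512BW is
  // available (see the corresponding case below), while opcodes without a
  // masked pattern in this file fall into the default branch and return false.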
 2044   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2045     return false;
 2046   }
 2047 
 2048   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2049   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2050   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2051     return false;
 2052   }
 2053   switch(opcode) {
 2054     // Unary masked operations
 2055     case Op_AbsVB:
 2056     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
 2060     case Op_AbsVI:
 2061     case Op_AbsVL:
 2062       return true;
 2063 
 2064     // Ternary masked operations
 2065     case Op_FmaVF:
 2066     case Op_FmaVD:
 2067       return true;
 2068 
 2069     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2071         return false;
 2072       }
 2073       return true;
 2074 
 2075     // Binary masked operations
 2076     case Op_AddVB:
 2077     case Op_AddVS:
 2078     case Op_SubVB:
 2079     case Op_SubVS:
 2080     case Op_MulVS:
 2081     case Op_LShiftVS:
 2082     case Op_RShiftVS:
 2083     case Op_URShiftVS:
 2084       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2085       if (!VM_Version::supports_avx512bw()) {
 2086         return false;  // Implementation limitation
 2087       }
 2088       return true;
 2089 
 2090     case Op_MulVL:
 2091       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2092       if (!VM_Version::supports_avx512dq()) {
 2093         return false;  // Implementation limitation
 2094       }
 2095       return true;
 2096 
 2097     case Op_AndV:
 2098     case Op_OrV:
 2099     case Op_XorV:
 2100     case Op_RotateRightV:
 2101     case Op_RotateLeftV:
 2102       if (bt != T_INT && bt != T_LONG) {
 2103         return false; // Implementation limitation
 2104       }
 2105       return true;
 2106 
 2107     case Op_VectorLoadMask:
 2108       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2109       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2110         return false;
 2111       }
 2112       return true;
 2113 
 2114     case Op_AddVI:
 2115     case Op_AddVL:
 2116     case Op_AddVF:
 2117     case Op_AddVD:
 2118     case Op_SubVI:
 2119     case Op_SubVL:
 2120     case Op_SubVF:
 2121     case Op_SubVD:
 2122     case Op_MulVI:
 2123     case Op_MulVF:
 2124     case Op_MulVD:
 2125     case Op_DivVF:
 2126     case Op_DivVD:
 2127     case Op_SqrtVF:
 2128     case Op_SqrtVD:
 2129     case Op_LShiftVI:
 2130     case Op_LShiftVL:
 2131     case Op_RShiftVI:
 2132     case Op_RShiftVL:
 2133     case Op_URShiftVI:
 2134     case Op_URShiftVL:
 2135     case Op_LoadVectorMasked:
 2136     case Op_StoreVectorMasked:
 2137     case Op_LoadVectorGatherMasked:
 2138     case Op_StoreVectorScatterMasked:
 2139       return true;
 2140 
 2141     case Op_UMinV:
 2142     case Op_UMaxV:
 2143       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2144         return false;
 2145       } // fallthrough
 2146     case Op_MaxV:
 2147     case Op_MinV:
 2148       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2149         return false; // Implementation limitation
 2150       }
 2151       if (is_floating_point_type(bt)) {
 2152         return false; // Implementation limitation
 2153       }
 2154       return true;
 2155     case Op_SaturatingAddV:
 2156     case Op_SaturatingSubV:
 2157       if (!is_subword_type(bt)) {
 2158         return false;
 2159       }
 2160       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2161         return false; // Implementation limitation
 2162       }
 2163       return true;
 2164 
 2165     case Op_VectorMaskCmp:
 2166       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2167         return false; // Implementation limitation
 2168       }
 2169       return true;
 2170 
 2171     case Op_VectorRearrange:
 2172       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2173         return false; // Implementation limitation
 2174       }
 2175       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2176         return false; // Implementation limitation
 2177       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2178         return false; // Implementation limitation
 2179       }
 2180       return true;
 2181 
 2182     // Binary Logical operations
 2183     case Op_AndVMask:
 2184     case Op_OrVMask:
 2185     case Op_XorVMask:
 2186       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2187         return false; // Implementation limitation
 2188       }
 2189       return true;
 2190 
 2191     case Op_PopCountVI:
 2192     case Op_PopCountVL:
 2193       if (!is_pop_count_instr_target(bt)) {
 2194         return false;
 2195       }
 2196       return true;
 2197 
 2198     case Op_MaskAll:
 2199       return true;
 2200 
 2201     case Op_CountLeadingZerosV:
 2202       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2203         return true;
 2204       }
 2205     default:
 2206       return false;
 2207   }
 2208 }
 2209 
 2210 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2211   return false;
 2212 }
 2213 
 2214 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2215 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2216   switch (elem_bt) {
 2217     case T_BYTE:  return false;
 2218     case T_SHORT: return !VM_Version::supports_avx512bw();
 2219     case T_INT:   return !VM_Version::supports_avx();
 2220     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2221     default:
 2222       ShouldNotReachHere();
 2223       return false;
 2224   }
 2225 }
 2226 
 2227 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2228   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2229   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2230   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2231       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2232     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2233     return new legVecZOper();
 2234   }
 2235   if (legacy) {
 2236     switch (ideal_reg) {
 2237       case Op_VecS: return new legVecSOper();
 2238       case Op_VecD: return new legVecDOper();
 2239       case Op_VecX: return new legVecXOper();
 2240       case Op_VecY: return new legVecYOper();
 2241       case Op_VecZ: return new legVecZOper();
 2242     }
 2243   } else {
 2244     switch (ideal_reg) {
 2245       case Op_VecS: return new vecSOper();
 2246       case Op_VecD: return new vecDOper();
 2247       case Op_VecX: return new vecXOper();
 2248       case Op_VecY: return new vecYOper();
 2249       case Op_VecZ: return new vecZOper();
 2250     }
 2251   }
 2252   ShouldNotReachHere();
 2253   return nullptr;
 2254 }
 2255 
 2256 bool Matcher::is_reg2reg_move(MachNode* m) {
 2257   switch (m->rule()) {
 2258     case MoveVec2Leg_rule:
 2259     case MoveLeg2Vec_rule:
 2260     case MoveF2VL_rule:
 2261     case MoveF2LEG_rule:
 2262     case MoveVL2F_rule:
 2263     case MoveLEG2F_rule:
 2264     case MoveD2VL_rule:
 2265     case MoveD2LEG_rule:
 2266     case MoveVL2D_rule:
 2267     case MoveLEG2D_rule:
 2268       return true;
 2269     default:
 2270       return false;
 2271   }
 2272 }
 2273 
 2274 bool Matcher::is_generic_vector(MachOper* opnd) {
 2275   switch (opnd->opcode()) {
 2276     case VEC:
 2277     case LEGVEC:
 2278       return true;
 2279     default:
 2280       return false;
 2281   }
 2282 }
 2283 
 2284 //------------------------------------------------------------------------
 2285 
 2286 const RegMask* Matcher::predicate_reg_mask(void) {
 2287   return &_VECTMASK_REG_mask;
 2288 }
 2289 
 2290 // Max vector size in bytes. 0 if not supported.
 2291 int Matcher::vector_width_in_bytes(BasicType bt) {
 2292   assert(is_java_primitive(bt), "only primitive type vectors");
 2293   if (UseSSE < 2) return 0;
 2294   // SSE2 supports 128bit vectors for all types.
 2295   // AVX2 supports 256bit vectors for all types.
 2296   // AVX512/EVEX supports 512bit vectors for all types.
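        // With UseAVX == 2 the formula below yields 32 bytes (256-bit vectors);
        // with UseAVX == 3 (EVEX) it yields 64 bytes (512-bit vectors).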
 2297   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2298   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2299   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2300     size = (UseAVX > 2) ? 64 : 32;
 2301   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2302     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2303   // Use flag to limit vector size.
 2304   size = MIN2(size,(int)MaxVectorSize);
 2305   // Minimum 2 values in vector (or 4 for bytes).
 2306   switch (bt) {
 2307   case T_DOUBLE:
 2308   case T_LONG:
 2309     if (size < 16) return 0;
 2310     break;
 2311   case T_FLOAT:
 2312   case T_INT:
 2313     if (size < 8) return 0;
 2314     break;
 2315   case T_BOOLEAN:
 2316     if (size < 4) return 0;
 2317     break;
 2318   case T_CHAR:
 2319     if (size < 4) return 0;
 2320     break;
 2321   case T_BYTE:
 2322     if (size < 4) return 0;
 2323     break;
 2324   case T_SHORT:
 2325     if (size < 4) return 0;
 2326     break;
 2327   default:
 2328     ShouldNotReachHere();
 2329   }
 2330   return size;
 2331 }
 2332 
 2333 // Limits on vector size (number of elements) loaded into vector.
 2334 int Matcher::max_vector_size(const BasicType bt) {
 2335   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2336 }
 2337 int Matcher::min_vector_size(const BasicType bt) {
 2338   int max_size = max_vector_size(bt);
 2339   // Min size which can be loaded into vector is 4 bytes.
 2340   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2341   // Support for calling svml double64 vectors
 2342   if (bt == T_DOUBLE) {
 2343     size = 1;
 2344   }
 2345   return MIN2(size,max_size);
 2346 }
 2347 
 2348 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2349   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2350   // by default on Cascade Lake
 2351   if (VM_Version::is_default_intel_cascade_lake()) {
 2352     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2353   }
 2354   return Matcher::max_vector_size(bt);
 2355 }
 2356 
 2357 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2358   return -1;
 2359 }
 2360 
 2361 // Vector ideal reg corresponding to specified size in bytes
 2362 uint Matcher::vector_ideal_reg(int size) {
 2363   assert(MaxVectorSize >= size, "");
 2364   switch(size) {
 2365     case  4: return Op_VecS;
 2366     case  8: return Op_VecD;
 2367     case 16: return Op_VecX;
 2368     case 32: return Op_VecY;
 2369     case 64: return Op_VecZ;
 2370   }
 2371   ShouldNotReachHere();
 2372   return 0;
 2373 }
 2374 
 2375 // Check for shift by small constant as well
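      // (x86 addressing modes can scale an index register by 1, 2, 4 or 8, so only
      // shift counts of at most 3 can be folded into an address expression.)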
 2376 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2377   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2378       shift->in(2)->get_int() <= 3 &&
 2379       // Are there other uses besides address expressions?
 2380       !matcher->is_visited(shift)) {
 2381     address_visited.set(shift->_idx); // Flag as address_visited
 2382     mstack.push(shift->in(2), Matcher::Visit);
 2383     Node *conv = shift->in(1);
 2384 #ifdef _LP64
 2385     // Allow the Matcher to match the rule which bypasses the
 2386     // ConvI2L operation for an array index on LP64
 2387     // when the index value is known to be non-negative.
 2388     if (conv->Opcode() == Op_ConvI2L &&
 2389         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2390         // Are there other uses besides address expressions?
 2391         !matcher->is_visited(conv)) {
 2392       address_visited.set(conv->_idx); // Flag as address_visited
 2393       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2394     } else
 2395 #endif
 2396       mstack.push(conv, Matcher::Pre_Visit);
 2397     return true;
 2398   }
 2399   return false;
 2400 }
 2401 
 2402 // This function identifies sub-graphs in which a 'load' node is
 2403 // an input to two different nodes, such that the sub-graph can be
 2404 // matched with BMI instructions like blsi, blsr, etc.
 2405 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2406 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2407 // refers to the same node.
 2408 //
 2409 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2410 // This is a temporary solution until we make DAGs expressible in ADL.
 2411 template<typename ConType>
 2412 class FusedPatternMatcher {
 2413   Node* _op1_node;
 2414   Node* _mop_node;
 2415   int _con_op;
 2416 
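        // Returns the input index (1 or 2) of 'n' whose opcode is 'next_op'. If
        // 'next_op_idx' is -1 the operation is treated as commutative and both inputs
        // are tried; otherwise only that input is checked. Returns -1 if there is no match.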
 2417   static int match_next(Node* n, int next_op, int next_op_idx) {
 2418     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2419       return -1;
 2420     }
 2421 
 2422     if (next_op_idx == -1) { // n is commutative, try rotations
 2423       if (n->in(1)->Opcode() == next_op) {
 2424         return 1;
 2425       } else if (n->in(2)->Opcode() == next_op) {
 2426         return 2;
 2427       }
 2428     } else {
 2429       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2430       if (n->in(next_op_idx)->Opcode() == next_op) {
 2431         return next_op_idx;
 2432       }
 2433     }
 2434     return -1;
 2435   }
 2436 
 2437  public:
 2438   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2439     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2440 
 2441   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2442              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2443              typename ConType::NativeType con_value) {
 2444     if (_op1_node->Opcode() != op1) {
 2445       return false;
 2446     }
 2447     if (_mop_node->outcnt() > 2) {
 2448       return false;
 2449     }
 2450     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2451     if (op1_op2_idx == -1) {
 2452       return false;
 2453     }
 2454     // Memory operation must be the other edge
 2455     int op1_mop_idx = (op1_op2_idx & 1) + 1;
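          // (op1_op2_idx is either 1 or 2, so (idx & 1) + 1 maps 1 -> 2 and 2 -> 1.)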
 2456 
 2457     // Check that the mop node is really what we want
 2458     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2459       Node* op2_node = _op1_node->in(op1_op2_idx);
 2460       if (op2_node->outcnt() > 1) {
 2461         return false;
 2462       }
 2463       assert(op2_node->Opcode() == op2, "Should be");
 2464       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2465       if (op2_con_idx == -1) {
 2466         return false;
 2467       }
 2468       // Memory operation must be the other edge
 2469       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2470       // Check that the memory operation is the same node
 2471       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2472         // Now check the constant
 2473         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2474         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2475           return true;
 2476         }
 2477       }
 2478     }
 2479     return false;
 2480   }
 2481 };
 2482 
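      // The patterns matched below correspond to the BMI1 bit-manipulation idioms:
      //   blsi:   x & -x       == (And (Sub 0 load) load)
      //   blsr:   x & (x - 1)  == (And (Add load -1) load)
      //   blsmsk: x ^ (x - 1)  == (Xor (Add load -1) load)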
 2483 static bool is_bmi_pattern(Node* n, Node* m) {
 2484   assert(UseBMI1Instructions, "sanity");
 2485   if (n != nullptr && m != nullptr) {
 2486     if (m->Opcode() == Op_LoadI) {
 2487       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2488       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2489              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2490              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2491     } else if (m->Opcode() == Op_LoadL) {
 2492       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2493       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2494              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2495              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2496     }
 2497   }
 2498   return false;
 2499 }
 2500 
 2501 // Should the matcher clone input 'm' of node 'n'?
 2502 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2503   // If 'n' and 'm' are part of a graph for a BMI instruction, clone the input 'm'.
 2504   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2505     mstack.push(m, Visit);
 2506     return true;
 2507   }
 2508   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2509     mstack.push(m, Visit);           // m = ShiftCntV
 2510     return true;
 2511   }
 2512   if (is_encode_and_store_pattern(n, m)) {
 2513     mstack.push(m, Visit);
 2514     return true;
 2515   }
 2516   return false;
 2517 }
 2518 
 2519 // Should the Matcher clone shifts on addressing modes, expecting them
 2520 // to be subsumed into complex addressing expressions or compute them
 2521 // into registers?
 2522 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2523   Node *off = m->in(AddPNode::Offset);
 2524   if (off->is_Con()) {
 2525     address_visited.test_set(m->_idx); // Flag as address_visited
 2526     Node *adr = m->in(AddPNode::Address);
 2527 
 2528     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2529     // AtomicAdd is not an addressing expression.
 2530     // Cheap to find it by looking for screwy base.
 2531     if (adr->is_AddP() &&
 2532         !adr->in(AddPNode::Base)->is_top() &&
 2533         !adr->in(AddPNode::Offset)->is_Con() &&
 2534         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2535         // Are there other uses besides address expressions?
 2536         !is_visited(adr)) {
 2537       address_visited.set(adr->_idx); // Flag as address_visited
 2538       Node *shift = adr->in(AddPNode::Offset);
 2539       if (!clone_shift(shift, this, mstack, address_visited)) {
 2540         mstack.push(shift, Pre_Visit);
 2541       }
 2542       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2543       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2544     } else {
 2545       mstack.push(adr, Pre_Visit);
 2546     }
 2547 
 2548     // Clone X+offset as it also folds into most addressing expressions
 2549     mstack.push(off, Visit);
 2550     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2551     return true;
 2552   } else if (clone_shift(off, this, mstack, address_visited)) {
 2553     address_visited.test_set(m->_idx); // Flag as address_visited
 2554     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2555     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2556     return true;
 2557   }
 2558   return false;
 2559 }
 2560 
 2561 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2562   switch (bt) {
 2563     case BoolTest::eq:
 2564       return Assembler::eq;
 2565     case BoolTest::ne:
 2566       return Assembler::neq;
 2567     case BoolTest::le:
 2568     case BoolTest::ule:
 2569       return Assembler::le;
 2570     case BoolTest::ge:
 2571     case BoolTest::uge:
 2572       return Assembler::nlt;
 2573     case BoolTest::lt:
 2574     case BoolTest::ult:
 2575       return Assembler::lt;
 2576     case BoolTest::gt:
 2577     case BoolTest::ugt:
 2578       return Assembler::nle;
 2579     default : ShouldNotReachHere(); return Assembler::_false;
 2580   }
 2581 }
 2582 
 2583 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2584   switch (bt) {
 2585   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2586   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2587   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2588   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2589   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2590   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2591   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2592   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2593   }
 2594 }
 2595 
 2596 // Helper methods for MachSpillCopyNode::implementation().
 2597 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2598                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2599   assert(ireg == Op_VecS || // 32bit vector
 2600          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2601           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2602          "no non-adjacent vector moves" );
 2603   if (masm) {
 2604     switch (ireg) {
 2605     case Op_VecS: // copy whole register
 2606     case Op_VecD:
 2607     case Op_VecX:
 2608 #ifndef _LP64
 2609       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2610 #else
 2611       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2612         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2613       } else {
 2614         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2615       }
 2616 #endif
 2617       break;
 2618     case Op_VecY:
 2619 #ifndef _LP64
 2620       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2621 #else
 2622       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2623         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2624       } else {
 2625         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2626       }
 2627 #endif
 2628       break;
 2629     case Op_VecZ:
 2630       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2631       break;
 2632     default:
 2633       ShouldNotReachHere();
 2634     }
 2635 #ifndef PRODUCT
 2636   } else {
 2637     switch (ireg) {
 2638     case Op_VecS:
 2639     case Op_VecD:
 2640     case Op_VecX:
 2641       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2642       break;
 2643     case Op_VecY:
 2644     case Op_VecZ:
 2645       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2646       break;
 2647     default:
 2648       ShouldNotReachHere();
 2649     }
 2650 #endif
 2651   }
 2652 }
 2653 
 2654 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2655                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2656   if (masm) {
 2657     if (is_load) {
 2658       switch (ireg) {
 2659       case Op_VecS:
 2660         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2661         break;
 2662       case Op_VecD:
 2663         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2664         break;
 2665       case Op_VecX:
 2666 #ifndef _LP64
 2667         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2668 #else
 2669         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2670           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2671         } else {
 2672           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2673           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2674         }
 2675 #endif
 2676         break;
 2677       case Op_VecY:
 2678 #ifndef _LP64
 2679         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2680 #else
 2681         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2682           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2683         } else {
 2684           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2685           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2686         }
 2687 #endif
 2688         break;
 2689       case Op_VecZ:
 2690         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2691         break;
 2692       default:
 2693         ShouldNotReachHere();
 2694       }
 2695     } else { // store
 2696       switch (ireg) {
 2697       case Op_VecS:
 2698         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2699         break;
 2700       case Op_VecD:
 2701         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2702         break;
 2703       case Op_VecX:
 2704 #ifndef _LP64
 2705         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2706 #else
 2707         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2708           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2709         }
 2710         else {
 2711           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2712         }
 2713 #endif
 2714         break;
 2715       case Op_VecY:
 2716 #ifndef _LP64
 2717         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2718 #else
 2719         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2720           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2721         }
 2722         else {
 2723           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2724         }
 2725 #endif
 2726         break;
 2727       case Op_VecZ:
 2728         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2729         break;
 2730       default:
 2731         ShouldNotReachHere();
 2732       }
 2733     }
 2734 #ifndef PRODUCT
 2735   } else {
 2736     if (is_load) {
 2737       switch (ireg) {
 2738       case Op_VecS:
 2739         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2740         break;
 2741       case Op_VecD:
 2742         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2743         break;
 2744        case Op_VecX:
 2745         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2746         break;
 2747       case Op_VecY:
 2748       case Op_VecZ:
 2749         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2750         break;
 2751       default:
 2752         ShouldNotReachHere();
 2753       }
 2754     } else { // store
 2755       switch (ireg) {
 2756       case Op_VecS:
 2757         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2758         break;
 2759       case Op_VecD:
 2760         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2761         break;
 2762        case Op_VecX:
 2763         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2764         break;
 2765       case Op_VecY:
 2766       case Op_VecZ:
 2767         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2768         break;
 2769       default:
 2770         ShouldNotReachHere();
 2771       }
 2772     }
 2773 #endif
 2774   }
 2775 }
 2776 
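      // Builds a byte-for-byte image of a vector holding 'len' copies of the
      // constant 'con', laid out with element type 'bt'.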
 2777 template <class T>
 2778 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2779   int size = type2aelembytes(bt) * len;
 2780   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2781   for (int i = 0; i < len; i++) {
 2782     int offset = i * type2aelembytes(bt);
 2783     switch (bt) {
 2784       case T_BYTE: val->at(i) = con; break;
 2785       case T_SHORT: {
 2786         jshort c = con;
 2787         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2788         break;
 2789       }
 2790       case T_INT: {
 2791         jint c = con;
 2792         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2793         break;
 2794       }
 2795       case T_LONG: {
 2796         jlong c = con;
 2797         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2798         break;
 2799       }
 2800       case T_FLOAT: {
 2801         jfloat c = con;
 2802         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2803         break;
 2804       }
 2805       case T_DOUBLE: {
 2806         jdouble c = con;
 2807         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2808         break;
 2809       }
 2810       default: assert(false, "%s", type2name(bt));
 2811     }
 2812   }
 2813   return val;
 2814 }
 2815 
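      // Returns a 64-bit pattern in which only the sign (most significant) bit of
      // each 'bt'-sized lane is set, e.g. 0x80 repeated for bytes.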
 2816 static inline jlong high_bit_set(BasicType bt) {
 2817   switch (bt) {
 2818     case T_BYTE:  return 0x8080808080808080;
 2819     case T_SHORT: return 0x8000800080008000;
 2820     case T_INT:   return 0x8000000080000000;
 2821     case T_LONG:  return 0x8000000000000000;
 2822     default:
 2823       ShouldNotReachHere();
 2824       return 0;
 2825   }
 2826 }
 2827 
 2828 #ifndef PRODUCT
 2829   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2830     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2831   }
 2832 #endif
 2833 
 2834   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2835     __ nop(_count);
 2836   }
 2837 
 2838   uint MachNopNode::size(PhaseRegAlloc*) const {
 2839     return _count;
 2840   }
 2841 
 2842 #ifndef PRODUCT
 2843   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2844     st->print("# breakpoint");
 2845   }
 2846 #endif
 2847 
 2848   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2849     __ int3();
 2850   }
 2851 
 2852   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2853     return MachNode::size(ra_);
 2854   }
 2855 
 2856 %}
 2857 
 2858 encode %{
 2859 
 2860   enc_class call_epilog %{
 2861     if (VerifyStackAtCalls) {
 2862       // Check that stack depth is unchanged: find majik cookie on stack
 2863       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2864       Label L;
 2865       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2866       __ jccb(Assembler::equal, L);
 2867       // Die if stack mismatch
 2868       __ int3();
 2869       __ bind(L);
 2870     }
 2871     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2872       // The last return value is not set by the callee but used to pass IsInit information to compiled code.
 2873       // Search for the corresponding projection, get the register and emit code that initializes it.
 2874       uint con = (tf()->range_cc()->cnt() - 1);
 2875       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2876         ProjNode* proj = fast_out(i)->as_Proj();
 2877         if (proj->_con == con) {
 2878           // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2879           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2880           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2881           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2882           __ testq(rax, rax);
 2883           __ setb(Assembler::notZero, toReg);
 2884           __ movzbl(toReg, toReg);
 2885           if (reg->is_stack()) {
 2886             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2887             __ movq(Address(rsp, st_off), toReg);
 2888           }
 2889           break;
 2890         }
 2891       }
 2892       if (return_value_is_used()) {
 2893         // An inline type is returned as fields in multiple registers.
 2894         // rax either contains an oop (if the inline type is buffered) or a pointer
 2895         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2896         // when the lowest bit is set so that C2 can use the oop after null checking.
 2897         // rax &= (rax & 1) - 1
 2898         __ movptr(rscratch1, rax);
 2899         __ andptr(rscratch1, 0x1);
 2900         __ subptr(rscratch1, 0x1);
 2901         __ andptr(rax, rscratch1);
 2902       }
 2903     }
 2904   %}
 2905 
 2906 %}
 2907 
 2908 // Operands for bound floating-point register arguments
 2909 operand rxmm0() %{
 2910   constraint(ALLOC_IN_RC(xmm0_reg));
 2911   match(VecX);
 2912   format %{ %}
 2913   interface(REG_INTER);
 2914 %}
 2915 
 2916 //----------OPERANDS-----------------------------------------------------------
 2917 // Operand definitions must precede instruction definitions for correct parsing
 2918 // in the ADLC because operands constitute user-defined types which are used in
 2919 // instruction definitions.
 2920 
 2921 // Vectors
 2922 
 2923 // Dummy generic vector class. Should be used for all vector operands.
 2924 // Replaced with vec[SDXYZ] during post-selection pass.
 2925 operand vec() %{
 2926   constraint(ALLOC_IN_RC(dynamic));
 2927   match(VecX);
 2928   match(VecY);
 2929   match(VecZ);
 2930   match(VecS);
 2931   match(VecD);
 2932 
 2933   format %{ %}
 2934   interface(REG_INTER);
 2935 %}
 2936 
 2937 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2938 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2939 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2940 // runtime code generation via reg_class_dynamic.
 2941 operand legVec() %{
 2942   constraint(ALLOC_IN_RC(dynamic));
 2943   match(VecX);
 2944   match(VecY);
 2945   match(VecZ);
 2946   match(VecS);
 2947   match(VecD);
 2948 
 2949   format %{ %}
 2950   interface(REG_INTER);
 2951 %}
 2952 
 2953 // Replaces vec during post-selection cleanup. See above.
 2954 operand vecS() %{
 2955   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2956   match(VecS);
 2957 
 2958   format %{ %}
 2959   interface(REG_INTER);
 2960 %}
 2961 
 2962 // Replaces legVec during post-selection cleanup. See above.
 2963 operand legVecS() %{
 2964   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2965   match(VecS);
 2966 
 2967   format %{ %}
 2968   interface(REG_INTER);
 2969 %}
 2970 
 2971 // Replaces vec during post-selection cleanup. See above.
 2972 operand vecD() %{
 2973   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2974   match(VecD);
 2975 
 2976   format %{ %}
 2977   interface(REG_INTER);
 2978 %}
 2979 
 2980 // Replaces legVec during post-selection cleanup. See above.
 2981 operand legVecD() %{
 2982   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2983   match(VecD);
 2984 
 2985   format %{ %}
 2986   interface(REG_INTER);
 2987 %}
 2988 
 2989 // Replaces vec during post-selection cleanup. See above.
 2990 operand vecX() %{
 2991   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2992   match(VecX);
 2993 
 2994   format %{ %}
 2995   interface(REG_INTER);
 2996 %}
 2997 
 2998 // Replaces legVec during post-selection cleanup. See above.
 2999 operand legVecX() %{
 3000   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 3001   match(VecX);
 3002 
 3003   format %{ %}
 3004   interface(REG_INTER);
 3005 %}
 3006 
 3007 // Replaces vec during post-selection cleanup. See above.
 3008 operand vecY() %{
 3009   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 3010   match(VecY);
 3011 
 3012   format %{ %}
 3013   interface(REG_INTER);
 3014 %}
 3015 
 3016 // Replaces legVec during post-selection cleanup. See above.
 3017 operand legVecY() %{
 3018   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 3019   match(VecY);
 3020 
 3021   format %{ %}
 3022   interface(REG_INTER);
 3023 %}
 3024 
 3025 // Replaces vec during post-selection cleanup. See above.
 3026 operand vecZ() %{
 3027   constraint(ALLOC_IN_RC(vectorz_reg));
 3028   match(VecZ);
 3029 
 3030   format %{ %}
 3031   interface(REG_INTER);
 3032 %}
 3033 
 3034 // Replaces legVec during post-selection cleanup. See above.
 3035 operand legVecZ() %{
 3036   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 3037   match(VecZ);
 3038 
 3039   format %{ %}
 3040   interface(REG_INTER);
 3041 %}
 3042 
 3043 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 3044 
 3045 // ============================================================================
 3046 
 3047 instruct ShouldNotReachHere() %{
 3048   match(Halt);
 3049   format %{ "stop\t# ShouldNotReachHere" %}
 3050   ins_encode %{
 3051     if (is_reachable()) {
 3052       __ stop(_halt_reason);
 3053     }
 3054   %}
 3055   ins_pipe(pipe_slow);
 3056 %}
 3057 
 3058 // ============================================================================
 3059 
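      // Scalar single- and double-precision arithmetic. Each operation comes in two
      // flavors selected by the predicate: a two-operand SSE form where dst is also a
      // source (addss/addsd, ...) and a three-operand non-destructive AVX form
      // (vaddss/vaddsd, ...), each with register, memory and constant-table variants.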
 3060 instruct addF_reg(regF dst, regF src) %{
 3061   predicate((UseSSE>=1) && (UseAVX == 0));
 3062   match(Set dst (AddF dst src));
 3063 
 3064   format %{ "addss   $dst, $src" %}
 3065   ins_cost(150);
 3066   ins_encode %{
 3067     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3068   %}
 3069   ins_pipe(pipe_slow);
 3070 %}
 3071 
 3072 instruct addF_mem(regF dst, memory src) %{
 3073   predicate((UseSSE>=1) && (UseAVX == 0));
 3074   match(Set dst (AddF dst (LoadF src)));
 3075 
 3076   format %{ "addss   $dst, $src" %}
 3077   ins_cost(150);
 3078   ins_encode %{
 3079     __ addss($dst$$XMMRegister, $src$$Address);
 3080   %}
 3081   ins_pipe(pipe_slow);
 3082 %}
 3083 
 3084 instruct addF_imm(regF dst, immF con) %{
 3085   predicate((UseSSE>=1) && (UseAVX == 0));
 3086   match(Set dst (AddF dst con));
 3087   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3088   ins_cost(150);
 3089   ins_encode %{
 3090     __ addss($dst$$XMMRegister, $constantaddress($con));
 3091   %}
 3092   ins_pipe(pipe_slow);
 3093 %}
 3094 
 3095 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3096   predicate(UseAVX > 0);
 3097   match(Set dst (AddF src1 src2));
 3098 
 3099   format %{ "vaddss  $dst, $src1, $src2" %}
 3100   ins_cost(150);
 3101   ins_encode %{
 3102     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3103   %}
 3104   ins_pipe(pipe_slow);
 3105 %}
 3106 
 3107 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3108   predicate(UseAVX > 0);
 3109   match(Set dst (AddF src1 (LoadF src2)));
 3110 
 3111   format %{ "vaddss  $dst, $src1, $src2" %}
 3112   ins_cost(150);
 3113   ins_encode %{
 3114     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3115   %}
 3116   ins_pipe(pipe_slow);
 3117 %}
 3118 
 3119 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3120   predicate(UseAVX > 0);
 3121   match(Set dst (AddF src con));
 3122 
 3123   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3124   ins_cost(150);
 3125   ins_encode %{
 3126     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3127   %}
 3128   ins_pipe(pipe_slow);
 3129 %}
 3130 
 3131 instruct addD_reg(regD dst, regD src) %{
 3132   predicate((UseSSE>=2) && (UseAVX == 0));
 3133   match(Set dst (AddD dst src));
 3134 
 3135   format %{ "addsd   $dst, $src" %}
 3136   ins_cost(150);
 3137   ins_encode %{
 3138     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3139   %}
 3140   ins_pipe(pipe_slow);
 3141 %}
 3142 
 3143 instruct addD_mem(regD dst, memory src) %{
 3144   predicate((UseSSE>=2) && (UseAVX == 0));
 3145   match(Set dst (AddD dst (LoadD src)));
 3146 
 3147   format %{ "addsd   $dst, $src" %}
 3148   ins_cost(150);
 3149   ins_encode %{
 3150     __ addsd($dst$$XMMRegister, $src$$Address);
 3151   %}
 3152   ins_pipe(pipe_slow);
 3153 %}
 3154 
 3155 instruct addD_imm(regD dst, immD con) %{
 3156   predicate((UseSSE>=2) && (UseAVX == 0));
 3157   match(Set dst (AddD dst con));
 3158   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3159   ins_cost(150);
 3160   ins_encode %{
 3161     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3162   %}
 3163   ins_pipe(pipe_slow);
 3164 %}
 3165 
 3166 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3167   predicate(UseAVX > 0);
 3168   match(Set dst (AddD src1 src2));
 3169 
 3170   format %{ "vaddsd  $dst, $src1, $src2" %}
 3171   ins_cost(150);
 3172   ins_encode %{
 3173     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3174   %}
 3175   ins_pipe(pipe_slow);
 3176 %}
 3177 
 3178 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3179   predicate(UseAVX > 0);
 3180   match(Set dst (AddD src1 (LoadD src2)));
 3181 
 3182   format %{ "vaddsd  $dst, $src1, $src2" %}
 3183   ins_cost(150);
 3184   ins_encode %{
 3185     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3186   %}
 3187   ins_pipe(pipe_slow);
 3188 %}
 3189 
 3190 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3191   predicate(UseAVX > 0);
 3192   match(Set dst (AddD src con));
 3193 
 3194   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3195   ins_cost(150);
 3196   ins_encode %{
 3197     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3198   %}
 3199   ins_pipe(pipe_slow);
 3200 %}
 3201 
 3202 instruct subF_reg(regF dst, regF src) %{
 3203   predicate((UseSSE>=1) && (UseAVX == 0));
 3204   match(Set dst (SubF dst src));
 3205 
 3206   format %{ "subss   $dst, $src" %}
 3207   ins_cost(150);
 3208   ins_encode %{
 3209     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3210   %}
 3211   ins_pipe(pipe_slow);
 3212 %}
 3213 
 3214 instruct subF_mem(regF dst, memory src) %{
 3215   predicate((UseSSE>=1) && (UseAVX == 0));
 3216   match(Set dst (SubF dst (LoadF src)));
 3217 
 3218   format %{ "subss   $dst, $src" %}
 3219   ins_cost(150);
 3220   ins_encode %{
 3221     __ subss($dst$$XMMRegister, $src$$Address);
 3222   %}
 3223   ins_pipe(pipe_slow);
 3224 %}
 3225 
 3226 instruct subF_imm(regF dst, immF con) %{
 3227   predicate((UseSSE>=1) && (UseAVX == 0));
 3228   match(Set dst (SubF dst con));
 3229   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3230   ins_cost(150);
 3231   ins_encode %{
 3232     __ subss($dst$$XMMRegister, $constantaddress($con));
 3233   %}
 3234   ins_pipe(pipe_slow);
 3235 %}
 3236 
 3237 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3238   predicate(UseAVX > 0);
 3239   match(Set dst (SubF src1 src2));
 3240 
 3241   format %{ "vsubss  $dst, $src1, $src2" %}
 3242   ins_cost(150);
 3243   ins_encode %{
 3244     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3245   %}
 3246   ins_pipe(pipe_slow);
 3247 %}
 3248 
 3249 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3250   predicate(UseAVX > 0);
 3251   match(Set dst (SubF src1 (LoadF src2)));
 3252 
 3253   format %{ "vsubss  $dst, $src1, $src2" %}
 3254   ins_cost(150);
 3255   ins_encode %{
 3256     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3257   %}
 3258   ins_pipe(pipe_slow);
 3259 %}
 3260 
 3261 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3262   predicate(UseAVX > 0);
 3263   match(Set dst (SubF src con));
 3264 
 3265   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3266   ins_cost(150);
 3267   ins_encode %{
 3268     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3269   %}
 3270   ins_pipe(pipe_slow);
 3271 %}
 3272 
 3273 instruct subD_reg(regD dst, regD src) %{
 3274   predicate((UseSSE>=2) && (UseAVX == 0));
 3275   match(Set dst (SubD dst src));
 3276 
 3277   format %{ "subsd   $dst, $src" %}
 3278   ins_cost(150);
 3279   ins_encode %{
 3280     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3281   %}
 3282   ins_pipe(pipe_slow);
 3283 %}
 3284 
 3285 instruct subD_mem(regD dst, memory src) %{
 3286   predicate((UseSSE>=2) && (UseAVX == 0));
 3287   match(Set dst (SubD dst (LoadD src)));
 3288 
 3289   format %{ "subsd   $dst, $src" %}
 3290   ins_cost(150);
 3291   ins_encode %{
 3292     __ subsd($dst$$XMMRegister, $src$$Address);
 3293   %}
 3294   ins_pipe(pipe_slow);
 3295 %}
 3296 
 3297 instruct subD_imm(regD dst, immD con) %{
 3298   predicate((UseSSE>=2) && (UseAVX == 0));
 3299   match(Set dst (SubD dst con));
 3300   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3301   ins_cost(150);
 3302   ins_encode %{
 3303     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3304   %}
 3305   ins_pipe(pipe_slow);
 3306 %}
 3307 
 3308 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3309   predicate(UseAVX > 0);
 3310   match(Set dst (SubD src1 src2));
 3311 
 3312   format %{ "vsubsd  $dst, $src1, $src2" %}
 3313   ins_cost(150);
 3314   ins_encode %{
 3315     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3316   %}
 3317   ins_pipe(pipe_slow);
 3318 %}
 3319 
 3320 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3321   predicate(UseAVX > 0);
 3322   match(Set dst (SubD src1 (LoadD src2)));
 3323 
 3324   format %{ "vsubsd  $dst, $src1, $src2" %}
 3325   ins_cost(150);
 3326   ins_encode %{
 3327     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3328   %}
 3329   ins_pipe(pipe_slow);
 3330 %}
 3331 
 3332 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3333   predicate(UseAVX > 0);
 3334   match(Set dst (SubD src con));
 3335 
 3336   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3337   ins_cost(150);
 3338   ins_encode %{
 3339     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3340   %}
 3341   ins_pipe(pipe_slow);
 3342 %}
 3343 
 3344 instruct mulF_reg(regF dst, regF src) %{
 3345   predicate((UseSSE>=1) && (UseAVX == 0));
 3346   match(Set dst (MulF dst src));
 3347 
 3348   format %{ "mulss   $dst, $src" %}
 3349   ins_cost(150);
 3350   ins_encode %{
 3351     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3352   %}
 3353   ins_pipe(pipe_slow);
 3354 %}
 3355 
 3356 instruct mulF_mem(regF dst, memory src) %{
 3357   predicate((UseSSE>=1) && (UseAVX == 0));
 3358   match(Set dst (MulF dst (LoadF src)));
 3359 
 3360   format %{ "mulss   $dst, $src" %}
 3361   ins_cost(150);
 3362   ins_encode %{
 3363     __ mulss($dst$$XMMRegister, $src$$Address);
 3364   %}
 3365   ins_pipe(pipe_slow);
 3366 %}
 3367 
 3368 instruct mulF_imm(regF dst, immF con) %{
 3369   predicate((UseSSE>=1) && (UseAVX == 0));
 3370   match(Set dst (MulF dst con));
 3371   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3372   ins_cost(150);
 3373   ins_encode %{
 3374     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3375   %}
 3376   ins_pipe(pipe_slow);
 3377 %}
 3378 
 3379 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3380   predicate(UseAVX > 0);
 3381   match(Set dst (MulF src1 src2));
 3382 
 3383   format %{ "vmulss  $dst, $src1, $src2" %}
 3384   ins_cost(150);
 3385   ins_encode %{
 3386     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3387   %}
 3388   ins_pipe(pipe_slow);
 3389 %}
 3390 
 3391 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3392   predicate(UseAVX > 0);
 3393   match(Set dst (MulF src1 (LoadF src2)));
 3394 
 3395   format %{ "vmulss  $dst, $src1, $src2" %}
 3396   ins_cost(150);
 3397   ins_encode %{
 3398     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3399   %}
 3400   ins_pipe(pipe_slow);
 3401 %}
 3402 
 3403 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3404   predicate(UseAVX > 0);
 3405   match(Set dst (MulF src con));
 3406 
 3407   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3408   ins_cost(150);
 3409   ins_encode %{
 3410     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3411   %}
 3412   ins_pipe(pipe_slow);
 3413 %}
 3414 
 3415 instruct mulD_reg(regD dst, regD src) %{
 3416   predicate((UseSSE>=2) && (UseAVX == 0));
 3417   match(Set dst (MulD dst src));
 3418 
 3419   format %{ "mulsd   $dst, $src" %}
 3420   ins_cost(150);
 3421   ins_encode %{
 3422     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3423   %}
 3424   ins_pipe(pipe_slow);
 3425 %}
 3426 
 3427 instruct mulD_mem(regD dst, memory src) %{
 3428   predicate((UseSSE>=2) && (UseAVX == 0));
 3429   match(Set dst (MulD dst (LoadD src)));
 3430 
 3431   format %{ "mulsd   $dst, $src" %}
 3432   ins_cost(150);
 3433   ins_encode %{
 3434     __ mulsd($dst$$XMMRegister, $src$$Address);
 3435   %}
 3436   ins_pipe(pipe_slow);
 3437 %}
 3438 
 3439 instruct mulD_imm(regD dst, immD con) %{
 3440   predicate((UseSSE>=2) && (UseAVX == 0));
 3441   match(Set dst (MulD dst con));
 3442   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3443   ins_cost(150);
 3444   ins_encode %{
 3445     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3446   %}
 3447   ins_pipe(pipe_slow);
 3448 %}
 3449 
 3450 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3451   predicate(UseAVX > 0);
 3452   match(Set dst (MulD src1 src2));
 3453 
 3454   format %{ "vmulsd  $dst, $src1, $src2" %}
 3455   ins_cost(150);
 3456   ins_encode %{
 3457     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3458   %}
 3459   ins_pipe(pipe_slow);
 3460 %}
 3461 
 3462 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3463   predicate(UseAVX > 0);
 3464   match(Set dst (MulD src1 (LoadD src2)));
 3465 
 3466   format %{ "vmulsd  $dst, $src1, $src2" %}
 3467   ins_cost(150);
 3468   ins_encode %{
 3469     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3470   %}
 3471   ins_pipe(pipe_slow);
 3472 %}
 3473 
 3474 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3475   predicate(UseAVX > 0);
 3476   match(Set dst (MulD src con));
 3477 
 3478   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3479   ins_cost(150);
 3480   ins_encode %{
 3481     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3482   %}
 3483   ins_pipe(pipe_slow);
 3484 %}
 3485 
 3486 instruct divF_reg(regF dst, regF src) %{
 3487   predicate((UseSSE>=1) && (UseAVX == 0));
 3488   match(Set dst (DivF dst src));
 3489 
 3490   format %{ "divss   $dst, $src" %}
 3491   ins_cost(150);
 3492   ins_encode %{
 3493     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3494   %}
 3495   ins_pipe(pipe_slow);
 3496 %}
 3497 
 3498 instruct divF_mem(regF dst, memory src) %{
 3499   predicate((UseSSE>=1) && (UseAVX == 0));
 3500   match(Set dst (DivF dst (LoadF src)));
 3501 
 3502   format %{ "divss   $dst, $src" %}
 3503   ins_cost(150);
 3504   ins_encode %{
 3505     __ divss($dst$$XMMRegister, $src$$Address);
 3506   %}
 3507   ins_pipe(pipe_slow);
 3508 %}
 3509 
 3510 instruct divF_imm(regF dst, immF con) %{
 3511   predicate((UseSSE>=1) && (UseAVX == 0));
 3512   match(Set dst (DivF dst con));
 3513   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3514   ins_cost(150);
 3515   ins_encode %{
 3516     __ divss($dst$$XMMRegister, $constantaddress($con));
 3517   %}
 3518   ins_pipe(pipe_slow);
 3519 %}
 3520 
 3521 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3522   predicate(UseAVX > 0);
 3523   match(Set dst (DivF src1 src2));
 3524 
 3525   format %{ "vdivss  $dst, $src1, $src2" %}
 3526   ins_cost(150);
 3527   ins_encode %{
 3528     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3529   %}
 3530   ins_pipe(pipe_slow);
 3531 %}
 3532 
 3533 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3534   predicate(UseAVX > 0);
 3535   match(Set dst (DivF src1 (LoadF src2)));
 3536 
 3537   format %{ "vdivss  $dst, $src1, $src2" %}
 3538   ins_cost(150);
 3539   ins_encode %{
 3540     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3541   %}
 3542   ins_pipe(pipe_slow);
 3543 %}
 3544 
 3545 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3546   predicate(UseAVX > 0);
 3547   match(Set dst (DivF src con));
 3548 
 3549   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3550   ins_cost(150);
 3551   ins_encode %{
 3552     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3553   %}
 3554   ins_pipe(pipe_slow);
 3555 %}
 3556 
 3557 instruct divD_reg(regD dst, regD src) %{
 3558   predicate((UseSSE>=2) && (UseAVX == 0));
 3559   match(Set dst (DivD dst src));
 3560 
 3561   format %{ "divsd   $dst, $src" %}
 3562   ins_cost(150);
 3563   ins_encode %{
 3564     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3565   %}
 3566   ins_pipe(pipe_slow);
 3567 %}
 3568 
 3569 instruct divD_mem(regD dst, memory src) %{
 3570   predicate((UseSSE>=2) && (UseAVX == 0));
 3571   match(Set dst (DivD dst (LoadD src)));
 3572 
 3573   format %{ "divsd   $dst, $src" %}
 3574   ins_cost(150);
 3575   ins_encode %{
 3576     __ divsd($dst$$XMMRegister, $src$$Address);
 3577   %}
 3578   ins_pipe(pipe_slow);
 3579 %}
 3580 
 3581 instruct divD_imm(regD dst, immD con) %{
 3582   predicate((UseSSE>=2) && (UseAVX == 0));
 3583   match(Set dst (DivD dst con));
 3584   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3585   ins_cost(150);
 3586   ins_encode %{
 3587     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3588   %}
 3589   ins_pipe(pipe_slow);
 3590 %}
 3591 
 3592 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3593   predicate(UseAVX > 0);
 3594   match(Set dst (DivD src1 src2));
 3595 
 3596   format %{ "vdivsd  $dst, $src1, $src2" %}
 3597   ins_cost(150);
 3598   ins_encode %{
 3599     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3600   %}
 3601   ins_pipe(pipe_slow);
 3602 %}
 3603 
 3604 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3605   predicate(UseAVX > 0);
 3606   match(Set dst (DivD src1 (LoadD src2)));
 3607 
 3608   format %{ "vdivsd  $dst, $src1, $src2" %}
 3609   ins_cost(150);
 3610   ins_encode %{
 3611     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3612   %}
 3613   ins_pipe(pipe_slow);
 3614 %}
 3615 
 3616 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3617   predicate(UseAVX > 0);
 3618   match(Set dst (DivD src con));
 3619 
 3620   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3621   ins_cost(150);
 3622   ins_encode %{
 3623     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3624   %}
 3625   ins_pipe(pipe_slow);
 3626 %}
 3627 
 3628 instruct absF_reg(regF dst) %{
 3629   predicate((UseSSE>=1) && (UseAVX == 0));
 3630   match(Set dst (AbsF dst));
 3631   ins_cost(150);
 3632   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3633   ins_encode %{
 3634     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3635   %}
 3636   ins_pipe(pipe_slow);
 3637 %}
 3638 
 3639 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3640   predicate(UseAVX > 0);
 3641   match(Set dst (AbsF src));
 3642   ins_cost(150);
 3643   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3644   ins_encode %{
 3645     int vlen_enc = Assembler::AVX_128bit;
 3646     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3647               ExternalAddress(float_signmask()), vlen_enc);
 3648   %}
 3649   ins_pipe(pipe_slow);
 3650 %}
 3651 
 3652 instruct absD_reg(regD dst) %{
 3653   predicate((UseSSE>=2) && (UseAVX == 0));
 3654   match(Set dst (AbsD dst));
 3655   ins_cost(150);
 3656   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3657             "# abs double by sign masking" %}
 3658   ins_encode %{
 3659     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3660   %}
 3661   ins_pipe(pipe_slow);
 3662 %}
 3663 
 3664 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3665   predicate(UseAVX > 0);
 3666   match(Set dst (AbsD src));
 3667   ins_cost(150);
 3668   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3669             "# abs double by sign masking" %}
 3670   ins_encode %{
 3671     int vlen_enc = Assembler::AVX_128bit;
 3672     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3673               ExternalAddress(double_signmask()), vlen_enc);
 3674   %}
 3675   ins_pipe(pipe_slow);
 3676 %}
 3677 
 3678 instruct negF_reg(regF dst) %{
 3679   predicate((UseSSE>=1) && (UseAVX == 0));
 3680   match(Set dst (NegF dst));
 3681   ins_cost(150);
 3682   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3683   ins_encode %{
 3684     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3685   %}
 3686   ins_pipe(pipe_slow);
 3687 %}
 3688 
 3689 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3690   predicate(UseAVX > 0);
 3691   match(Set dst (NegF src));
 3692   ins_cost(150);
 3693   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3694   ins_encode %{
 3695     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3696                  ExternalAddress(float_signflip()));
 3697   %}
 3698   ins_pipe(pipe_slow);
 3699 %}
 3700 
 3701 instruct negD_reg(regD dst) %{
 3702   predicate((UseSSE>=2) && (UseAVX == 0));
 3703   match(Set dst (NegD dst));
 3704   ins_cost(150);
 3705   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3706             "# neg double by sign flipping" %}
 3707   ins_encode %{
 3708     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3709   %}
 3710   ins_pipe(pipe_slow);
 3711 %}
 3712 
 3713 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3714   predicate(UseAVX > 0);
 3715   match(Set dst (NegD src));
 3716   ins_cost(150);
 3717   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3718             "# neg double by sign flipping" %}
 3719   ins_encode %{
 3720     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3721                  ExternalAddress(double_signflip()));
 3722   %}
 3723   ins_pipe(pipe_slow);
 3724 %}
 3725 
 3726 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3727 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
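      // (sqrtss writes only the low 32 bits and leaves the upper bits of the destination
      // unchanged, so an uninitialized dst would introduce a false dependency.)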
 3728 instruct sqrtF_reg(regF dst) %{
 3729   predicate(UseSSE>=1);
 3730   match(Set dst (SqrtF dst));
 3731   format %{ "sqrtss  $dst, $dst" %}
 3732   ins_encode %{
 3733     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3734   %}
 3735   ins_pipe(pipe_slow);
 3736 %}
 3737 
 3738 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3739 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3740 instruct sqrtD_reg(regD dst) %{
 3741   predicate(UseSSE>=2);
 3742   match(Set dst (SqrtD dst));
 3743   format %{ "sqrtsd  $dst, $dst" %}
 3744   ins_encode %{
 3745     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3746   %}
 3747   ins_pipe(pipe_slow);
 3748 %}
 3749 
 3750 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3751   effect(TEMP tmp);
 3752   match(Set dst (ConvF2HF src));
 3753   ins_cost(125);
 3754   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3755   ins_encode %{
 3756     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3757   %}
 3758   ins_pipe( pipe_slow );
 3759 %}
 3760 
 3761 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3762   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3763   effect(TEMP ktmp, TEMP rtmp);
 3764   match(Set mem (StoreC mem (ConvF2HF src)));
 3765   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3766   ins_encode %{
 3767     __ movl($rtmp$$Register, 0x1);
 3768     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3769     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3770   %}
 3771   ins_pipe( pipe_slow );
 3772 %}
 3773 
 3774 instruct vconvF2HF(vec dst, vec src) %{
 3775   match(Set dst (VectorCastF2HF src));
 3776   format %{ "vector_conv_F2HF $dst $src" %}
 3777   ins_encode %{
 3778     int vlen_enc = vector_length_encoding(this, $src);
 3779     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3780   %}
 3781   ins_pipe( pipe_slow );
 3782 %}
 3783 
 3784 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3785   predicate(n->as_StoreVector()->memory_size() >= 16);
 3786   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3787   format %{ "vcvtps2ph $mem,$src" %}
 3788   ins_encode %{
 3789     int vlen_enc = vector_length_encoding(this, $src);
 3790     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3791   %}
 3792   ins_pipe( pipe_slow );
 3793 %}
 3794 
 3795 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3796   match(Set dst (ConvHF2F src));
 3797   format %{ "vcvtph2ps $dst,$src" %}
 3798   ins_encode %{
 3799     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3800   %}
 3801   ins_pipe( pipe_slow );
 3802 %}
 3803 
 3804 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3805   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3806   format %{ "vcvtph2ps $dst,$mem" %}
 3807   ins_encode %{
 3808     int vlen_enc = vector_length_encoding(this);
 3809     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3810   %}
 3811   ins_pipe( pipe_slow );
 3812 %}
 3813 
 3814 instruct vconvHF2F(vec dst, vec src) %{
 3815   match(Set dst (VectorCastHF2F src));
 3816   ins_cost(125);
 3817   format %{ "vector_conv_HF2F $dst,$src" %}
 3818   ins_encode %{
 3819     int vlen_enc = vector_length_encoding(this);
 3820     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3821   %}
 3822   ins_pipe( pipe_slow );
 3823 %}
 3824 
 3825 // ---------------------------------------- VectorReinterpret ------------------------------------
 3826 instruct reinterpret_mask(kReg dst) %{
 3827   predicate(n->bottom_type()->isa_vectmask() &&
 3828             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3829   match(Set dst (VectorReinterpret dst));
 3830   ins_cost(125);
 3831   format %{ "vector_reinterpret $dst\t!" %}
 3832   ins_encode %{
 3833     // empty
 3834   %}
 3835   ins_pipe( pipe_slow );
 3836 %}
 3837 
 3838 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3839   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3840             n->bottom_type()->isa_vectmask() &&
 3841             n->in(1)->bottom_type()->isa_vectmask() &&
 3842             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3843             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have equal size in bytes
 3844   match(Set dst (VectorReinterpret src));
 3845   effect(TEMP xtmp);
 3846   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3847   ins_encode %{
 3848     int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_SHORT);
 3849     int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
 3850     assert(src_sz == dst_sz, "src and dst size mismatch");
 3851     int vlen_enc = vector_length_encoding(src_sz);
 3852     __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3853     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3854   %}
 3855   ins_pipe( pipe_slow );
 3856 %}
 3857 
 3858 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3859   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3860             n->bottom_type()->isa_vectmask() &&
 3861             n->in(1)->bottom_type()->isa_vectmask() &&
 3862             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3863              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3864             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have equal size in bytes
 3865   match(Set dst (VectorReinterpret src));
 3866   effect(TEMP xtmp);
 3867   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3868   ins_encode %{
 3869     int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_INT);
 3870     int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
 3871     assert(src_sz == dst_sz, "src and dst size mismatch");
 3872     int vlen_enc = vector_length_encoding(src_sz);
 3873     __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3874     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3875   %}
 3876   ins_pipe( pipe_slow );
 3877 %}
 3878 
 3879 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3880   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3881             n->bottom_type()->isa_vectmask() &&
 3882             n->in(1)->bottom_type()->isa_vectmask() &&
 3883             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3884              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3885             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src have equal size in bytes
 3886   match(Set dst (VectorReinterpret src));
 3887   effect(TEMP xtmp);
 3888   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3889   ins_encode %{
 3890     int src_sz = Matcher::vector_length(this, $src) * type2aelembytes(T_LONG);
 3891     int dst_sz = Matcher::vector_length(this) * type2aelembytes(T_BYTE);
 3892     assert(src_sz == dst_sz, "src and dst size mismatch");
 3893     int vlen_enc = vector_length_encoding(src_sz);
 3894     __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3895     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3896   %}
 3897   ins_pipe( pipe_slow );
 3898 %}
 3899 
 3900 instruct reinterpret(vec dst) %{
 3901   predicate(!n->bottom_type()->isa_vectmask() &&
 3902             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3903   match(Set dst (VectorReinterpret dst));
 3904   ins_cost(125);
 3905   format %{ "vector_reinterpret $dst\t!" %}
 3906   ins_encode %{
 3907     // empty
 3908   %}
 3909   ins_pipe( pipe_slow );
 3910 %}
 3911 
 3912 instruct reinterpret_expand(vec dst, vec src) %{
 3913   predicate(UseAVX == 0 &&
 3914             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3915   match(Set dst (VectorReinterpret src));
 3916   ins_cost(125);
 3917   effect(TEMP dst);
 3918   format %{ "vector_reinterpret_expand $dst,$src" %}
 3919   ins_encode %{
 3920     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3921     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3922 
 3923     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3924     if (src_vlen_in_bytes == 4) {
 3925       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3926     } else {
 3927       assert(src_vlen_in_bytes == 8, "");
 3928       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3929     }
 3930     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3931   %}
 3932   ins_pipe( pipe_slow );
 3933 %}
 3934 
 3935 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3936   predicate(UseAVX > 0 &&
 3937             !n->bottom_type()->isa_vectmask() &&
 3938             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3939             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3940   match(Set dst (VectorReinterpret src));
 3941   ins_cost(125);
 3942   format %{ "vector_reinterpret_expand $dst,$src" %}
 3943   ins_encode %{
 3944     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3945   %}
 3946   ins_pipe( pipe_slow );
 3947 %}
 3948 
 3949 
 3950 instruct vreinterpret_expand(legVec dst, vec src) %{
 3951   predicate(UseAVX > 0 &&
 3952             !n->bottom_type()->isa_vectmask() &&
 3953             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3954             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3955   match(Set dst (VectorReinterpret src));
 3956   ins_cost(125);
 3957   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3958   ins_encode %{
 3959     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3960       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3961       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3962       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3963       default: ShouldNotReachHere();
 3964     }
 3965   %}
 3966   ins_pipe( pipe_slow );
 3967 %}
 3968 
 3969 instruct reinterpret_shrink(vec dst, legVec src) %{
 3970   predicate(!n->bottom_type()->isa_vectmask() &&
 3971             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3972   match(Set dst (VectorReinterpret src));
 3973   ins_cost(125);
 3974   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3975   ins_encode %{
 3976     switch (Matcher::vector_length_in_bytes(this)) {
 3977       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3978       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3979       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3980       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3981       default: ShouldNotReachHere();
 3982     }
 3983   %}
 3984   ins_pipe( pipe_slow );
 3985 %}
 3986 
 3987 // ----------------------------------------------------------------------------------------------------
 3988 
 3989 #ifdef _LP64
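      // RoundDoubleMode / RoundDoubleModeV: the scalar and sub-512-bit vector rules use
      // (v)roundsd / vroundpd (SSE4.1 / AVX), while full 8-element (512-bit) vectors use
      // the AVX-512 vrndscalepd encoding instead.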
 3990 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3991   match(Set dst (RoundDoubleMode src rmode));
 3992   format %{ "roundsd $dst,$src" %}
 3993   ins_cost(150);
 3994   ins_encode %{
 3995     assert(UseSSE >= 4, "required");
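          // Clear $dst first so the SSE roundsd, which only writes the low 64 bits of the
          // destination, does not carry a false dependence on the register's old contents.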
 3996     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3997       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3998     }
 3999     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 4000   %}
 4001   ins_pipe(pipe_slow);
 4002 %}
 4003 
 4004 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 4005   match(Set dst (RoundDoubleMode con rmode));
 4006   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 4007   ins_cost(150);
 4008   ins_encode %{
 4009     assert(UseSSE >= 4, "required");
 4010     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 4011   %}
 4012   ins_pipe(pipe_slow);
 4013 %}
 4014 
 4015 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 4016   predicate(Matcher::vector_length(n) < 8);
 4017   match(Set dst (RoundDoubleModeV src rmode));
 4018   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 4019   ins_encode %{
 4020     assert(UseAVX > 0, "required");
 4021     int vlen_enc = vector_length_encoding(this);
 4022     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 4023   %}
 4024   ins_pipe( pipe_slow );
 4025 %}
 4026 
 4027 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 4028   predicate(Matcher::vector_length(n) == 8);
 4029   match(Set dst (RoundDoubleModeV src rmode));
 4030   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 4031   ins_encode %{
 4032     assert(UseAVX > 2, "required");
 4033     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 4034   %}
 4035   ins_pipe( pipe_slow );
 4036 %}
 4037 
 4038 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 4039   predicate(Matcher::vector_length(n) < 8);
 4040   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4041   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 4042   ins_encode %{
 4043     assert(UseAVX > 0, "required");
 4044     int vlen_enc = vector_length_encoding(this);
 4045     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 4046   %}
 4047   ins_pipe( pipe_slow );
 4048 %}
 4049 
 4050 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 4051   predicate(Matcher::vector_length(n) == 8);
 4052   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4053   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 4054   ins_encode %{
 4055     assert(UseAVX > 2, "required");
 4056     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 4057   %}
 4058   ins_pipe( pipe_slow );
 4059 %}
 4060 #endif // _LP64
 4061 
 4062 instruct onspinwait() %{
 4063   match(OnSpinWait);
 4064   ins_cost(200);
 4065 
 4066   format %{
 4067     $$template
 4068     $$emit$$"pause\t! membar_onspinwait"
 4069   %}
 4070   ins_encode %{
 4071     __ pause();
 4072   %}
 4073   ins_pipe(pipe_slow);
 4074 %}
 4075 
 4076 // a * b + c
 4077 instruct fmaD_reg(regD a, regD b, regD c) %{
 4078   match(Set c (FmaD  c (Binary a b)));
 4079   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4080   ins_cost(150);
 4081   ins_encode %{
 4082     assert(UseFMA, "Needs FMA instructions support.");
 4083     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4084   %}
 4085   ins_pipe( pipe_slow );
 4086 %}
 4087 
 4088 // a * b + c
 4089 instruct fmaF_reg(regF a, regF b, regF c) %{
 4090   match(Set c (FmaF  c (Binary a b)));
 4091   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4092   ins_cost(150);
 4093   ins_encode %{
 4094     assert(UseFMA, "Needs FMA instructions support.");
 4095     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4096   %}
 4097   ins_pipe( pipe_slow );
 4098 %}
 4099 
 4100 // ====================VECTOR INSTRUCTIONS=====================================
 4101 
 4102 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4103 instruct MoveVec2Leg(legVec dst, vec src) %{
 4104   match(Set dst src);
 4105   format %{ "" %}
 4106   ins_encode %{
 4107     ShouldNotReachHere();
 4108   %}
 4109   ins_pipe( fpu_reg_reg );
 4110 %}
 4111 
 4112 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4113   match(Set dst src);
 4114   format %{ "" %}
 4115   ins_encode %{
 4116     ShouldNotReachHere();
 4117   %}
 4118   ins_pipe( fpu_reg_reg );
 4119 %}
 4120 
 4121 // ============================================================================
 4122 
 4123 // Load vector (generic operand pattern)
 4124 instruct loadV(vec dst, memory mem) %{
 4125   match(Set dst (LoadVector mem));
 4126   ins_cost(125);
 4127   format %{ "load_vector $dst,$mem" %}
 4128   ins_encode %{
 4129     BasicType bt = Matcher::vector_element_basic_type(this);
 4130     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4131   %}
 4132   ins_pipe( pipe_slow );
 4133 %}
 4134 
 4135 // Store vector (generic operand pattern)
 4136 instruct storeV(memory mem, vec src) %{
 4137   match(Set mem (StoreVector mem src));
 4138   ins_cost(145);
 4139   format %{ "store_vector $mem,$src" %}
 4140   ins_encode %{
 4141     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4142       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4143       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4144       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4145       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4146       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4147       default: ShouldNotReachHere();
 4148     }
 4149   %}
 4150   ins_pipe( pipe_slow );
 4151 %}
 4152 
 4153 // ---------------------------------------- Gather ------------------------------------
 4154 
 4155 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
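      // Unmasked gathers for word-and-larger element types. Pre-AVX512VL targets use the
      // AVX2 vgather form, which needs an all-ones vector mask (built with vpcmpeqd);
      // AVX512 targets use evgather predicated with an opmask set to all ones via kxnor.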
 4156 
 4157 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4158   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4159             Matcher::vector_length_in_bytes(n) <= 32);
 4160   match(Set dst (LoadVectorGather mem idx));
 4161   effect(TEMP dst, TEMP tmp, TEMP mask);
 4162   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4163   ins_encode %{
 4164     int vlen_enc = vector_length_encoding(this);
 4165     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4166     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4167     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4168     __ lea($tmp$$Register, $mem$$Address);
 4169     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4170   %}
 4171   ins_pipe( pipe_slow );
 4172 %}
 4173 
 4174 
 4175 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4176   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4177             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4178   match(Set dst (LoadVectorGather mem idx));
 4179   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4180   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4181   ins_encode %{
 4182     int vlen_enc = vector_length_encoding(this);
 4183     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4184     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4185     __ lea($tmp$$Register, $mem$$Address);
 4186     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4187   %}
 4188   ins_pipe( pipe_slow );
 4189 %}
 4190 
 4191 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4192   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4193             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4194   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4195   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4196   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4197   ins_encode %{
 4198     assert(UseAVX > 2, "sanity");
 4199     int vlen_enc = vector_length_encoding(this);
 4200     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4201     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4202     // Note: The gather instruction partially updates the opmask register used for
 4203     // predication, hence the mask operand is copied to a temporary.
 4204     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4205     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4206     __ lea($tmp$$Register, $mem$$Address);
 4207     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4208   %}
 4209   ins_pipe( pipe_slow );
 4210 %}
 4211 
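      // Gathers for subword (byte/short) element types, which have no hardware gather
      // instruction. The LE8B rules handle payloads of at most 8 bytes with a single
      // vgather8b_offset call; the GT8B rules hand the index array to vgather_subword
      // together with a set of temporaries and assemble the wider result there.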
 4212 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{
 4213   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4214   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4215   effect(TEMP tmp, TEMP rtmp);
 4216   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4217   ins_encode %{
 4218     int vlen_enc = vector_length_encoding(this);
 4219     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4220     __ lea($tmp$$Register, $mem$$Address);
 4221     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc);
 4222   %}
 4223   ins_pipe( pipe_slow );
 4224 %}
 4225 
 4226 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp,
 4227                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4228   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4229   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4230   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4231   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4232   ins_encode %{
 4233     int vlen_enc = vector_length_encoding(this);
 4234     int vector_len = Matcher::vector_length(this);
 4235     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4236     __ lea($tmp$$Register, $mem$$Address);
 4237     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4238     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister,
 4239                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4240   %}
 4241   ins_pipe( pipe_slow );
 4242 %}
 4243 
 4244 instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{
 4245   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4246   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4247   effect(TEMP tmp, TEMP rtmp, KILL cr);
 4248   format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %}
 4249   ins_encode %{
 4250     int vlen_enc = vector_length_encoding(this);
 4251     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4252     __ lea($tmp$$Register, $mem$$Address);
 4253     __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc);
 4254   %}
 4255   ins_pipe( pipe_slow );
 4256 %}
 4257 
 4258 
 4259 instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp,
 4260                                  vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4261   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4262   match(Set dst (LoadVectorGather mem (Binary idx_base offset)));
 4263   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4264   format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4265   ins_encode %{
 4266     int vlen_enc = vector_length_encoding(this);
 4267     int vector_len = Matcher::vector_length(this);
 4268     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4269     __ lea($tmp$$Register, $mem$$Address);
 4270     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4271     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister,
 4272                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4273   %}
 4274   ins_pipe( pipe_slow );
 4275 %}
 4276 
 4277 
 4278 #ifdef _LP64
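      // Masked subword gathers. The *_avx3 rules take the mask in an opmask register and
      // copy it to a GPR with kmovql; the *_avx2 rules receive a vector mask, convert it
      // to a bitmask with vpmovmskb and, for short elements, compress it with pextl using
      // the 0x55555555 pattern so that one bit remains per 16-bit element.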
 4279 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4280   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4281   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4282   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4283   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4284   ins_encode %{
 4285     int vlen_enc = vector_length_encoding(this);
 4286     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4287     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4288     __ lea($tmp$$Register, $mem$$Address);
 4289     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4290     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4291   %}
 4292   ins_pipe( pipe_slow );
 4293 %}
 4294 
 4295 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4296                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4297   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4298   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4299   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4300   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4301   ins_encode %{
 4302     int vlen_enc = vector_length_encoding(this);
 4303     int vector_len = Matcher::vector_length(this);
 4304     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4305     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4306     __ lea($tmp$$Register, $mem$$Address);
 4307     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4308     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4309     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4310                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4316   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4317   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4318   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4319   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4320   ins_encode %{
 4321     int vlen_enc = vector_length_encoding(this);
 4322     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4323     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4324     __ lea($tmp$$Register, $mem$$Address);
 4325     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4326     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4327                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4328   %}
 4329   ins_pipe( pipe_slow );
 4330 %}
 4331 
 4332 instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4333                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4334   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4335   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4336   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4337   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4338   ins_encode %{
 4339     int vlen_enc = vector_length_encoding(this);
 4340     int vector_len = Matcher::vector_length(this);
 4341     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4342     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4343     __ lea($tmp$$Register, $mem$$Address);
 4344     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4345     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4346     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4347                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4348   %}
 4349   ins_pipe( pipe_slow );
 4350 %}
 4351 
 4352 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4353   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4354   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4355   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4356   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4357   ins_encode %{
 4358     int vlen_enc = vector_length_encoding(this);
 4359     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4360     __ lea($tmp$$Register, $mem$$Address);
 4361     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4362     if (elem_bt == T_SHORT) {
 4363       __ movl($mask_idx$$Register, 0x55555555);
 4364       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4365     }
 4366     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4367     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4368   %}
 4369   ins_pipe( pipe_slow );
 4370 %}
 4371 
 4372 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4373                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4374   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4375   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4376   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4377   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4378   ins_encode %{
 4379     int vlen_enc = vector_length_encoding(this);
 4380     int vector_len = Matcher::vector_length(this);
 4381     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4382     __ lea($tmp$$Register, $mem$$Address);
 4383     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4384     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4385     if (elem_bt == T_SHORT) {
 4386       __ movl($mask_idx$$Register, 0x55555555);
 4387       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4388     }
 4389     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4390     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4391                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4392   %}
 4393   ins_pipe( pipe_slow );
 4394 %}
 4395 
 4396 instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4397   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4398   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4399   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4400   format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4401   ins_encode %{
 4402     int vlen_enc = vector_length_encoding(this);
 4403     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4404     __ lea($tmp$$Register, $mem$$Address);
 4405     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4406     if (elem_bt == T_SHORT) {
 4407       __ movl($mask_idx$$Register, 0x55555555);
 4408       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4409     }
 4410     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4411     __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register,
 4412                                 $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4413   %}
 4414   ins_pipe( pipe_slow );
 4415 %}
 4416 
 4417 instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp,
 4418                                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4419   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4420   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset))));
 4421   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4422   format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4423   ins_encode %{
 4424     int vlen_enc = vector_length_encoding(this);
 4425     int vector_len = Matcher::vector_length(this);
 4426     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4427     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4428     __ lea($tmp$$Register, $mem$$Address);
 4429     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4430     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4431     if (elem_bt == T_SHORT) {
 4432       __ movl($mask_idx$$Register, 0x55555555);
 4433       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4434     }
 4435     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4436     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4437                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4438   %}
 4439   ins_pipe( pipe_slow );
 4440 %}
 4441 #endif
 4442 
 4443 // ====================Scatter=======================================
 4444 
 4445 // Scatter INT, LONG, FLOAT, DOUBLE
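      // Scatters are AVX-512 only (evscatter). The unmasked rule materializes an all-ones
      // opmask from the constant table; the masked rule copies the user mask to a temporary
      // opmask because the scatter instruction clears mask bits as elements are stored.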
 4446 
 4447 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4448   predicate(UseAVX > 2);
 4449   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4450   effect(TEMP tmp, TEMP ktmp);
 4451   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4452   ins_encode %{
 4453     int vlen_enc = vector_length_encoding(this, $src);
 4454     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4455 
 4456     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4457     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4458 
 4459     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4460     __ lea($tmp$$Register, $mem$$Address);
 4461     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4462   %}
 4463   ins_pipe( pipe_slow );
 4464 %}
 4465 
 4466 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4467   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4468   effect(TEMP tmp, TEMP ktmp);
 4469   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $tmp and $ktmp as TEMP" %}
 4470   ins_encode %{
 4471     int vlen_enc = vector_length_encoding(this, $src);
 4472     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4473     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4474     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4475     // Note: The scatter instruction partially updates the opmask register used for
 4476     // predication, hence the mask operand is copied to a temporary.
 4477     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4478     __ lea($tmp$$Register, $mem$$Address);
 4479     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4480   %}
 4481   ins_pipe( pipe_slow );
 4482 %}
 4483 
 4484 // ====================REPLICATE=======================================
 4485 
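      // The Replicate rules follow a common pattern: EVEX targets broadcast straight from
      // a GPR (evpbroadcastb/w/d/q), AVX2 targets move the scalar to XMM and use
      // vpbroadcast, and pre-AVX2 targets build the vector with movdl/movdq plus
      // punpckl*/pshuf* shuffles.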
 4486 // Replicate byte scalar to be vector
 4487 instruct vReplB_reg(vec dst, rRegI src) %{
 4488   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4489   match(Set dst (Replicate src));
 4490   format %{ "replicateB $dst,$src" %}
 4491   ins_encode %{
 4492     uint vlen = Matcher::vector_length(this);
 4493     if (UseAVX >= 2) {
 4494       int vlen_enc = vector_length_encoding(this);
 4495       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4496         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4497         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4498       } else {
 4499         __ movdl($dst$$XMMRegister, $src$$Register);
 4500         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4501       }
 4502     } else {
 4503       assert(UseAVX < 2, "");
 4504       __ movdl($dst$$XMMRegister, $src$$Register);
 4505       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4506       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4507       if (vlen >= 16) {
 4508         assert(vlen == 16, "");
 4509         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4510       }
 4511     }
 4512   %}
 4513   ins_pipe( pipe_slow );
 4514 %}
 4515 
 4516 instruct ReplB_mem(vec dst, memory mem) %{
 4517   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4518   match(Set dst (Replicate (LoadB mem)));
 4519   format %{ "replicateB $dst,$mem" %}
 4520   ins_encode %{
 4521     int vlen_enc = vector_length_encoding(this);
 4522     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4523   %}
 4524   ins_pipe( pipe_slow );
 4525 %}
 4526 
 4527 // ====================ReplicateS=======================================
 4528 
 4529 instruct vReplS_reg(vec dst, rRegI src) %{
 4530   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4531   match(Set dst (Replicate src));
 4532   format %{ "replicateS $dst,$src" %}
 4533   ins_encode %{
 4534     uint vlen = Matcher::vector_length(this);
 4535     int vlen_enc = vector_length_encoding(this);
 4536     if (UseAVX >= 2) {
 4537       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4538         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4539         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4540       } else {
 4541         __ movdl($dst$$XMMRegister, $src$$Register);
 4542         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4543       }
 4544     } else {
 4545       assert(UseAVX < 2, "");
 4546       __ movdl($dst$$XMMRegister, $src$$Register);
 4547       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4548       if (vlen >= 8) {
 4549         assert(vlen == 8, "");
 4550         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4551       }
 4552     }
 4553   %}
 4554   ins_pipe( pipe_slow );
 4555 %}
 4556 
 4557 instruct ReplS_mem(vec dst, memory mem) %{
 4558   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4559   match(Set dst (Replicate (LoadS mem)));
 4560   format %{ "replicateS $dst,$mem" %}
 4561   ins_encode %{
 4562     int vlen_enc = vector_length_encoding(this);
 4563     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4564   %}
 4565   ins_pipe( pipe_slow );
 4566 %}
 4567 
 4568 // ====================ReplicateI=======================================
 4569 
 4570 instruct ReplI_reg(vec dst, rRegI src) %{
 4571   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4572   match(Set dst (Replicate src));
 4573   format %{ "replicateI $dst,$src" %}
 4574   ins_encode %{
 4575     uint vlen = Matcher::vector_length(this);
 4576     int vlen_enc = vector_length_encoding(this);
 4577     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4578       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4579     } else if (VM_Version::supports_avx2()) {
 4580       __ movdl($dst$$XMMRegister, $src$$Register);
 4581       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4582     } else {
 4583       __ movdl($dst$$XMMRegister, $src$$Register);
 4584       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4585     }
 4586   %}
 4587   ins_pipe( pipe_slow );
 4588 %}
 4589 
 4590 instruct ReplI_mem(vec dst, memory mem) %{
 4591   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4592   match(Set dst (Replicate (LoadI mem)));
 4593   format %{ "replicateI $dst,$mem" %}
 4594   ins_encode %{
 4595     int vlen_enc = vector_length_encoding(this);
 4596     if (VM_Version::supports_avx2()) {
 4597       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4598     } else if (VM_Version::supports_avx()) {
 4599       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4600     } else {
 4601       __ movdl($dst$$XMMRegister, $mem$$Address);
 4602       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4603     }
 4604   %}
 4605   ins_pipe( pipe_slow );
 4606 %}
 4607 
 4608 instruct ReplI_imm(vec dst, immI con) %{
 4609   predicate(Matcher::is_non_long_integral_vector(n));
 4610   match(Set dst (Replicate con));
 4611   format %{ "replicateI $dst,$con" %}
 4612   ins_encode %{
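          // vreplicate_imm materializes a constant-table entry holding enough copies of
          // $con to cover 4 bytes with AVX, 8 with SSE3, or 16 bytes otherwise;
          // load_constant_vector then expands it to the full vector width.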
 4613     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4614                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4615                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4616     BasicType bt = Matcher::vector_element_basic_type(this);
 4617     int vlen = Matcher::vector_length_in_bytes(this);
 4618     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4619   %}
 4620   ins_pipe( pipe_slow );
 4621 %}
 4622 
 4623 // Replicate scalar zero to be vector
 4624 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4625   predicate(Matcher::is_non_long_integral_vector(n));
 4626   match(Set dst (Replicate zero));
 4627   format %{ "replicateI $dst,$zero" %}
 4628   ins_encode %{
 4629     int vlen_enc = vector_length_encoding(this);
 4630     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4631       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4632     } else {
 4633       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4634     }
 4635   %}
 4636   ins_pipe( fpu_reg_reg );
 4637 %}
 4638 
 4639 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4640   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4641   match(Set dst (Replicate con));
 4642   format %{ "vallones $dst" %}
 4643   ins_encode %{
 4644     int vector_len = vector_length_encoding(this);
 4645     __ vallones($dst$$XMMRegister, vector_len);
 4646   %}
 4647   ins_pipe( pipe_slow );
 4648 %}
 4649 
 4650 // ====================ReplicateL=======================================
 4651 
 4652 #ifdef _LP64
 4653 // Replicate long (8 byte) scalar to be vector
 4654 instruct ReplL_reg(vec dst, rRegL src) %{
 4655   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4656   match(Set dst (Replicate src));
 4657   format %{ "replicateL $dst,$src" %}
 4658   ins_encode %{
 4659     int vlen = Matcher::vector_length(this);
 4660     int vlen_enc = vector_length_encoding(this);
 4661     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4662       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4663     } else if (VM_Version::supports_avx2()) {
 4664       __ movdq($dst$$XMMRegister, $src$$Register);
 4665       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4666     } else {
 4667       __ movdq($dst$$XMMRegister, $src$$Register);
 4668       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4669     }
 4670   %}
 4671   ins_pipe( pipe_slow );
 4672 %}
 4673 #else // _LP64
 4674 // Replicate long (8 byte) scalar to be vector
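      // On 32-bit the long value lives in a GPR pair: the low and high halves are moved
      // into XMM separately (movdl + HIGH_FROM_LOW) and joined with punpckldq before the
      // 64-bit lane is broadcast.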
 4675 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4676   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4677   match(Set dst (Replicate src));
 4678   effect(TEMP dst, USE src, TEMP tmp);
 4679   format %{ "replicateL $dst,$src" %}
 4680   ins_encode %{
 4681     uint vlen = Matcher::vector_length(this);
 4682     if (vlen == 2) {
 4683       __ movdl($dst$$XMMRegister, $src$$Register);
 4684       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4685       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4686       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4687     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4688       int vlen_enc = Assembler::AVX_256bit;
 4689       __ movdl($dst$$XMMRegister, $src$$Register);
 4690       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4691       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4692       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4693     } else {
 4694       __ movdl($dst$$XMMRegister, $src$$Register);
 4695       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4696       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4697       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4698       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4699     }
 4700   %}
 4701   ins_pipe( pipe_slow );
 4702 %}
 4703 
 4704 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4705   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4706   match(Set dst (Replicate src));
 4707   effect(TEMP dst, USE src, TEMP tmp);
 4708   format %{ "replicateL $dst,$src" %}
 4709   ins_encode %{
 4710     if (VM_Version::supports_avx512vl()) {
 4711       __ movdl($dst$$XMMRegister, $src$$Register);
 4712       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4713       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4714       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4715       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4716       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4717     } else {
 4718       int vlen_enc = Assembler::AVX_512bit;
 4719       __ movdl($dst$$XMMRegister, $src$$Register);
 4720       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4721       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4722       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4723     }
 4724   %}
 4725   ins_pipe( pipe_slow );
 4726 %}
 4727 #endif // _LP64
 4728 
 4729 instruct ReplL_mem(vec dst, memory mem) %{
 4730   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4731   match(Set dst (Replicate (LoadL mem)));
 4732   format %{ "replicateL $dst,$mem" %}
 4733   ins_encode %{
 4734     int vlen_enc = vector_length_encoding(this);
 4735     if (VM_Version::supports_avx2()) {
 4736       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4737     } else if (VM_Version::supports_sse3()) {
 4738       __ movddup($dst$$XMMRegister, $mem$$Address);
 4739     } else {
 4740       __ movq($dst$$XMMRegister, $mem$$Address);
 4741       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4742     }
 4743   %}
 4744   ins_pipe( pipe_slow );
 4745 %}
 4746 
 4747 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4748 instruct ReplL_imm(vec dst, immL con) %{
 4749   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4750   match(Set dst (Replicate con));
 4751   format %{ "replicateL $dst,$con" %}
 4752   ins_encode %{
 4753     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4754     int vlen = Matcher::vector_length_in_bytes(this);
 4755     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4756   %}
 4757   ins_pipe( pipe_slow );
 4758 %}
 4759 
 4760 instruct ReplL_zero(vec dst, immL0 zero) %{
 4761   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4762   match(Set dst (Replicate zero));
 4763   format %{ "replicateL $dst,$zero" %}
 4764   ins_encode %{
 4765     int vlen_enc = vector_length_encoding(this);
 4766     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4767       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4768     } else {
 4769       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4770     }
 4771   %}
 4772   ins_pipe( fpu_reg_reg );
 4773 %}
 4774 
 4775 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4776   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4777   match(Set dst (Replicate con));
 4778   format %{ "vallones $dst" %}
 4779   ins_encode %{
 4780     int vector_len = vector_length_encoding(this);
 4781     __ vallones($dst$$XMMRegister, vector_len);
 4782   %}
 4783   ins_pipe( pipe_slow );
 4784 %}
 4785 
 4786 // ====================ReplicateF=======================================
 4787 
 4788 instruct vReplF_reg(vec dst, vlRegF src) %{
 4789   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4790   match(Set dst (Replicate src));
 4791   format %{ "replicateF $dst,$src" %}
 4792   ins_encode %{
 4793     uint vlen = Matcher::vector_length(this);
 4794     int vlen_enc = vector_length_encoding(this);
 4795     if (vlen <= 4) {
 4796       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4797     } else if (VM_Version::supports_avx2()) {
 4798       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4799     } else {
 4800       assert(vlen == 8, "sanity");
 4801       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4802       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4803     }
 4804   %}
 4805   ins_pipe( pipe_slow );
 4806 %}
 4807 
 4808 instruct ReplF_reg(vec dst, vlRegF src) %{
 4809   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4810   match(Set dst (Replicate src));
 4811   format %{ "replicateF $dst,$src" %}
 4812   ins_encode %{
 4813     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4814   %}
 4815   ins_pipe( pipe_slow );
 4816 %}
 4817 
 4818 instruct ReplF_mem(vec dst, memory mem) %{
 4819   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4820   match(Set dst (Replicate (LoadF mem)));
 4821   format %{ "replicateF $dst,$mem" %}
 4822   ins_encode %{
 4823     int vlen_enc = vector_length_encoding(this);
 4824     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4825   %}
 4826   ins_pipe( pipe_slow );
 4827 %}
 4828 
 4829 // Replicate float scalar immediate to be vector by loading from const table.
 4830 instruct ReplF_imm(vec dst, immF con) %{
 4831   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4832   match(Set dst (Replicate con));
 4833   format %{ "replicateF $dst,$con" %}
 4834   ins_encode %{
 4835     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4836                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4837     int vlen = Matcher::vector_length_in_bytes(this);
 4838     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4839   %}
 4840   ins_pipe( pipe_slow );
 4841 %}
 4842 
 4843 instruct ReplF_zero(vec dst, immF0 zero) %{
 4844   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4845   match(Set dst (Replicate zero));
 4846   format %{ "replicateF $dst,$zero" %}
 4847   ins_encode %{
 4848     int vlen_enc = vector_length_encoding(this);
 4849     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4850       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4851     } else {
 4852       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4853     }
 4854   %}
 4855   ins_pipe( fpu_reg_reg );
 4856 %}
 4857 
 4858 // ====================ReplicateD=======================================
 4859 
 4860 // Replicate double (8 bytes) scalar to be vector
 4861 instruct vReplD_reg(vec dst, vlRegD src) %{
 4862   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4863   match(Set dst (Replicate src));
 4864   format %{ "replicateD $dst,$src" %}
 4865   ins_encode %{
 4866     uint vlen = Matcher::vector_length(this);
 4867     int vlen_enc = vector_length_encoding(this);
 4868     if (vlen <= 2) {
 4869       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4870     } else if (VM_Version::supports_avx2()) {
 4871       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4872     } else {
 4873       assert(vlen == 4, "sanity");
 4874       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4875       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4876     }
 4877   %}
 4878   ins_pipe( pipe_slow );
 4879 %}
 4880 
 4881 instruct ReplD_reg(vec dst, vlRegD src) %{
 4882   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4883   match(Set dst (Replicate src));
 4884   format %{ "replicateD $dst,$src" %}
 4885   ins_encode %{
 4886     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4887   %}
 4888   ins_pipe( pipe_slow );
 4889 %}
 4890 
 4891 instruct ReplD_mem(vec dst, memory mem) %{
 4892   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4893   match(Set dst (Replicate (LoadD mem)));
 4894   format %{ "replicateD $dst,$mem" %}
 4895   ins_encode %{
 4896     if (Matcher::vector_length(this) >= 4) {
 4897       int vlen_enc = vector_length_encoding(this);
 4898       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4899     } else {
 4900       __ movddup($dst$$XMMRegister, $mem$$Address);
 4901     }
 4902   %}
 4903   ins_pipe( pipe_slow );
 4904 %}
 4905 
 4906 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4907 instruct ReplD_imm(vec dst, immD con) %{
 4908   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4909   match(Set dst (Replicate con));
 4910   format %{ "replicateD $dst,$con" %}
 4911   ins_encode %{
 4912     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4913     int vlen = Matcher::vector_length_in_bytes(this);
 4914     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4915   %}
 4916   ins_pipe( pipe_slow );
 4917 %}
 4918 
 4919 instruct ReplD_zero(vec dst, immD0 zero) %{
 4920   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4921   match(Set dst (Replicate zero));
 4922   format %{ "replicateD $dst,$zero" %}
 4923   ins_encode %{
 4924     int vlen_enc = vector_length_encoding(this);
 4925     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4926       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4927     } else {
 4928       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4929     }
 4930   %}
 4931   ins_pipe( fpu_reg_reg );
 4932 %}
 4933 
 4934 // ====================VECTOR INSERT=======================================
 4935 
 4936 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4937   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4938   match(Set dst (VectorInsert (Binary dst val) idx));
 4939   format %{ "vector_insert $dst,$val,$idx" %}
 4940   ins_encode %{
 4941     assert(UseSSE >= 4, "required");
 4942     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4943 
 4944     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4945 
 4946     assert(is_integral_type(elem_bt), "");
 4947     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4948 
 4949     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4950   %}
 4951   ins_pipe( pipe_slow );
 4952 %}
 4953 
 4954 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4955   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4956   match(Set dst (VectorInsert (Binary src val) idx));
 4957   effect(TEMP vtmp);
 4958   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4959   ins_encode %{
 4960     int vlen_enc = Assembler::AVX_256bit;
 4961     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4962     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4963     int log2epr = log2(elem_per_lane);
 4964 
 4965     assert(is_integral_type(elem_bt), "sanity");
 4966     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4967 
 4968     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4969     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4970     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4971     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4972     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4973   %}
 4974   ins_pipe( pipe_slow );
 4975 %}
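
// Worked example of the x_idx/y_idx decomposition used by insert32/insert64 (illustrative
// comment only): for a 32-byte vector of bytes, elem_per_lane = 16 and log2epr = 4, so for
// idx = 27:
//   x_idx = 27 & 15 = 11   // position inside the 128-bit lane
//   y_idx = 27 >> 4 = 1    // which 128-bit lane
// Lane 1 is extracted, element 11 of it is overwritten with $val, and the lane is put back.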
 4976 
 4977 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4978   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4979   match(Set dst (VectorInsert (Binary src val) idx));
 4980   effect(TEMP vtmp);
 4981   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4982   ins_encode %{
 4983     assert(UseAVX > 2, "sanity");
 4984 
 4985     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4986     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4987     int log2epr = log2(elem_per_lane);
 4988 
 4989     assert(is_integral_type(elem_bt), "");
 4990     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4991 
 4992     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4993     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4994     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4995     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4996     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4997   %}
 4998   ins_pipe( pipe_slow );
 4999 %}
 5000 
 5001 #ifdef _LP64
 5002 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 5003   predicate(Matcher::vector_length(n) == 2);
 5004   match(Set dst (VectorInsert (Binary dst val) idx));
 5005   format %{ "vector_insert $dst,$val,$idx" %}
 5006   ins_encode %{
 5007     assert(UseSSE >= 4, "required");
 5008     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 5009     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5010 
 5011     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 5012   %}
 5013   ins_pipe( pipe_slow );
 5014 %}
 5015 
 5016 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 5017   predicate(Matcher::vector_length(n) == 4);
 5018   match(Set dst (VectorInsert (Binary src val) idx));
 5019   effect(TEMP vtmp);
 5020   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5021   ins_encode %{
 5022     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 5023     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5024 
 5025     uint x_idx = $idx$$constant & right_n_bits(1);
 5026     uint y_idx = ($idx$$constant >> 1) & 1;
 5027     int vlen_enc = Assembler::AVX_256bit;
 5028     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5029     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 5030     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5031   %}
 5032   ins_pipe( pipe_slow );
 5033 %}
 5034 
 5035 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 5036   predicate(Matcher::vector_length(n) == 8);
 5037   match(Set dst (VectorInsert (Binary src val) idx));
 5038   effect(TEMP vtmp);
 5039   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5040   ins_encode %{
 5041     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 5042     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5043 
 5044     uint x_idx = $idx$$constant & right_n_bits(1);
 5045     uint y_idx = ($idx$$constant >> 1) & 3;
 5046     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5047     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 5048     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5049   %}
 5050   ins_pipe( pipe_slow );
 5051 %}
 5052 #endif
 5053 
 5054 instruct insertF(vec dst, regF val, immU8 idx) %{
 5055   predicate(Matcher::vector_length(n) < 8);
 5056   match(Set dst (VectorInsert (Binary dst val) idx));
 5057   format %{ "vector_insert $dst,$val,$idx" %}
 5058   ins_encode %{
 5059     assert(UseSSE >= 4, "sanity");
 5060 
 5061     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5062     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5063 
 5064     uint x_idx = $idx$$constant & right_n_bits(2);
 5065     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5066   %}
 5067   ins_pipe( pipe_slow );
 5068 %}
 5069 
 5070 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 5071   predicate(Matcher::vector_length(n) >= 8);
 5072   match(Set dst (VectorInsert (Binary src val) idx));
 5073   effect(TEMP vtmp);
 5074   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 5075   ins_encode %{
 5076     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 5077     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5078 
 5079     int vlen = Matcher::vector_length(this);
 5080     uint x_idx = $idx$$constant & right_n_bits(2);
 5081     if (vlen == 8) {
 5082       uint y_idx = ($idx$$constant >> 2) & 1;
 5083       int vlen_enc = Assembler::AVX_256bit;
 5084       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5085       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5086       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5087     } else {
 5088       assert(vlen == 16, "sanity");
 5089       uint y_idx = ($idx$$constant >> 2) & 3;
 5090       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5091       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 5092       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5093     }
 5094   %}
 5095   ins_pipe( pipe_slow );
 5096 %}
 5097 
 5098 #ifdef _LP64
 5099 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 5100   predicate(Matcher::vector_length(n) == 2);
 5101   match(Set dst (VectorInsert (Binary dst val) idx));
 5102   effect(TEMP tmp);
 5103   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 5104   ins_encode %{
 5105     assert(UseSSE >= 4, "sanity");
 5106     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5107     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5108 
 5109     __ movq($tmp$$Register, $val$$XMMRegister);
 5110     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 5111   %}
 5112   ins_pipe( pipe_slow );
 5113 %}
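
// The double-element insert rules go through a general-purpose register because
// pinsrq/vpinsrq only take a GPR or memory source: movq first moves the raw 64-bit
// pattern of $val out of its XMM register. (Explanatory comment, not a new rule.)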
 5114 
 5115 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 5116   predicate(Matcher::vector_length(n) == 4);
 5117   match(Set dst (VectorInsert (Binary src val) idx));
 5118   effect(TEMP vtmp, TEMP tmp);
 5119   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5120   ins_encode %{
 5121     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5122     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5123 
 5124     uint x_idx = $idx$$constant & right_n_bits(1);
 5125     uint y_idx = ($idx$$constant >> 1) & 1;
 5126     int vlen_enc = Assembler::AVX_256bit;
 5127     __ movq($tmp$$Register, $val$$XMMRegister);
 5128     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5129     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5130     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5131   %}
 5132   ins_pipe( pipe_slow );
 5133 %}
 5134 
 5135 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 5136   predicate(Matcher::vector_length(n) == 8);
 5137   match(Set dst (VectorInsert (Binary src val) idx));
 5138   effect(TEMP tmp, TEMP vtmp);
 5139   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 5140   ins_encode %{
 5141     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 5142     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 5143 
 5144     uint x_idx = $idx$$constant & right_n_bits(1);
 5145     uint y_idx = ($idx$$constant >> 1) & 3;
 5146     __ movq($tmp$$Register, $val$$XMMRegister);
 5147     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 5148     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 5149     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 5150   %}
 5151   ins_pipe( pipe_slow );
 5152 %}
 5153 #endif
 5154 
 5155 // ====================REDUCTION ARITHMETIC=======================================
 5156 
 5157 // =======================Int Reduction==========================================
 5158 
 5159 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5160   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 5161   match(Set dst (AddReductionVI src1 src2));
 5162   match(Set dst (MulReductionVI src1 src2));
 5163   match(Set dst (AndReductionV  src1 src2));
 5164   match(Set dst ( OrReductionV  src1 src2));
 5165   match(Set dst (XorReductionV  src1 src2));
 5166   match(Set dst (MinReductionV  src1 src2));
 5167   match(Set dst (MaxReductionV  src1 src2));
 5168   effect(TEMP vtmp1, TEMP vtmp2);
 5169   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5170   ins_encode %{
 5171     int opcode = this->ideal_Opcode();
 5172     int vlen = Matcher::vector_length(this, $src2);
 5173     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5174   %}
 5175   ins_pipe( pipe_slow );
 5176 %}
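
// Informal reduction semantics (comment only): the scalar src1 is folded together with all
// lanes of src2, e.g. for AddReductionVI over a 4-lane vector
//   dst = src1 + src2[0] + src2[1] + src2[2] + src2[3]
// reduceI() folds the lanes with shuffles/extracts and combines the result with src1.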
 5177 
 5178 // =======================Long Reduction==========================================
 5179 
 5180 #ifdef _LP64
 5181 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5182   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 5183   match(Set dst (AddReductionVL src1 src2));
 5184   match(Set dst (MulReductionVL src1 src2));
 5185   match(Set dst (AndReductionV  src1 src2));
 5186   match(Set dst ( OrReductionV  src1 src2));
 5187   match(Set dst (XorReductionV  src1 src2));
 5188   match(Set dst (MinReductionV  src1 src2));
 5189   match(Set dst (MaxReductionV  src1 src2));
 5190   effect(TEMP vtmp1, TEMP vtmp2);
 5191   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5192   ins_encode %{
 5193     int opcode = this->ideal_Opcode();
 5194     int vlen = Matcher::vector_length(this, $src2);
 5195     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5196   %}
 5197   ins_pipe( pipe_slow );
 5198 %}
 5199 
 5200 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 5201   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 5202   match(Set dst (AddReductionVL src1 src2));
 5203   match(Set dst (MulReductionVL src1 src2));
 5204   match(Set dst (AndReductionV  src1 src2));
 5205   match(Set dst ( OrReductionV  src1 src2));
 5206   match(Set dst (XorReductionV  src1 src2));
 5207   match(Set dst (MinReductionV  src1 src2));
 5208   match(Set dst (MaxReductionV  src1 src2));
 5209   effect(TEMP vtmp1, TEMP vtmp2);
 5210   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5211   ins_encode %{
 5212     int opcode = this->ideal_Opcode();
 5213     int vlen = Matcher::vector_length(this, $src2);
 5214     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5215   %}
 5216   ins_pipe( pipe_slow );
 5217 %}
 5218 #endif // _LP64
 5219 
 5220 // =======================Float Reduction==========================================
 5221 
 5222 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 5223   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 5224   match(Set dst (AddReductionVF dst src));
 5225   match(Set dst (MulReductionVF dst src));
 5226   effect(TEMP dst, TEMP vtmp);
 5227   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 5228   ins_encode %{
 5229     int opcode = this->ideal_Opcode();
 5230     int vlen = Matcher::vector_length(this, $src);
 5231     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 5237   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5238   match(Set dst (AddReductionVF dst src));
 5239   match(Set dst (MulReductionVF dst src));
 5240   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5241   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5242   ins_encode %{
 5243     int opcode = this->ideal_Opcode();
 5244     int vlen = Matcher::vector_length(this, $src);
 5245     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5246   %}
 5247   ins_pipe( pipe_slow );
 5248 %}
 5249 
 5250 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5251   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 5252   match(Set dst (AddReductionVF dst src));
 5253   match(Set dst (MulReductionVF dst src));
 5254   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5255   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5256   ins_encode %{
 5257     int opcode = this->ideal_Opcode();
 5258     int vlen = Matcher::vector_length(this, $src);
 5259     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5260   %}
 5261   ins_pipe( pipe_slow );
 5262 %}
 5263 
 5265 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 5266   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5267   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5268   // src1 contains reduction identity
 5269   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5270   match(Set dst (AddReductionVF src1 src2));
 5271   match(Set dst (MulReductionVF src1 src2));
 5272   effect(TEMP dst);
 5273   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 5274   ins_encode %{
 5275     int opcode = this->ideal_Opcode();
 5276     int vlen = Matcher::vector_length(this, $src2);
 5277     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5278   %}
 5279   ins_pipe( pipe_slow );
 5280 %}
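
// Illustration of why ordering matters for FP reductions: a strictly ordered add must
// evaluate (((src1 + v0) + v1) + v2) + v3, whereas the unordered rules may combine lanes
// pairwise, e.g. (v0 + v1) + (v2 + v3), which can round differently. The Vector API allows
// the latter, hence the requires_strict_order() split.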
 5281 
 5282 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 5283   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5284   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5285   // src1 contains reduction identity
 5286   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5287   match(Set dst (AddReductionVF src1 src2));
 5288   match(Set dst (MulReductionVF src1 src2));
 5289   effect(TEMP dst, TEMP vtmp);
 5290   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5291   ins_encode %{
 5292     int opcode = this->ideal_Opcode();
 5293     int vlen = Matcher::vector_length(this, $src2);
 5294     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5295   %}
 5296   ins_pipe( pipe_slow );
 5297 %}
 5298 
 5299 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 5300   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5301   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5302   // src1 contains reduction identity
 5303   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5304   match(Set dst (AddReductionVF src1 src2));
 5305   match(Set dst (MulReductionVF src1 src2));
 5306   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5307   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5308   ins_encode %{
 5309     int opcode = this->ideal_Opcode();
 5310     int vlen = Matcher::vector_length(this, $src2);
 5311     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5312   %}
 5313   ins_pipe( pipe_slow );
 5314 %}
 5315 
 5316 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5317   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5318   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5319   // src1 contains reduction identity
 5320   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5321   match(Set dst (AddReductionVF src1 src2));
 5322   match(Set dst (MulReductionVF src1 src2));
 5323   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5324   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5325   ins_encode %{
 5326     int opcode = this->ideal_Opcode();
 5327     int vlen = Matcher::vector_length(this, $src2);
 5328     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5329   %}
 5330   ins_pipe( pipe_slow );
 5331 %}
 5332 
 5333 // =======================Double Reduction==========================================
 5334 
 5335 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5336   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5337   match(Set dst (AddReductionVD dst src));
 5338   match(Set dst (MulReductionVD dst src));
 5339   effect(TEMP dst, TEMP vtmp);
 5340   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5341   ins_encode %{
 5342     int opcode = this->ideal_Opcode();
 5343     int vlen = Matcher::vector_length(this, $src);
 5344     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5345   %}
 5346   ins_pipe( pipe_slow );
 5347 %}
 5348 
 5349 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5350   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5351   match(Set dst (AddReductionVD dst src));
 5352   match(Set dst (MulReductionVD dst src));
 5353   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5354   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5355   ins_encode %{
 5356     int opcode = this->ideal_Opcode();
 5357     int vlen = Matcher::vector_length(this, $src);
 5358     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5359   %}
 5360   ins_pipe( pipe_slow );
 5361 %}
 5362 
 5363 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5364   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5365   match(Set dst (AddReductionVD dst src));
 5366   match(Set dst (MulReductionVD dst src));
 5367   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5368   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5369   ins_encode %{
 5370     int opcode = this->ideal_Opcode();
 5371     int vlen = Matcher::vector_length(this, $src);
 5372     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5373   %}
 5374   ins_pipe( pipe_slow );
 5375 %}
 5376 
 5377 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5378   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5379   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5380   // src1 contains reduction identity
 5381   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5382   match(Set dst (AddReductionVD src1 src2));
 5383   match(Set dst (MulReductionVD src1 src2));
 5384   effect(TEMP dst);
 5385   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5386   ins_encode %{
 5387     int opcode = this->ideal_Opcode();
 5388     int vlen = Matcher::vector_length(this, $src2);
 5389     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5390   %}
 5391   ins_pipe( pipe_slow );
 5392 %}
 5393 
 5394 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5395   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5396   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5397   // src1 contains reduction identity
 5398   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5399   match(Set dst (AddReductionVD src1 src2));
 5400   match(Set dst (MulReductionVD src1 src2));
 5401   effect(TEMP dst, TEMP vtmp);
 5402   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5403   ins_encode %{
 5404     int opcode = this->ideal_Opcode();
 5405     int vlen = Matcher::vector_length(this, $src2);
 5406     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5407   %}
 5408   ins_pipe( pipe_slow );
 5409 %}
 5410 
 5411 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5412   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5413   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5414   // src1 contains reduction identity
 5415   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5416   match(Set dst (AddReductionVD src1 src2));
 5417   match(Set dst (MulReductionVD src1 src2));
 5418   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5419   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5420   ins_encode %{
 5421     int opcode = this->ideal_Opcode();
 5422     int vlen = Matcher::vector_length(this, $src2);
 5423     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5424   %}
 5425   ins_pipe( pipe_slow );
 5426 %}
 5427 
 5428 // =======================Byte Reduction==========================================
 5429 
 5430 #ifdef _LP64
 5431 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5432   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5433   match(Set dst (AddReductionVI src1 src2));
 5434   match(Set dst (AndReductionV  src1 src2));
 5435   match(Set dst ( OrReductionV  src1 src2));
 5436   match(Set dst (XorReductionV  src1 src2));
 5437   match(Set dst (MinReductionV  src1 src2));
 5438   match(Set dst (MaxReductionV  src1 src2));
 5439   effect(TEMP vtmp1, TEMP vtmp2);
 5440   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5441   ins_encode %{
 5442     int opcode = this->ideal_Opcode();
 5443     int vlen = Matcher::vector_length(this, $src2);
 5444     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5450   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5451   match(Set dst (AddReductionVI src1 src2));
 5452   match(Set dst (AndReductionV  src1 src2));
 5453   match(Set dst ( OrReductionV  src1 src2));
 5454   match(Set dst (XorReductionV  src1 src2));
 5455   match(Set dst (MinReductionV  src1 src2));
 5456   match(Set dst (MaxReductionV  src1 src2));
 5457   effect(TEMP vtmp1, TEMP vtmp2);
 5458   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5459   ins_encode %{
 5460     int opcode = this->ideal_Opcode();
 5461     int vlen = Matcher::vector_length(this, $src2);
 5462     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5463   %}
 5464   ins_pipe( pipe_slow );
 5465 %}
 5466 #endif
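
// The BW-gated pair above (and the DQ-gated long pair earlier) differ only in operand
// classes: without the AVX-512 subfeature the reduction is assumed to need the legacy
// xmm0-xmm15 range (legVec); with it the full register file (vec) can be used. (Assumed
// rationale, recorded here as a comment.)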
 5467 
 5468 // =======================Short Reduction==========================================
 5469 
 5470 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5471   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5472   match(Set dst (AddReductionVI src1 src2));
 5473   match(Set dst (MulReductionVI src1 src2));
 5474   match(Set dst (AndReductionV  src1 src2));
 5475   match(Set dst ( OrReductionV  src1 src2));
 5476   match(Set dst (XorReductionV  src1 src2));
 5477   match(Set dst (MinReductionV  src1 src2));
 5478   match(Set dst (MaxReductionV  src1 src2));
 5479   effect(TEMP vtmp1, TEMP vtmp2);
 5480   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5481   ins_encode %{
 5482     int opcode = this->ideal_Opcode();
 5483     int vlen = Matcher::vector_length(this, $src2);
 5484     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5485   %}
 5486   ins_pipe( pipe_slow );
 5487 %}
 5488 
 5489 // =======================Mul Reduction==========================================
 5490 
 5491 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5492   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5493             Matcher::vector_length(n->in(2)) <= 32); // src2
 5494   match(Set dst (MulReductionVI src1 src2));
 5495   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5496   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5497   ins_encode %{
 5498     int opcode = this->ideal_Opcode();
 5499     int vlen = Matcher::vector_length(this, $src2);
 5500     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5501   %}
 5502   ins_pipe( pipe_slow );
 5503 %}
 5504 
 5505 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5506   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5507             Matcher::vector_length(n->in(2)) == 64); // src2
 5508   match(Set dst (MulReductionVI src1 src2));
 5509   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5510   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5511   ins_encode %{
 5512     int opcode = this->ideal_Opcode();
 5513     int vlen = Matcher::vector_length(this, $src2);
 5514     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5515   %}
 5516   ins_pipe( pipe_slow );
 5517 %}
 5518 
 5519 //--------------------Min/Max Float Reduction --------------------
 5520 // Float Min Reduction
 5521 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5522                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5523   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5524             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5525              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5526             Matcher::vector_length(n->in(2)) == 2);
 5527   match(Set dst (MinReductionV src1 src2));
 5528   match(Set dst (MaxReductionV src1 src2));
 5529   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5530   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5531   ins_encode %{
 5532     assert(UseAVX > 0, "sanity");
 5533 
 5534     int opcode = this->ideal_Opcode();
 5535     int vlen = Matcher::vector_length(this, $src2);
 5536     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5537                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
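
// The immF src1 above is constrained by the predicate to be the reduction identity
// (+Inf for min, -Inf for max); since e.g. min(+Inf, x) == x, reduceFloatMinMax() can
// ignore it, which appears to be what the 'false' argument conveys. The *_av variants
// below accumulate into dst instead and pass 'true'. (Interpretive comment only.)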
 5541 
 5542 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5543                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5544   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5545             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5546              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5547             Matcher::vector_length(n->in(2)) >= 4);
 5548   match(Set dst (MinReductionV src1 src2));
 5549   match(Set dst (MaxReductionV src1 src2));
 5550   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5551   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5552   ins_encode %{
 5553     assert(UseAVX > 0, "sanity");
 5554 
 5555     int opcode = this->ideal_Opcode();
 5556     int vlen = Matcher::vector_length(this, $src2);
 5557     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5558                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5559   %}
 5560   ins_pipe( pipe_slow );
 5561 %}
 5562 
 5563 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5564                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5565   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5566             Matcher::vector_length(n->in(2)) == 2);
 5567   match(Set dst (MinReductionV dst src));
 5568   match(Set dst (MaxReductionV dst src));
 5569   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5570   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5571   ins_encode %{
 5572     assert(UseAVX > 0, "sanity");
 5573 
 5574     int opcode = this->ideal_Opcode();
 5575     int vlen = Matcher::vector_length(this, $src);
 5576     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5577                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5578   %}
 5579   ins_pipe( pipe_slow );
 5580 %}
 5581 
 5583 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5584                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5585   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5586             Matcher::vector_length(n->in(2)) >= 4);
 5587   match(Set dst (MinReductionV dst src));
 5588   match(Set dst (MaxReductionV dst src));
 5589   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5590   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5591   ins_encode %{
 5592     assert(UseAVX > 0, "sanity");
 5593 
 5594     int opcode = this->ideal_Opcode();
 5595     int vlen = Matcher::vector_length(this, $src);
 5596     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5597                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5598   %}
 5599   ins_pipe( pipe_slow );
 5600 %}
 5601 
 5603 //--------------------Min Double Reduction --------------------
 5604 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5605                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5606                             rFlagsReg cr) %{
 5607   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5608             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5609              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5610             Matcher::vector_length(n->in(2)) == 2);
 5611   match(Set dst (MinReductionV src1 src2));
 5612   match(Set dst (MaxReductionV src1 src2));
 5613   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5614   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5615   ins_encode %{
 5616     assert(UseAVX > 0, "sanity");
 5617 
 5618     int opcode = this->ideal_Opcode();
 5619     int vlen = Matcher::vector_length(this, $src2);
 5620     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5621                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5622   %}
 5623   ins_pipe( pipe_slow );
 5624 %}
 5625 
 5626 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5627                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5628                            rFlagsReg cr) %{
 5629   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5630             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5631              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5632             Matcher::vector_length(n->in(2)) >= 4);
 5633   match(Set dst (MinReductionV src1 src2));
 5634   match(Set dst (MaxReductionV src1 src2));
 5635   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5636   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5637   ins_encode %{
 5638     assert(UseAVX > 0, "sanity");
 5639 
 5640     int opcode = this->ideal_Opcode();
 5641     int vlen = Matcher::vector_length(this, $src2);
 5642     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5643                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5644   %}
 5645   ins_pipe( pipe_slow );
 5646 %}
 5647 
 5649 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5650                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5651                                rFlagsReg cr) %{
 5652   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5653             Matcher::vector_length(n->in(2)) == 2);
 5654   match(Set dst (MinReductionV dst src));
 5655   match(Set dst (MaxReductionV dst src));
 5656   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5657   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5658   ins_encode %{
 5659     assert(UseAVX > 0, "sanity");
 5660 
 5661     int opcode = this->ideal_Opcode();
 5662     int vlen = Matcher::vector_length(this, $src);
 5663     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5664                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5665   %}
 5666   ins_pipe( pipe_slow );
 5667 %}
 5668 
 5669 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5670                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5671                               rFlagsReg cr) %{
 5672   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5673             Matcher::vector_length(n->in(2)) >= 4);
 5674   match(Set dst (MinReductionV dst src));
 5675   match(Set dst (MaxReductionV dst src));
 5676   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5677   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5678   ins_encode %{
 5679     assert(UseAVX > 0, "sanity");
 5680 
 5681     int opcode = this->ideal_Opcode();
 5682     int vlen = Matcher::vector_length(this, $src);
 5683     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5684                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5685   %}
 5686   ins_pipe( pipe_slow );
 5687 %}
 5688 
 5689 // ====================VECTOR ARITHMETIC=======================================
 5690 
 5691 // --------------------------------- ADD --------------------------------------
 5692 
 5693 // Bytes vector add
 5694 instruct vaddB(vec dst, vec src) %{
 5695   predicate(UseAVX == 0);
 5696   match(Set dst (AddVB dst src));
 5697   format %{ "paddb   $dst,$src\t! add packedB" %}
 5698   ins_encode %{
 5699     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5700   %}
 5701   ins_pipe( pipe_slow );
 5702 %}
 5703 
 5704 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5705   predicate(UseAVX > 0);
 5706   match(Set dst (AddVB src1 src2));
 5707   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5708   ins_encode %{
 5709     int vlen_enc = vector_length_encoding(this);
 5710     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5711   %}
 5712   ins_pipe( pipe_slow );
 5713 %}
 5714 
 5715 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5716   predicate((UseAVX > 0) &&
 5717             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5718   match(Set dst (AddVB src (LoadVector mem)));
 5719   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5720   ins_encode %{
 5721     int vlen_enc = vector_length_encoding(this);
 5722     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5723   %}
 5724   ins_pipe( pipe_slow );
 5725 %}
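
// The "> 8" byte check on this and the other *_mem rules presumably keeps load-op folding
// to vectors of at least 128 bits: for an 8-byte vector the folded load would still fetch
// 16 bytes and could read past the operand in memory. (Assumed rationale, comment only.)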
 5726 
 5727 // Shorts/Chars vector add
 5728 instruct vaddS(vec dst, vec src) %{
 5729   predicate(UseAVX == 0);
 5730   match(Set dst (AddVS dst src));
 5731   format %{ "paddw   $dst,$src\t! add packedS" %}
 5732   ins_encode %{
 5733     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5734   %}
 5735   ins_pipe( pipe_slow );
 5736 %}
 5737 
 5738 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5739   predicate(UseAVX > 0);
 5740   match(Set dst (AddVS src1 src2));
 5741   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5742   ins_encode %{
 5743     int vlen_enc = vector_length_encoding(this);
 5744     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5745   %}
 5746   ins_pipe( pipe_slow );
 5747 %}
 5748 
 5749 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5750   predicate((UseAVX > 0) &&
 5751             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5752   match(Set dst (AddVS src (LoadVector mem)));
 5753   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5754   ins_encode %{
 5755     int vlen_enc = vector_length_encoding(this);
 5756     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5757   %}
 5758   ins_pipe( pipe_slow );
 5759 %}
 5760 
 5761 // Integers vector add
 5762 instruct vaddI(vec dst, vec src) %{
 5763   predicate(UseAVX == 0);
 5764   match(Set dst (AddVI dst src));
 5765   format %{ "paddd   $dst,$src\t! add packedI" %}
 5766   ins_encode %{
 5767     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5768   %}
 5769   ins_pipe( pipe_slow );
 5770 %}
 5771 
 5772 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5773   predicate(UseAVX > 0);
 5774   match(Set dst (AddVI src1 src2));
 5775   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5776   ins_encode %{
 5777     int vlen_enc = vector_length_encoding(this);
 5778     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5779   %}
 5780   ins_pipe( pipe_slow );
 5781 %}
 5782 
 5784 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5785   predicate((UseAVX > 0) &&
 5786             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5787   match(Set dst (AddVI src (LoadVector mem)));
 5788   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5789   ins_encode %{
 5790     int vlen_enc = vector_length_encoding(this);
 5791     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5792   %}
 5793   ins_pipe( pipe_slow );
 5794 %}
 5795 
 5796 // Longs vector add
 5797 instruct vaddL(vec dst, vec src) %{
 5798   predicate(UseAVX == 0);
 5799   match(Set dst (AddVL dst src));
 5800   format %{ "paddq   $dst,$src\t! add packedL" %}
 5801   ins_encode %{
 5802     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5803   %}
 5804   ins_pipe( pipe_slow );
 5805 %}
 5806 
 5807 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5808   predicate(UseAVX > 0);
 5809   match(Set dst (AddVL src1 src2));
 5810   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5811   ins_encode %{
 5812     int vlen_enc = vector_length_encoding(this);
 5813     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5814   %}
 5815   ins_pipe( pipe_slow );
 5816 %}
 5817 
 5818 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5819   predicate((UseAVX > 0) &&
 5820             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5821   match(Set dst (AddVL src (LoadVector mem)));
 5822   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5823   ins_encode %{
 5824     int vlen_enc = vector_length_encoding(this);
 5825     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5826   %}
 5827   ins_pipe( pipe_slow );
 5828 %}
 5829 
 5830 // Floats vector add
 5831 instruct vaddF(vec dst, vec src) %{
 5832   predicate(UseAVX == 0);
 5833   match(Set dst (AddVF dst src));
 5834   format %{ "addps   $dst,$src\t! add packedF" %}
 5835   ins_encode %{
 5836     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5837   %}
 5838   ins_pipe( pipe_slow );
 5839 %}
 5840 
 5841 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5842   predicate(UseAVX > 0);
 5843   match(Set dst (AddVF src1 src2));
 5844   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5845   ins_encode %{
 5846     int vlen_enc = vector_length_encoding(this);
 5847     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5848   %}
 5849   ins_pipe( pipe_slow );
 5850 %}
 5851 
 5852 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5853   predicate((UseAVX > 0) &&
 5854             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5855   match(Set dst (AddVF src (LoadVector mem)));
 5856   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5857   ins_encode %{
 5858     int vlen_enc = vector_length_encoding(this);
 5859     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5860   %}
 5861   ins_pipe( pipe_slow );
 5862 %}
 5863 
 5864 // Doubles vector add
 5865 instruct vaddD(vec dst, vec src) %{
 5866   predicate(UseAVX == 0);
 5867   match(Set dst (AddVD dst src));
 5868   format %{ "addpd   $dst,$src\t! add packedD" %}
 5869   ins_encode %{
 5870     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5871   %}
 5872   ins_pipe( pipe_slow );
 5873 %}
 5874 
 5875 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5876   predicate(UseAVX > 0);
 5877   match(Set dst (AddVD src1 src2));
 5878   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5879   ins_encode %{
 5880     int vlen_enc = vector_length_encoding(this);
 5881     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5882   %}
 5883   ins_pipe( pipe_slow );
 5884 %}
 5885 
 5886 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5887   predicate((UseAVX > 0) &&
 5888             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5889   match(Set dst (AddVD src (LoadVector mem)));
 5890   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5891   ins_encode %{
 5892     int vlen_enc = vector_length_encoding(this);
 5893     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5894   %}
 5895   ins_pipe( pipe_slow );
 5896 %}
 5897 
 5898 // --------------------------------- SUB --------------------------------------
 5899 
 5900 // Bytes vector sub
 5901 instruct vsubB(vec dst, vec src) %{
 5902   predicate(UseAVX == 0);
 5903   match(Set dst (SubVB dst src));
 5904   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5905   ins_encode %{
 5906     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5907   %}
 5908   ins_pipe( pipe_slow );
 5909 %}
 5910 
 5911 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5912   predicate(UseAVX > 0);
 5913   match(Set dst (SubVB src1 src2));
 5914   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5915   ins_encode %{
 5916     int vlen_enc = vector_length_encoding(this);
 5917     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5918   %}
 5919   ins_pipe( pipe_slow );
 5920 %}
 5921 
 5922 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5923   predicate((UseAVX > 0) &&
 5924             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5925   match(Set dst (SubVB src (LoadVector mem)));
 5926   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5927   ins_encode %{
 5928     int vlen_enc = vector_length_encoding(this);
 5929     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5930   %}
 5931   ins_pipe( pipe_slow );
 5932 %}
 5933 
 5934 // Shorts/Chars vector sub
 5935 instruct vsubS(vec dst, vec src) %{
 5936   predicate(UseAVX == 0);
 5937   match(Set dst (SubVS dst src));
 5938   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5939   ins_encode %{
 5940     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5941   %}
 5942   ins_pipe( pipe_slow );
 5943 %}
 5944 
 5946 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5947   predicate(UseAVX > 0);
 5948   match(Set dst (SubVS src1 src2));
 5949   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5950   ins_encode %{
 5951     int vlen_enc = vector_length_encoding(this);
 5952     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5953   %}
 5954   ins_pipe( pipe_slow );
 5955 %}
 5956 
 5957 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5958   predicate((UseAVX > 0) &&
 5959             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5960   match(Set dst (SubVS src (LoadVector mem)));
 5961   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5962   ins_encode %{
 5963     int vlen_enc = vector_length_encoding(this);
 5964     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5965   %}
 5966   ins_pipe( pipe_slow );
 5967 %}
 5968 
 5969 // Integers vector sub
 5970 instruct vsubI(vec dst, vec src) %{
 5971   predicate(UseAVX == 0);
 5972   match(Set dst (SubVI dst src));
 5973   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5974   ins_encode %{
 5975     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5976   %}
 5977   ins_pipe( pipe_slow );
 5978 %}
 5979 
 5980 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5981   predicate(UseAVX > 0);
 5982   match(Set dst (SubVI src1 src2));
 5983   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5984   ins_encode %{
 5985     int vlen_enc = vector_length_encoding(this);
 5986     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5987   %}
 5988   ins_pipe( pipe_slow );
 5989 %}
 5990 
 5991 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5992   predicate((UseAVX > 0) &&
 5993             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5994   match(Set dst (SubVI src (LoadVector mem)));
 5995   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5996   ins_encode %{
 5997     int vlen_enc = vector_length_encoding(this);
 5998     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5999   %}
 6000   ins_pipe( pipe_slow );
 6001 %}
 6002 
 6003 // Longs vector sub
 6004 instruct vsubL(vec dst, vec src) %{
 6005   predicate(UseAVX == 0);
 6006   match(Set dst (SubVL dst src));
 6007   format %{ "psubq   $dst,$src\t! sub packedL" %}
 6008   ins_encode %{
 6009     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 6010   %}
 6011   ins_pipe( pipe_slow );
 6012 %}
 6013 
 6014 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 6015   predicate(UseAVX > 0);
 6016   match(Set dst (SubVL src1 src2));
 6017   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 6018   ins_encode %{
 6019     int vlen_enc = vector_length_encoding(this);
 6020     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6021   %}
 6022   ins_pipe( pipe_slow );
 6023 %}
 6024 
 6026 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 6027   predicate((UseAVX > 0) &&
 6028             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6029   match(Set dst (SubVL src (LoadVector mem)));
 6030   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 6031   ins_encode %{
 6032     int vlen_enc = vector_length_encoding(this);
 6033     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6034   %}
 6035   ins_pipe( pipe_slow );
 6036 %}
 6037 
 6038 // Floats vector sub
 6039 instruct vsubF(vec dst, vec src) %{
 6040   predicate(UseAVX == 0);
 6041   match(Set dst (SubVF dst src));
 6042   format %{ "subps   $dst,$src\t! sub packedF" %}
 6043   ins_encode %{
 6044     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 6045   %}
 6046   ins_pipe( pipe_slow );
 6047 %}
 6048 
 6049 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 6050   predicate(UseAVX > 0);
 6051   match(Set dst (SubVF src1 src2));
 6052   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 6053   ins_encode %{
 6054     int vlen_enc = vector_length_encoding(this);
 6055     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6056   %}
 6057   ins_pipe( pipe_slow );
 6058 %}
 6059 
 6060 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 6061   predicate((UseAVX > 0) &&
 6062             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6063   match(Set dst (SubVF src (LoadVector mem)));
 6064   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 6065   ins_encode %{
 6066     int vlen_enc = vector_length_encoding(this);
 6067     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6068   %}
 6069   ins_pipe( pipe_slow );
 6070 %}
 6071 
 6072 // Doubles vector sub
 6073 instruct vsubD(vec dst, vec src) %{
 6074   predicate(UseAVX == 0);
 6075   match(Set dst (SubVD dst src));
 6076   format %{ "subpd   $dst,$src\t! sub packedD" %}
 6077   ins_encode %{
 6078     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 6079   %}
 6080   ins_pipe( pipe_slow );
 6081 %}
 6082 
 6083 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 6084   predicate(UseAVX > 0);
 6085   match(Set dst (SubVD src1 src2));
 6086   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 6087   ins_encode %{
 6088     int vlen_enc = vector_length_encoding(this);
 6089     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6090   %}
 6091   ins_pipe( pipe_slow );
 6092 %}
 6093 
 6094 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 6095   predicate((UseAVX > 0) &&
 6096             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6097   match(Set dst (SubVD src (LoadVector mem)));
 6098   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 6099   ins_encode %{
 6100     int vlen_enc = vector_length_encoding(this);
 6101     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6102   %}
 6103   ins_pipe( pipe_slow );
 6104 %}
 6105 
 6106 // --------------------------------- MUL --------------------------------------
 6107 
 6108 // Byte vector mul
 6109 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 6110   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 6111   match(Set dst (MulVB src1 src2));
 6112   effect(TEMP dst, TEMP xtmp);
 6113   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6114   ins_encode %{
 6115     assert(UseSSE > 3, "required");
 6116     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 6117     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6118     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6119     __ psllw($dst$$XMMRegister, 8);
 6120     __ psrlw($dst$$XMMRegister, 8);
 6121     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6122   %}
 6123   ins_pipe( pipe_slow );
 6124 %}
 6125 
 6126 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 6127   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 6128   match(Set dst (MulVB src1 src2));
 6129   effect(TEMP dst, TEMP xtmp);
 6130   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6131   ins_encode %{
 6132     assert(UseSSE > 3, "required");
 6133     // Odd-index elements
 6134     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 6135     __ psrlw($dst$$XMMRegister, 8);
 6136     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 6137     __ psrlw($xtmp$$XMMRegister, 8);
 6138     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 6139     __ psllw($dst$$XMMRegister, 8);
 6140     // Even-index elements
 6141     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6142     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 6143     __ psllw($xtmp$$XMMRegister, 8);
 6144     __ psrlw($xtmp$$XMMRegister, 8);
 6145     // Combine
 6146     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 6147   %}
 6148   ins_pipe( pipe_slow );
 6149 %}
 6150 
 6151 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6152   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 6153   match(Set dst (MulVB src1 src2));
 6154   effect(TEMP xtmp1, TEMP xtmp2);
 6155   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6156   ins_encode %{
 6157     int vlen_enc = vector_length_encoding(this);
 6158     // Odd-index elements
 6159     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 6160     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 6161     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6162     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 6163     // Even-index elements
 6164     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6165     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6166     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 6167     // Combine
 6168     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6169   %}
 6170   ins_pipe( pipe_slow );
 6171 %}
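
// Byte multiply technique used above (there is no packed 8-bit multiply): products are formed
// as 16-bit words with pmullw, separately for odd and even byte positions, then merged.
//   odd path : shift both inputs right by 8, multiply, shift left by 8 (keeps the low 8 bits
//              of each odd product in the odd byte)
//   even path: multiply in place, then psllw+psrlw masks each word to its low byte
//   por/vpor : combines the two halves into the packed byte result.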
 6172 
 6173 // Shorts/Chars vector mul
 6174 instruct vmulS(vec dst, vec src) %{
 6175   predicate(UseAVX == 0);
 6176   match(Set dst (MulVS dst src));
 6177   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6178   ins_encode %{
 6179     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6180   %}
 6181   ins_pipe( pipe_slow );
 6182 %}
 6183 
 6184 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6185   predicate(UseAVX > 0);
 6186   match(Set dst (MulVS src1 src2));
 6187   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6188   ins_encode %{
 6189     int vlen_enc = vector_length_encoding(this);
 6190     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6191   %}
 6192   ins_pipe( pipe_slow );
 6193 %}
 6194 
 6195 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6196   predicate((UseAVX > 0) &&
 6197             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6198   match(Set dst (MulVS src (LoadVector mem)));
 6199   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6200   ins_encode %{
 6201     int vlen_enc = vector_length_encoding(this);
 6202     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6203   %}
 6204   ins_pipe( pipe_slow );
 6205 %}
 6206 
 6207 // Integers vector mul
 6208 instruct vmulI(vec dst, vec src) %{
 6209   predicate(UseAVX == 0);
 6210   match(Set dst (MulVI dst src));
 6211   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6212   ins_encode %{
 6213     assert(UseSSE > 3, "required");
 6214     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6215   %}
 6216   ins_pipe( pipe_slow );
 6217 %}
 6218 
 6219 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6220   predicate(UseAVX > 0);
 6221   match(Set dst (MulVI src1 src2));
 6222   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6223   ins_encode %{
 6224     int vlen_enc = vector_length_encoding(this);
 6225     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6226   %}
 6227   ins_pipe( pipe_slow );
 6228 %}
 6229 
 6230 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6231   predicate((UseAVX > 0) &&
 6232             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6233   match(Set dst (MulVI src (LoadVector mem)));
 6234   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6235   ins_encode %{
 6236     int vlen_enc = vector_length_encoding(this);
 6237     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 // Longs vector mul
 6243 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6244   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6245              VM_Version::supports_avx512dq()) ||
 6246             VM_Version::supports_avx512vldq());
 6247   match(Set dst (MulVL src1 src2));
 6248   ins_cost(500);
 6249   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6250   ins_encode %{
 6251     assert(UseAVX > 2, "required");
 6252     int vlen_enc = vector_length_encoding(this);
 6253     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6254   %}
 6255   ins_pipe( pipe_slow );
 6256 %}
 6257 
 6258 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6259   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6260              VM_Version::supports_avx512dq()) ||
 6261             (Matcher::vector_length_in_bytes(n) > 8 &&
 6262              VM_Version::supports_avx512vldq()));
 6263   match(Set dst (MulVL src (LoadVector mem)));
 6264   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6265   ins_cost(500);
 6266   ins_encode %{
 6267     assert(UseAVX > 2, "required");
 6268     int vlen_enc = vector_length_encoding(this);
 6269     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6270   %}
 6271   ins_pipe( pipe_slow );
 6272 %}
 6273 
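// Without a packed 64x64->64 multiply (pre AVX-512DQ), the MulVL rules below
// build the product from 32-bit halves; sketch of the identity used:
//   (aHi*2^32 + aLo) * (bHi*2^32 + bLo) mod 2^64
//       == aLo*bLo + ((aLo*bHi + aHi*bLo) << 32)
// pshufd/pmulld form the two cross products, paddd sums them, psllq moves the
// sum into the upper half, and pmuludq contributes the full 64-bit aLo*bLo
// term. The aHi*bHi term only affects bits >= 64 and is dropped.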
 6274 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6275   predicate(UseAVX == 0);
 6276   match(Set dst (MulVL src1 src2));
 6277   ins_cost(500);
 6278   effect(TEMP dst, TEMP xtmp);
 6279   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6280   ins_encode %{
 6281     assert(VM_Version::supports_sse4_1(), "required");
 6282     // Get the lo-hi products; only the lower 32 bits are of concern
 6283     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6284     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6285     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6286     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6287     __ psllq($dst$$XMMRegister, 32);
 6288     // Get the lo-lo products
 6289     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6290     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6291     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6292   %}
 6293   ins_pipe( pipe_slow );
 6294 %}
 6295 
 6296 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6297   predicate(UseAVX > 0 &&
 6298             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6299               !VM_Version::supports_avx512dq()) ||
 6300              (Matcher::vector_length_in_bytes(n) < 64 &&
 6301               !VM_Version::supports_avx512vldq())));
 6302   match(Set dst (MulVL src1 src2));
 6303   effect(TEMP xtmp1, TEMP xtmp2);
 6304   ins_cost(500);
 6305   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6306   ins_encode %{
 6307     int vlen_enc = vector_length_encoding(this);
 6308     // Get the lo-hi products; only the lower 32 bits are of concern
 6309     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6310     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6311     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6312     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6313     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6314     // Get the lo-lo products
 6315     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6316     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6317   %}
 6318   ins_pipe( pipe_slow );
 6319 %}
 6320 
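// When both MulVL inputs are known to be zero-extended (has_uint_inputs) or
// sign-extended (has_int_inputs) 32-bit values, the full decomposition above
// is unnecessary: a single vpmuludq (unsigned 32x32->64) or vpmuldq (signed
// 32x32->64) already yields the exact 64-bit product, hence the lower cost
// of the rules below.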
 6321 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6322   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6323   match(Set dst (MulVL src1 src2));
 6324   ins_cost(100);
 6325   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6326   ins_encode %{
 6327     int vlen_enc = vector_length_encoding(this);
 6328     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6329   %}
 6330   ins_pipe( pipe_slow );
 6331 %}
 6332 
 6333 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6334   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6335   match(Set dst (MulVL src1 src2));
 6336   ins_cost(100);
 6337   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6338   ins_encode %{
 6339     int vlen_enc = vector_length_encoding(this);
 6340     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6341   %}
 6342   ins_pipe( pipe_slow );
 6343 %}
 6344 
 6345 // Floats vector mul
 6346 instruct vmulF(vec dst, vec src) %{
 6347   predicate(UseAVX == 0);
 6348   match(Set dst (MulVF dst src));
 6349   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6350   ins_encode %{
 6351     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6352   %}
 6353   ins_pipe( pipe_slow );
 6354 %}
 6355 
 6356 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6357   predicate(UseAVX > 0);
 6358   match(Set dst (MulVF src1 src2));
 6359   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6360   ins_encode %{
 6361     int vlen_enc = vector_length_encoding(this);
 6362     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6363   %}
 6364   ins_pipe( pipe_slow );
 6365 %}
 6366 
 6367 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6368   predicate((UseAVX > 0) &&
 6369             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6370   match(Set dst (MulVF src (LoadVector mem)));
 6371   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6372   ins_encode %{
 6373     int vlen_enc = vector_length_encoding(this);
 6374     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6375   %}
 6376   ins_pipe( pipe_slow );
 6377 %}
 6378 
 6379 // Doubles vector mul
 6380 instruct vmulD(vec dst, vec src) %{
 6381   predicate(UseAVX == 0);
 6382   match(Set dst (MulVD dst src));
 6383   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6384   ins_encode %{
 6385     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6386   %}
 6387   ins_pipe( pipe_slow );
 6388 %}
 6389 
 6390 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6391   predicate(UseAVX > 0);
 6392   match(Set dst (MulVD src1 src2));
 6393   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6394   ins_encode %{
 6395     int vlen_enc = vector_length_encoding(this);
 6396     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6397   %}
 6398   ins_pipe( pipe_slow );
 6399 %}
 6400 
 6401 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6402   predicate((UseAVX > 0) &&
 6403             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6404   match(Set dst (MulVD src (LoadVector mem)));
 6405   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6406   ins_encode %{
 6407     int vlen_enc = vector_length_encoding(this);
 6408     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6409   %}
 6410   ins_pipe( pipe_slow );
 6411 %}
 6412 
 6413 // --------------------------------- DIV --------------------------------------
 6414 
 6415 // Floats vector div
 6416 instruct vdivF(vec dst, vec src) %{
 6417   predicate(UseAVX == 0);
 6418   match(Set dst (DivVF dst src));
 6419   format %{ "divps   $dst,$src\t! div packedF" %}
 6420   ins_encode %{
 6421     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6422   %}
 6423   ins_pipe( pipe_slow );
 6424 %}
 6425 
 6426 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6427   predicate(UseAVX > 0);
 6428   match(Set dst (DivVF src1 src2));
 6429   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6430   ins_encode %{
 6431     int vlen_enc = vector_length_encoding(this);
 6432     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6433   %}
 6434   ins_pipe( pipe_slow );
 6435 %}
 6436 
 6437 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6438   predicate((UseAVX > 0) &&
 6439             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6440   match(Set dst (DivVF src (LoadVector mem)));
 6441   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6442   ins_encode %{
 6443     int vlen_enc = vector_length_encoding(this);
 6444     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6445   %}
 6446   ins_pipe( pipe_slow );
 6447 %}
 6448 
 6449 // Doubles vector div
 6450 instruct vdivD(vec dst, vec src) %{
 6451   predicate(UseAVX == 0);
 6452   match(Set dst (DivVD dst src));
 6453   format %{ "divpd   $dst,$src\t! div packedD" %}
 6454   ins_encode %{
 6455     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6456   %}
 6457   ins_pipe( pipe_slow );
 6458 %}
 6459 
 6460 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6461   predicate(UseAVX > 0);
 6462   match(Set dst (DivVD src1 src2));
 6463   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6464   ins_encode %{
 6465     int vlen_enc = vector_length_encoding(this);
 6466     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6467   %}
 6468   ins_pipe( pipe_slow );
 6469 %}
 6470 
 6471 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6472   predicate((UseAVX > 0) &&
 6473             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6474   match(Set dst (DivVD src (LoadVector mem)));
 6475   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6476   ins_encode %{
 6477     int vlen_enc = vector_length_encoding(this);
 6478     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6479   %}
 6480   ins_pipe( pipe_slow );
 6481 %}
 6482 
 6483 // ------------------------------ MinMax ---------------------------------------
 6484 
 6485 // Byte, Short, Int vector Min/Max
 6486 instruct minmax_reg_sse(vec dst, vec src) %{
 6487   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6488             UseAVX == 0);
 6489   match(Set dst (MinV dst src));
 6490   match(Set dst (MaxV dst src));
 6491   format %{ "vector_minmax  $dst,$src\t!  " %}
 6492   ins_encode %{
 6493     assert(UseSSE >= 4, "required");
 6494 
 6495     int opcode = this->ideal_Opcode();
 6496     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6497     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6498   %}
 6499   ins_pipe( pipe_slow );
 6500 %}
 6501 
 6502 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6503   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6504             UseAVX > 0);
 6505   match(Set dst (MinV src1 src2));
 6506   match(Set dst (MaxV src1 src2));
 6507   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6508   ins_encode %{
 6509     int opcode = this->ideal_Opcode();
 6510     int vlen_enc = vector_length_encoding(this);
 6511     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6512 
 6513     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6514   %}
 6515   ins_pipe( pipe_slow );
 6516 %}
 6517 
 6518 // Long vector Min/Max
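// The SSE4.1 variable blend instructions (pblendvb/blendvpd) read their mask
// implicitly from xmm0, which is presumably why the SSE long min/max rule
// below pins its temporary to rxmm0.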
 6519 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6520   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6521             UseAVX == 0);
 6522   match(Set dst (MinV dst src));
 6523   match(Set dst (MaxV src dst));
 6524   effect(TEMP dst, TEMP tmp);
 6525   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6526   ins_encode %{
 6527     assert(UseSSE >= 4, "required");
 6528 
 6529     int opcode = this->ideal_Opcode();
 6530     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6531     assert(elem_bt == T_LONG, "sanity");
 6532 
 6533     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6534   %}
 6535   ins_pipe( pipe_slow );
 6536 %}
 6537 
 6538 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6539   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6540             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6541   match(Set dst (MinV src1 src2));
 6542   match(Set dst (MaxV src1 src2));
 6543   effect(TEMP dst);
 6544   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6545   ins_encode %{
 6546     int vlen_enc = vector_length_encoding(this);
 6547     int opcode = this->ideal_Opcode();
 6548     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6549     assert(elem_bt == T_LONG, "sanity");
 6550 
 6551     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6552   %}
 6553   ins_pipe( pipe_slow );
 6554 %}
 6555 
 6556 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6557   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6558             Matcher::vector_element_basic_type(n) == T_LONG);
 6559   match(Set dst (MinV src1 src2));
 6560   match(Set dst (MaxV src1 src2));
 6561   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6562   ins_encode %{
 6563     assert(UseAVX > 2, "required");
 6564 
 6565     int vlen_enc = vector_length_encoding(this);
 6566     int opcode = this->ideal_Opcode();
 6567     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6568     assert(elem_bt == T_LONG, "sanity");
 6569 
 6570     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6571   %}
 6572   ins_pipe( pipe_slow );
 6573 %}
 6574 
 6575 // Float/Double vector Min/Max
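// Unlike the integral cases, FP min/max must match Java Math.min/max
// semantics: a NaN in either input yields NaN, and -0.0 compares smaller than
// +0.0 (Math.min(-0.0f, +0.0f) == -0.0f). Plain vminps/vmaxps return the
// second operand for NaN and treat +/-0.0 as equal, so the helpers below use
// extra blend temporaries (and a mask register on EVEX) to fix up those cases.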
 6576 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6577   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6578             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6579             UseAVX > 0);
 6580   match(Set dst (MinV a b));
 6581   match(Set dst (MaxV a b));
 6582   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6583   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6584   ins_encode %{
 6585     assert(UseAVX > 0, "required");
 6586 
 6587     int opcode = this->ideal_Opcode();
 6588     int vlen_enc = vector_length_encoding(this);
 6589     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6590 
 6591     __ vminmax_fp(opcode, elem_bt,
 6592                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6593                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6594   %}
 6595   ins_pipe( pipe_slow );
 6596 %}
 6597 
 6598 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6599   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6600             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6601   match(Set dst (MinV a b));
 6602   match(Set dst (MaxV a b));
 6603   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6604   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6605   ins_encode %{
 6606     assert(UseAVX > 2, "required");
 6607 
 6608     int opcode = this->ideal_Opcode();
 6609     int vlen_enc = vector_length_encoding(this);
 6610     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6611 
 6612     __ evminmax_fp(opcode, elem_bt,
 6613                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6614                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6615   %}
 6616   ins_pipe( pipe_slow );
 6617 %}
 6618 
 6619 // ------------------------------ Unsigned vector Min/Max ----------------------
 6620 
 6621 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6622   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6623   match(Set dst (UMinV a b));
 6624   match(Set dst (UMaxV a b));
 6625   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6626   ins_encode %{
 6627     int opcode = this->ideal_Opcode();
 6628     int vlen_enc = vector_length_encoding(this);
 6629     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6630     assert(is_integral_type(elem_bt), "");
 6631     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6632   %}
 6633   ins_pipe( pipe_slow );
 6634 %}
 6635 
 6636 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6637   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6638   match(Set dst (UMinV a (LoadVector b)));
 6639   match(Set dst (UMaxV a (LoadVector b)));
 6640   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6641   ins_encode %{
 6642     int opcode = this->ideal_Opcode();
 6643     int vlen_enc = vector_length_encoding(this);
 6644     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6645     assert(is_integral_type(elem_bt), "");
 6646     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6647   %}
 6648   ins_pipe( pipe_slow );
 6649 %}
 6650 
 6651 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6652   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6653   match(Set dst (UMinV a b));
 6654   match(Set dst (UMaxV a b));
 6655   effect(TEMP xtmp1, TEMP xtmp2);
 6656   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6657   ins_encode %{
 6658     int opcode = this->ideal_Opcode();
 6659     int vlen_enc = vector_length_encoding(this);
 6660     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6661   %}
 6662   ins_pipe( pipe_slow );
 6663 %}
 6664 
 6665 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6666   match(Set dst (UMinV (Binary dst src2) mask));
 6667   match(Set dst (UMaxV (Binary dst src2) mask));
 6668   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6669   ins_encode %{
 6670     int vlen_enc = vector_length_encoding(this);
 6671     BasicType bt = Matcher::vector_element_basic_type(this);
 6672     int opc = this->ideal_Opcode();
 6673     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6674                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6675   %}
 6676   ins_pipe( pipe_slow );
 6677 %}
 6678 
 6679 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6680   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6681   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6682   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6683   ins_encode %{
 6684     int vlen_enc = vector_length_encoding(this);
 6685     BasicType bt = Matcher::vector_element_basic_type(this);
 6686     int opc = this->ideal_Opcode();
 6687     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6688                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6689   %}
 6690   ins_pipe( pipe_slow );
 6691 %}
 6692 
 6693 // --------------------------------- Signum/CopySign ---------------------------
 6694 
 6695 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6696   match(Set dst (SignumF dst (Binary zero one)));
 6697   effect(KILL cr);
 6698   format %{ "signumF $dst, $dst" %}
 6699   ins_encode %{
 6700     int opcode = this->ideal_Opcode();
 6701     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6702   %}
 6703   ins_pipe( pipe_slow );
 6704 %}
 6705 
 6706 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6707   match(Set dst (SignumD dst (Binary zero one)));
 6708   effect(KILL cr);
 6709   format %{ "signumD $dst, $dst" %}
 6710   ins_encode %{
 6711     int opcode = this->ideal_Opcode();
 6712     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6713   %}
 6714   ins_pipe( pipe_slow );
 6715 %}
 6716 
 6717 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6718   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6719   match(Set dst (SignumVF src (Binary zero one)));
 6720   match(Set dst (SignumVD src (Binary zero one)));
 6721   effect(TEMP dst, TEMP xtmp1);
 6722   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6723   ins_encode %{
 6724     int opcode = this->ideal_Opcode();
 6725     int vec_enc = vector_length_encoding(this);
 6726     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6727                          $xtmp1$$XMMRegister, vec_enc);
 6728   %}
 6729   ins_pipe( pipe_slow );
 6730 %}
 6731 
 6732 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6733   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6734   match(Set dst (SignumVF src (Binary zero one)));
 6735   match(Set dst (SignumVD src (Binary zero one)));
 6736   effect(TEMP dst, TEMP ktmp1);
 6737   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6738   ins_encode %{
 6739     int opcode = this->ideal_Opcode();
 6740     int vec_enc = vector_length_encoding(this);
 6741     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6742                           $ktmp1$$KRegister, vec_enc);
 6743   %}
 6744   ins_pipe( pipe_slow );
 6745 %}
 6746 
 6747 // ---------------------------------------
 6748 // For copySign use 0xE4 as the truth-table immediate for vpternlog
 6749 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6750 // C (xmm2) is set to 0x7FFFFFFF (every bit except the sign bit)
 6751 // Wherever xmm2 is 0 (the sign bit), we want to pick from B (the sign operand)
 6752 // Wherever xmm2 is 1 (the magnitude bits), we want to pick from A (the magnitude operand)
 6753 //
 6754 // A B C Result
 6755 // 0 0 0 0
 6756 // 0 0 1 0
 6757 // 0 1 0 1
 6758 // 0 1 1 0
 6759 // 1 0 0 0
 6760 // 1 0 1 1
 6761 // 1 1 0 1
 6762 // 1 1 1 1
 6763 //
 6764 // Result, read from the high bit (111) down to the low bit (000), is 0b11100100 = 0xE4
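//
// Cross-check: vpternlog computes each result bit as imm8[(A<<2)|(B<<1)|C],
// so packing the Result column above (rows 000..111 giving 0,0,1,0,0,1,1,1):
//   imm = 0; for (int abc = 0; abc < 8; abc++) imm |= table[abc] << abc;   // == 0xE4
// (illustrative C only; 'table' is just the Result column above)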
 6765 // ---------------------------------------
 6766 
 6767 #ifdef _LP64
 6768 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6769   match(Set dst (CopySignF dst src));
 6770   effect(TEMP tmp1, TEMP tmp2);
 6771   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6772   ins_encode %{
 6773     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6774     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6775     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6776   %}
 6777   ins_pipe( pipe_slow );
 6778 %}
 6779 
 6780 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6781   match(Set dst (CopySignD dst (Binary src zero)));
 6782   ins_cost(100);
 6783   effect(TEMP tmp1, TEMP tmp2);
 6784   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6785   ins_encode %{
 6786     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6787     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6788     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6789   %}
 6790   ins_pipe( pipe_slow );
 6791 %}
 6792 
 6793 #endif // _LP64
 6794 
 6795 //----------------------------- CompressBits/ExpandBits ------------------------
 6796 
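// These rules map CompressBits to BMI2 PEXT (gather the source bits selected
// by the mask into the low bits of the result) and ExpandBits to BMI2 PDEP
// (scatter the low source bits to the mask-selected positions). Tiny example:
//   pext(0xFF, 0b10100000) == 0b11          // both selected bits of src are set
//   pdep(0b11, 0b10100000) == 0b10100000    // the two low bits scattered back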
 6797 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6798   predicate(n->bottom_type()->isa_int());
 6799   match(Set dst (CompressBits src mask));
 6800   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6801   ins_encode %{
 6802     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6803   %}
 6804   ins_pipe( pipe_slow );
 6805 %}
 6806 
 6807 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6808   predicate(n->bottom_type()->isa_int());
 6809   match(Set dst (ExpandBits src mask));
 6810   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6811   ins_encode %{
 6812     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6813   %}
 6814   ins_pipe( pipe_slow );
 6815 %}
 6816 
 6817 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6818   predicate(n->bottom_type()->isa_int());
 6819   match(Set dst (CompressBits src (LoadI mask)));
 6820   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6821   ins_encode %{
 6822     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6823   %}
 6824   ins_pipe( pipe_slow );
 6825 %}
 6826 
 6827 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6828   predicate(n->bottom_type()->isa_int());
 6829   match(Set dst (ExpandBits src (LoadI mask)));
 6830   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6831   ins_encode %{
 6832     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6833   %}
 6834   ins_pipe( pipe_slow );
 6835 %}
 6836 
 6837 // --------------------------------- Sqrt --------------------------------------
 6838 
 6839 instruct vsqrtF_reg(vec dst, vec src) %{
 6840   match(Set dst (SqrtVF src));
 6841   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6842   ins_encode %{
 6843     assert(UseAVX > 0, "required");
 6844     int vlen_enc = vector_length_encoding(this);
 6845     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6846   %}
 6847   ins_pipe( pipe_slow );
 6848 %}
 6849 
 6850 instruct vsqrtF_mem(vec dst, memory mem) %{
 6851   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6852   match(Set dst (SqrtVF (LoadVector mem)));
 6853   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6854   ins_encode %{
 6855     assert(UseAVX > 0, "required");
 6856     int vlen_enc = vector_length_encoding(this);
 6857     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6858   %}
 6859   ins_pipe( pipe_slow );
 6860 %}
 6861 
 6862 // Doubles vector sqrt
 6863 instruct vsqrtD_reg(vec dst, vec src) %{
 6864   match(Set dst (SqrtVD src));
 6865   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6866   ins_encode %{
 6867     assert(UseAVX > 0, "required");
 6868     int vlen_enc = vector_length_encoding(this);
 6869     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6870   %}
 6871   ins_pipe( pipe_slow );
 6872 %}
 6873 
 6874 instruct vsqrtD_mem(vec dst, memory mem) %{
 6875   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6876   match(Set dst (SqrtVD (LoadVector mem)));
 6877   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6878   ins_encode %{
 6879     assert(UseAVX > 0, "required");
 6880     int vlen_enc = vector_length_encoding(this);
 6881     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6882   %}
 6883   ins_pipe( pipe_slow );
 6884 %}
 6885 
 6886 // ------------------------------ Shift ---------------------------------------
 6887 
 6888 // Left and right shift count vectors are the same on x86
 6889 // (only lowest bits of xmm reg are used for count).
 6890 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6891   match(Set dst (LShiftCntV cnt));
 6892   match(Set dst (RShiftCntV cnt));
 6893   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6894   ins_encode %{
 6895     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6896   %}
 6897   ins_pipe( pipe_slow );
 6898 %}
 6899 
 6900 // Byte vector shift
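// There are no packed byte shift instructions, so the byte shift rules below
// widen the bytes to words (vextendbw, sign- or zero-extending depending on
// the shift kind), shift the words, mask each word back to its low byte and
// re-pack with packuswb.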
 6901 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6902   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6903   match(Set dst ( LShiftVB src shift));
 6904   match(Set dst ( RShiftVB src shift));
 6905   match(Set dst (URShiftVB src shift));
 6906   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6907   format %{"vector_byte_shift $dst,$src,$shift" %}
 6908   ins_encode %{
 6909     assert(UseSSE > 3, "required");
 6910     int opcode = this->ideal_Opcode();
 6911     bool sign = (opcode != Op_URShiftVB);
 6912     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6913     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6914     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6915     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6916     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6917   %}
 6918   ins_pipe( pipe_slow );
 6919 %}
 6920 
 6921 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6922   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6923             UseAVX <= 1);
 6924   match(Set dst ( LShiftVB src shift));
 6925   match(Set dst ( RShiftVB src shift));
 6926   match(Set dst (URShiftVB src shift));
 6927   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6928   format %{"vector_byte_shift $dst,$src,$shift" %}
 6929   ins_encode %{
 6930     assert(UseSSE > 3, "required");
 6931     int opcode = this->ideal_Opcode();
 6932     bool sign = (opcode != Op_URShiftVB);
 6933     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6934     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6935     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6936     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6937     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6938     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6939     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6940     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6941     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6942   %}
 6943   ins_pipe( pipe_slow );
 6944 %}
 6945 
 6946 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6947   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6948             UseAVX > 1);
 6949   match(Set dst ( LShiftVB src shift));
 6950   match(Set dst ( RShiftVB src shift));
 6951   match(Set dst (URShiftVB src shift));
 6952   effect(TEMP dst, TEMP tmp);
 6953   format %{"vector_byte_shift $dst,$src,$shift" %}
 6954   ins_encode %{
 6955     int opcode = this->ideal_Opcode();
 6956     bool sign = (opcode != Op_URShiftVB);
 6957     int vlen_enc = Assembler::AVX_256bit;
 6958     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6959     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6960     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6961     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6962     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6963   %}
 6964   ins_pipe( pipe_slow );
 6965 %}
 6966 
 6967 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6968   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6969   match(Set dst ( LShiftVB src shift));
 6970   match(Set dst ( RShiftVB src shift));
 6971   match(Set dst (URShiftVB src shift));
 6972   effect(TEMP dst, TEMP tmp);
 6973   format %{"vector_byte_shift $dst,$src,$shift" %}
 6974   ins_encode %{
 6975     assert(UseAVX > 1, "required");
 6976     int opcode = this->ideal_Opcode();
 6977     bool sign = (opcode != Op_URShiftVB);
 6978     int vlen_enc = Assembler::AVX_256bit;
 6979     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6980     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6981     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6982     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6983     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6984     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6985     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6986     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6987     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6988   %}
 6989   ins_pipe( pipe_slow );
 6990 %}
 6991 
 6992 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6993   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6994   match(Set dst ( LShiftVB src shift));
 6995   match(Set dst  (RShiftVB src shift));
 6996   match(Set dst (URShiftVB src shift));
 6997   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6998   format %{"vector_byte_shift $dst,$src,$shift" %}
 6999   ins_encode %{
 7000     assert(UseAVX > 2, "required");
 7001     int opcode = this->ideal_Opcode();
 7002     bool sign = (opcode != Op_URShiftVB);
 7003     int vlen_enc = Assembler::AVX_512bit;
 7004     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 7005     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 7006     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7007     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7008     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7009     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 7010     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7011     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7012     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7013     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 7014     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 7015     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7016   %}
 7017   ins_pipe( pipe_slow );
 7018 %}
 7019 
 7020 // A vector logical right shift of shorts produces an incorrect Java result
 7021 // for negative data, because Java promotes the short to an int with sign
 7022 // extension before shifting. Char vectors are fine since chars are
 7023 // unsigned values.
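// Worked example of the mismatch: for short s = (short)0xFFFF (i.e. -1),
// Java evaluates s >>> 2 as 0xFFFFFFFF >>> 2 == 0x3FFFFFFF after promotion
// to int, while a 16-bit logical shift in the vector lane would give 0x3FFF.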
 7024 // Shorts/Chars vector shift
 7025 instruct vshiftS(vec dst, vec src, vec shift) %{
 7026   predicate(!n->as_ShiftV()->is_var_shift());
 7027   match(Set dst ( LShiftVS src shift));
 7028   match(Set dst ( RShiftVS src shift));
 7029   match(Set dst (URShiftVS src shift));
 7030   effect(TEMP dst, USE src, USE shift);
 7031   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 7032   ins_encode %{
 7033     int opcode = this->ideal_Opcode();
 7034     if (UseAVX > 0) {
 7035       int vlen_enc = vector_length_encoding(this);
 7036       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7037     } else {
 7038       int vlen = Matcher::vector_length(this);
 7039       if (vlen == 2) {
 7040         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7041         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7042       } else if (vlen == 4) {
 7043         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7044         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7045       } else {
 7046         assert (vlen == 8, "sanity");
 7047         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7048         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7049       }
 7050     }
 7051   %}
 7052   ins_pipe( pipe_slow );
 7053 %}
 7054 
 7055 // Integers vector shift
 7056 instruct vshiftI(vec dst, vec src, vec shift) %{
 7057   predicate(!n->as_ShiftV()->is_var_shift());
 7058   match(Set dst ( LShiftVI src shift));
 7059   match(Set dst ( RShiftVI src shift));
 7060   match(Set dst (URShiftVI src shift));
 7061   effect(TEMP dst, USE src, USE shift);
 7062   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 7063   ins_encode %{
 7064     int opcode = this->ideal_Opcode();
 7065     if (UseAVX > 0) {
 7066       int vlen_enc = vector_length_encoding(this);
 7067       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7068     } else {
 7069       int vlen = Matcher::vector_length(this);
 7070       if (vlen == 2) {
 7071         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7072         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7073       } else {
 7074         assert(vlen == 4, "sanity");
 7075         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7076         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7077       }
 7078     }
 7079   %}
 7080   ins_pipe( pipe_slow );
 7081 %}
 7082 
 7083 // Integers vector constant shift
 7084 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 7085   match(Set dst (LShiftVI src (LShiftCntV shift)));
 7086   match(Set dst (RShiftVI src (RShiftCntV shift)));
 7087   match(Set dst (URShiftVI src (RShiftCntV shift)));
 7088   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 7089   ins_encode %{
 7090     int opcode = this->ideal_Opcode();
 7091     if (UseAVX > 0) {
 7092       int vector_len = vector_length_encoding(this);
 7093       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7094     } else {
 7095       int vlen = Matcher::vector_length(this);
 7096       if (vlen == 2) {
 7097         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 7098         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7099       } else {
 7100         assert(vlen == 4, "sanity");
 7101         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7102         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7103       }
 7104     }
 7105   %}
 7106   ins_pipe( pipe_slow );
 7107 %}
 7108 
 7109 // Longs vector shift
 7110 instruct vshiftL(vec dst, vec src, vec shift) %{
 7111   predicate(!n->as_ShiftV()->is_var_shift());
 7112   match(Set dst ( LShiftVL src shift));
 7113   match(Set dst (URShiftVL src shift));
 7114   effect(TEMP dst, USE src, USE shift);
 7115   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 7116   ins_encode %{
 7117     int opcode = this->ideal_Opcode();
 7118     if (UseAVX > 0) {
 7119       int vlen_enc = vector_length_encoding(this);
 7120       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7121     } else {
 7122       assert(Matcher::vector_length(this) == 2, "");
 7123       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7124       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 7125     }
 7126   %}
 7127   ins_pipe( pipe_slow );
 7128 %}
 7129 
 7130 // Longs vector constant shift
 7131 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 7132   match(Set dst (LShiftVL src (LShiftCntV shift)));
 7133   match(Set dst (URShiftVL src (RShiftCntV shift)));
 7134   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 7135   ins_encode %{
 7136     int opcode = this->ideal_Opcode();
 7137     if (UseAVX > 0) {
 7138       int vector_len = vector_length_encoding(this);
 7139       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 7140     } else {
 7141       assert(Matcher::vector_length(this) == 2, "");
 7142       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7143       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 7144     }
 7145   %}
 7146   ins_pipe( pipe_slow );
 7147 %}
 7148 
 7149 // ------------------- Arithmetic Right Shift -----------------------------
 7150 // Long vector arithmetic right shift
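// Neither SSE2 nor AVX2 has a packed 64-bit arithmetic right shift (vpsraq is
// AVX-512 only), so the rule below emulates it from the logical shift; per-lane
// sketch of the identity (illustrative C, not generated code):
//   uint64_t m   = 0x8000000000000000ULL >> n;                     // shifted sign-bit mask
//   int64_t  sra = (int64_t)((((uint64_t)x) >> n) ^ m) - (int64_t)m;
// which is what the psrlq/pxor/psubq (or vpsrlq/vpxor/vpsubq) sequence below
// computes, with vector_long_sign_mask() supplying the 0x8000... constant.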
 7151 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 7152   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 7153   match(Set dst (RShiftVL src shift));
 7154   effect(TEMP dst, TEMP tmp);
 7155   format %{ "vshiftq $dst,$src,$shift" %}
 7156   ins_encode %{
 7157     uint vlen = Matcher::vector_length(this);
 7158     if (vlen == 2) {
 7159       assert(UseSSE >= 2, "required");
 7160       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 7161       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7162       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7163       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7164       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7165       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7166     } else {
 7167       assert(vlen == 4, "sanity");
 7168       assert(UseAVX > 1, "required");
 7169       int vlen_enc = Assembler::AVX_256bit;
 7170       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7171       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7172       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7173       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7174       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7175     }
 7176   %}
 7177   ins_pipe( pipe_slow );
 7178 %}
 7179 
 7180 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7181   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7182   match(Set dst (RShiftVL src shift));
 7183   format %{ "vshiftq $dst,$src,$shift" %}
 7184   ins_encode %{
 7185     int vlen_enc = vector_length_encoding(this);
 7186     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7187   %}
 7188   ins_pipe( pipe_slow );
 7189 %}
 7190 
 7191 // ------------------- Variable Shift -----------------------------
 7192 // Byte variable shift
 7193 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7194   predicate(Matcher::vector_length(n) <= 8 &&
 7195             n->as_ShiftV()->is_var_shift() &&
 7196             !VM_Version::supports_avx512bw());
 7197   match(Set dst ( LShiftVB src shift));
 7198   match(Set dst ( RShiftVB src shift));
 7199   match(Set dst (URShiftVB src shift));
 7200   effect(TEMP dst, TEMP vtmp);
 7201   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7202   ins_encode %{
 7203     assert(UseAVX >= 2, "required");
 7204 
 7205     int opcode = this->ideal_Opcode();
 7206     int vlen_enc = Assembler::AVX_128bit;
 7207     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7208     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7209   %}
 7210   ins_pipe( pipe_slow );
 7211 %}
 7212 
 7213 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7214   predicate(Matcher::vector_length(n) == 16 &&
 7215             n->as_ShiftV()->is_var_shift() &&
 7216             !VM_Version::supports_avx512bw());
 7217   match(Set dst ( LShiftVB src shift));
 7218   match(Set dst ( RShiftVB src shift));
 7219   match(Set dst (URShiftVB src shift));
 7220   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7221   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7222   ins_encode %{
 7223     assert(UseAVX >= 2, "required");
 7224 
 7225     int opcode = this->ideal_Opcode();
 7226     int vlen_enc = Assembler::AVX_128bit;
 7227     // Shift lower half and get word result in dst
 7228     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7229 
 7230     // Shift upper half and get word result in vtmp1
 7231     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7232     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7233     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7234 
 7235     // Merge and down convert the two word results to byte in dst
 7236     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7237   %}
 7238   ins_pipe( pipe_slow );
 7239 %}
 7240 
 7241 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7242   predicate(Matcher::vector_length(n) == 32 &&
 7243             n->as_ShiftV()->is_var_shift() &&
 7244             !VM_Version::supports_avx512bw());
 7245   match(Set dst ( LShiftVB src shift));
 7246   match(Set dst ( RShiftVB src shift));
 7247   match(Set dst (URShiftVB src shift));
 7248   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7249   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7250   ins_encode %{
 7251     assert(UseAVX >= 2, "required");
 7252 
 7253     int opcode = this->ideal_Opcode();
 7254     int vlen_enc = Assembler::AVX_128bit;
 7255     // Process lower 128 bits and get result in dst
 7256     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7257     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7258     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7259     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7260     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7261 
 7262     // Process higher 128 bits and get result in vtmp3
 7263     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7264     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7265     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7266     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7267     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7268     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7269     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7270 
 7271     // Merge the two results in dst
 7272     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7273   %}
 7274   ins_pipe( pipe_slow );
 7275 %}
 7276 
 7277 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7278   predicate(Matcher::vector_length(n) <= 32 &&
 7279             n->as_ShiftV()->is_var_shift() &&
 7280             VM_Version::supports_avx512bw());
 7281   match(Set dst ( LShiftVB src shift));
 7282   match(Set dst ( RShiftVB src shift));
 7283   match(Set dst (URShiftVB src shift));
 7284   effect(TEMP dst, TEMP vtmp);
 7285   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7286   ins_encode %{
 7287     assert(UseAVX > 2, "required");
 7288 
 7289     int opcode = this->ideal_Opcode();
 7290     int vlen_enc = vector_length_encoding(this);
 7291     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7292   %}
 7293   ins_pipe( pipe_slow );
 7294 %}
 7295 
 7296 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7297   predicate(Matcher::vector_length(n) == 64 &&
 7298             n->as_ShiftV()->is_var_shift() &&
 7299             VM_Version::supports_avx512bw());
 7300   match(Set dst ( LShiftVB src shift));
 7301   match(Set dst ( RShiftVB src shift));
 7302   match(Set dst (URShiftVB src shift));
 7303   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7304   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7305   ins_encode %{
 7306     assert(UseAVX > 2, "required");
 7307 
 7308     int opcode = this->ideal_Opcode();
 7309     int vlen_enc = Assembler::AVX_256bit;
 7310     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7311     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7312     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7313     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7314     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7315   %}
 7316   ins_pipe( pipe_slow );
 7317 %}
 7318 
 7319 // Short variable shift
 7320 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7321   predicate(Matcher::vector_length(n) <= 8 &&
 7322             n->as_ShiftV()->is_var_shift() &&
 7323             !VM_Version::supports_avx512bw());
 7324   match(Set dst ( LShiftVS src shift));
 7325   match(Set dst ( RShiftVS src shift));
 7326   match(Set dst (URShiftVS src shift));
 7327   effect(TEMP dst, TEMP vtmp);
 7328   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp as TEMP" %}
 7329   ins_encode %{
 7330     assert(UseAVX >= 2, "required");
 7331 
 7332     int opcode = this->ideal_Opcode();
 7333     bool sign = (opcode != Op_URShiftVS);
 7334     int vlen_enc = Assembler::AVX_256bit;
 7335     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7336     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7337     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7338     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7339     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7340     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7341   %}
 7342   ins_pipe( pipe_slow );
 7343 %}
 7344 
 7345 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7346   predicate(Matcher::vector_length(n) == 16 &&
 7347             n->as_ShiftV()->is_var_shift() &&
 7348             !VM_Version::supports_avx512bw());
 7349   match(Set dst ( LShiftVS src shift));
 7350   match(Set dst ( RShiftVS src shift));
 7351   match(Set dst (URShiftVS src shift));
 7352   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7353   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 as TEMP" %}
 7354   ins_encode %{
 7355     assert(UseAVX >= 2, "required");
 7356 
 7357     int opcode = this->ideal_Opcode();
 7358     bool sign = (opcode != Op_URShiftVS);
 7359     int vlen_enc = Assembler::AVX_256bit;
 7360     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7361     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7362     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7363     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7364     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7365 
 7366     // Shift upper half, with result in dst using vtmp1 as TEMP
 7367     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7368     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7369     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7370     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7371     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7372     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7373 
 7374     // Merge lower and upper half result into dst
 7375     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7376     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7377   %}
 7378   ins_pipe( pipe_slow );
 7379 %}
 7380 
 7381 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7382   predicate(n->as_ShiftV()->is_var_shift() &&
 7383             VM_Version::supports_avx512bw());
 7384   match(Set dst ( LShiftVS src shift));
 7385   match(Set dst ( RShiftVS src shift));
 7386   match(Set dst (URShiftVS src shift));
 7387   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7388   ins_encode %{
 7389     assert(UseAVX > 2, "required");
 7390 
 7391     int opcode = this->ideal_Opcode();
 7392     int vlen_enc = vector_length_encoding(this);
 7393     if (!VM_Version::supports_avx512vl()) {
 7394       vlen_enc = Assembler::AVX_512bit;
 7395     }
 7396     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7397   %}
 7398   ins_pipe( pipe_slow );
 7399 %}
 7400 
 7401 // Integer variable shift
 7402 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7403   predicate(n->as_ShiftV()->is_var_shift());
 7404   match(Set dst ( LShiftVI src shift));
 7405   match(Set dst ( RShiftVI src shift));
 7406   match(Set dst (URShiftVI src shift));
 7407   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7408   ins_encode %{
 7409     assert(UseAVX >= 2, "required");
 7410 
 7411     int opcode = this->ideal_Opcode();
 7412     int vlen_enc = vector_length_encoding(this);
 7413     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7414   %}
 7415   ins_pipe( pipe_slow );
 7416 %}
 7417 
 7418 // Long variable shift
 7419 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7420   predicate(n->as_ShiftV()->is_var_shift());
 7421   match(Set dst ( LShiftVL src shift));
 7422   match(Set dst (URShiftVL src shift));
 7423   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7424   ins_encode %{
 7425     assert(UseAVX >= 2, "required");
 7426 
 7427     int opcode = this->ideal_Opcode();
 7428     int vlen_enc = vector_length_encoding(this);
 7429     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7430   %}
 7431   ins_pipe( pipe_slow );
 7432 %}
 7433 
 7434 // Long variable arithmetic right shift
 7435 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7436   predicate(Matcher::vector_length(n) <= 4 &&
 7437             n->as_ShiftV()->is_var_shift() &&
 7438             UseAVX == 2);
 7439   match(Set dst (RShiftVL src shift));
 7440   effect(TEMP dst, TEMP vtmp);
 7441   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7442   ins_encode %{
 7443     int opcode = this->ideal_Opcode();
 7444     int vlen_enc = vector_length_encoding(this);
 7445     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7446                  $vtmp$$XMMRegister);
 7447   %}
 7448   ins_pipe( pipe_slow );
 7449 %}
 7450 
 7451 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7452   predicate(n->as_ShiftV()->is_var_shift() &&
 7453             UseAVX > 2);
 7454   match(Set dst (RShiftVL src shift));
 7455   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7456   ins_encode %{
 7457     int opcode = this->ideal_Opcode();
 7458     int vlen_enc = vector_length_encoding(this);
 7459     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7460   %}
 7461   ins_pipe( pipe_slow );
 7462 %}
 7463 
 7464 // --------------------------------- AND --------------------------------------
 7465 
 7466 instruct vand(vec dst, vec src) %{
 7467   predicate(UseAVX == 0);
 7468   match(Set dst (AndV dst src));
 7469   format %{ "pand    $dst,$src\t! and vectors" %}
 7470   ins_encode %{
 7471     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7472   %}
 7473   ins_pipe( pipe_slow );
 7474 %}
 7475 
 7476 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7477   predicate(UseAVX > 0);
 7478   match(Set dst (AndV src1 src2));
 7479   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7480   ins_encode %{
 7481     int vlen_enc = vector_length_encoding(this);
 7482     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7483   %}
 7484   ins_pipe( pipe_slow );
 7485 %}
 7486 
 7487 instruct vand_mem(vec dst, vec src, memory mem) %{
 7488   predicate((UseAVX > 0) &&
 7489             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7490   match(Set dst (AndV src (LoadVector mem)));
 7491   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7492   ins_encode %{
 7493     int vlen_enc = vector_length_encoding(this);
 7494     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7495   %}
 7496   ins_pipe( pipe_slow );
 7497 %}
 7498 
 7499 // --------------------------------- OR ---------------------------------------
 7500 
 7501 instruct vor(vec dst, vec src) %{
 7502   predicate(UseAVX == 0);
 7503   match(Set dst (OrV dst src));
 7504   format %{ "por     $dst,$src\t! or vectors" %}
 7505   ins_encode %{
 7506     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7507   %}
 7508   ins_pipe( pipe_slow );
 7509 %}
 7510 
 7511 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7512   predicate(UseAVX > 0);
 7513   match(Set dst (OrV src1 src2));
 7514   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7515   ins_encode %{
 7516     int vlen_enc = vector_length_encoding(this);
 7517     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7518   %}
 7519   ins_pipe( pipe_slow );
 7520 %}
 7521 
 7522 instruct vor_mem(vec dst, vec src, memory mem) %{
 7523   predicate((UseAVX > 0) &&
 7524             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7525   match(Set dst (OrV src (LoadVector mem)));
 7526   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7527   ins_encode %{
 7528     int vlen_enc = vector_length_encoding(this);
 7529     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7530   %}
 7531   ins_pipe( pipe_slow );
 7532 %}
 7533 
 7534 // --------------------------------- XOR --------------------------------------
 7535 
 7536 instruct vxor(vec dst, vec src) %{
 7537   predicate(UseAVX == 0);
 7538   match(Set dst (XorV dst src));
 7539   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7540   ins_encode %{
 7541     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7542   %}
 7543   ins_pipe( pipe_slow );
 7544 %}
 7545 
 7546 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7547   predicate(UseAVX > 0);
 7548   match(Set dst (XorV src1 src2));
 7549   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7550   ins_encode %{
 7551     int vlen_enc = vector_length_encoding(this);
 7552     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7553   %}
 7554   ins_pipe( pipe_slow );
 7555 %}
 7556 
 7557 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7558   predicate((UseAVX > 0) &&
 7559             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7560   match(Set dst (XorV src (LoadVector mem)));
 7561   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7562   ins_encode %{
 7563     int vlen_enc = vector_length_encoding(this);
 7564     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7565   %}
 7566   ins_pipe( pipe_slow );
 7567 %}
 7568 
 7569 // --------------------------------- VectorCast --------------------------------------
 7570 
 7571 instruct vcastBtoX(vec dst, vec src) %{
 7572   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7573   match(Set dst (VectorCastB2X src));
 7574   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7575   ins_encode %{
 7576     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7577     int vlen_enc = vector_length_encoding(this);
 7578     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7579   %}
 7580   ins_pipe( pipe_slow );
 7581 %}
 7582 
 7583 instruct vcastBtoD(legVec dst, legVec src) %{
 7584   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7585   match(Set dst (VectorCastB2X src));
 7586   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7587   ins_encode %{
 7588     int vlen_enc = vector_length_encoding(this);
 7589     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7590   %}
 7591   ins_pipe( pipe_slow );
 7592 %}
 7593 
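// Short -> byte narrowing without AVX512VL+BW: clear the high byte of every short with
// vector_short_to_byte_mask, then pack with unsigned saturation (vpackuswb). Since the
// high bytes are already zero, no saturation occurs and each result byte is the low byte
// of its short. Packs operate per 128-bit lane, so the 256-bit source variant first pulls
// the upper lane down with vextracti128.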
 7594 instruct castStoX(vec dst, vec src) %{
 7595   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7596             Matcher::vector_length(n->in(1)) <= 8 && // src
 7597             Matcher::vector_element_basic_type(n) == T_BYTE);
 7598   match(Set dst (VectorCastS2X src));
 7599   format %{ "vector_cast_s2x $dst,$src" %}
 7600   ins_encode %{
 7601     assert(UseAVX > 0, "required");
 7602 
 7603     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7604     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7605   %}
 7606   ins_pipe( pipe_slow );
 7607 %}
 7608 
 7609 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7610   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7611             Matcher::vector_length(n->in(1)) == 16 && // src
 7612             Matcher::vector_element_basic_type(n) == T_BYTE);
 7613   effect(TEMP dst, TEMP vtmp);
 7614   match(Set dst (VectorCastS2X src));
 7615   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7616   ins_encode %{
 7617     assert(UseAVX > 0, "required");
 7618 
 7619     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7620     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7621     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7622     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7623   %}
 7624   ins_pipe( pipe_slow );
 7625 %}
 7626 
 7627 instruct vcastStoX_evex(vec dst, vec src) %{
 7628   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7629             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7630   match(Set dst (VectorCastS2X src));
 7631   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7632   ins_encode %{
 7633     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7634     int src_vlen_enc = vector_length_encoding(this, $src);
 7635     int vlen_enc = vector_length_encoding(this);
 7636     switch (to_elem_bt) {
 7637       case T_BYTE:
 7638         if (!VM_Version::supports_avx512vl()) {
 7639           vlen_enc = Assembler::AVX_512bit;
 7640         }
 7641         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7642         break;
 7643       case T_INT:
 7644         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7645         break;
 7646       case T_FLOAT:
 7647         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7648         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7649         break;
 7650       case T_LONG:
 7651         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7652         break;
 7653       case T_DOUBLE: {
 7654         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7655         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7656         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7657         break;
 7658       }
 7659       default:
 7660         ShouldNotReachHere();
 7661     }
 7662   %}
 7663   ins_pipe( pipe_slow );
 7664 %}
 7665 
 7666 instruct castItoX(vec dst, vec src) %{
 7667   predicate(UseAVX <= 2 &&
 7668             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7669             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7670   match(Set dst (VectorCastI2X src));
 7671   format %{ "vector_cast_i2x $dst,$src" %}
 7672   ins_encode %{
 7673     assert(UseAVX > 0, "required");
 7674 
 7675     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7676     int vlen_enc = vector_length_encoding(this, $src);
 7677 
 7678     if (to_elem_bt == T_BYTE) {
 7679       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7680       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7681       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7682     } else {
 7683       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7684       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7685       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7686     }
 7687   %}
 7688   ins_pipe( pipe_slow );
 7689 %}
 7690 
 7691 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7692   predicate(UseAVX <= 2 &&
 7693             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7694             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7695   match(Set dst (VectorCastI2X src));
 7696   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7697   effect(TEMP dst, TEMP vtmp);
 7698   ins_encode %{
 7699     assert(UseAVX > 0, "required");
 7700 
 7701     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7702     int vlen_enc = vector_length_encoding(this, $src);
 7703 
 7704     if (to_elem_bt == T_BYTE) {
 7705       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7706       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7707       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7708       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7709     } else {
 7710       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7711       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7712       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7713       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7714     }
 7715   %}
 7716   ins_pipe( pipe_slow );
 7717 %}
 7718 
 7719 instruct vcastItoX_evex(vec dst, vec src) %{
 7720   predicate(UseAVX > 2 ||
 7721             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7722   match(Set dst (VectorCastI2X src));
 7723   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7724   ins_encode %{
 7725     assert(UseAVX > 0, "required");
 7726 
 7727     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7728     int src_vlen_enc = vector_length_encoding(this, $src);
 7729     int dst_vlen_enc = vector_length_encoding(this);
 7730     switch (dst_elem_bt) {
 7731       case T_BYTE:
 7732         if (!VM_Version::supports_avx512vl()) {
 7733           src_vlen_enc = Assembler::AVX_512bit;
 7734         }
 7735         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7736         break;
 7737       case T_SHORT:
 7738         if (!VM_Version::supports_avx512vl()) {
 7739           src_vlen_enc = Assembler::AVX_512bit;
 7740         }
 7741         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7742         break;
 7743       case T_FLOAT:
 7744         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7745         break;
 7746       case T_LONG:
 7747         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7748         break;
 7749       case T_DOUBLE:
 7750         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7751         break;
 7752       default:
 7753         ShouldNotReachHere();
 7754     }
 7755   %}
 7756   ins_pipe( pipe_slow );
 7757 %}
 7758 
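// Long -> byte/short narrowing on AVX/AVX2: there is no down-convert instruction, so the
// low 32 bits of every long are first gathered into the low 128 bits by shuffles
// (vpshufd for 128-bit sources, vpermilps + vpermpd for 256-bit ones), then masked and
// packed down to the requested element size.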
 7759 instruct vcastLtoBS(vec dst, vec src) %{
 7760   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7761             UseAVX <= 2);
 7762   match(Set dst (VectorCastL2X src));
 7763   format %{ "vector_cast_l2x  $dst,$src" %}
 7764   ins_encode %{
 7765     assert(UseAVX > 0, "required");
 7766 
 7767     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7768     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7769     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7770                                                       : ExternalAddress(vector_int_to_short_mask());
 7771     if (vlen <= 16) {
 7772       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7773       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7774       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7775     } else {
 7776       assert(vlen <= 32, "required");
 7777       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7778       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7779       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7780       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7781     }
 7782     if (to_elem_bt == T_BYTE) {
 7783       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7784     }
 7785   %}
 7786   ins_pipe( pipe_slow );
 7787 %}
 7788 
 7789 instruct vcastLtoX_evex(vec dst, vec src) %{
 7790   predicate(UseAVX > 2 ||
 7791             (Matcher::vector_element_basic_type(n) == T_INT ||
 7792              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7793              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7794   match(Set dst (VectorCastL2X src));
 7795   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7796   ins_encode %{
 7797     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7798     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7799     int vlen_enc = vector_length_encoding(this, $src);
 7800     switch (to_elem_bt) {
 7801       case T_BYTE:
 7802         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7803           vlen_enc = Assembler::AVX_512bit;
 7804         }
 7805         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7806         break;
 7807       case T_SHORT:
 7808         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7809           vlen_enc = Assembler::AVX_512bit;
 7810         }
 7811         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7812         break;
 7813       case T_INT:
 7814         if (vlen == 8) {
 7815           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7816             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7817           }
 7818         } else if (vlen == 16) {
 7819           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7820         } else if (vlen == 32) {
 7821           if (UseAVX > 2) {
 7822             if (!VM_Version::supports_avx512vl()) {
 7823               vlen_enc = Assembler::AVX_512bit;
 7824             }
 7825             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7826           } else {
 7827             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7828             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7829           }
 7830         } else { // vlen == 64
 7831           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7832         }
 7833         break;
 7834       case T_FLOAT:
 7835         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7836         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7837         break;
 7838       case T_DOUBLE:
 7839         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7840         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7841         break;
 7842 
 7843       default: assert(false, "%s", type2name(to_elem_bt));
 7844     }
 7845   %}
 7846   ins_pipe( pipe_slow );
 7847 %}
 7848 
 7849 instruct vcastFtoD_reg(vec dst, vec src) %{
 7850   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7851   match(Set dst (VectorCastF2X src));
 7852   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7853   ins_encode %{
 7854     int vlen_enc = vector_length_encoding(this);
 7855     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7856   %}
 7857   ins_pipe( pipe_slow );
 7858 %}
 7859 
 7860 
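// Float -> integral casts: Java requires NaN to convert to 0 and out-of-range values to
// saturate to MIN/MAX, while the hardware conversions return the "integer indefinite"
// value (the sign-flip pattern) for such inputs. The vector_castF2X/castF2L helpers use
// the sign-flip constant and the temporaries below to detect and patch those lanes; see
// the c2_MacroAssembler implementations for the exact sequence.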
 7861 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7862   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7863             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7864   match(Set dst (VectorCastF2X src));
 7865   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7866   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7867   ins_encode %{
 7868     int vlen_enc = vector_length_encoding(this, $src);
 7869     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7870     // JDK-8292878 removed the need for an explicit scratch register when loading addresses
 7871     // wider than 32 bits for register-indirect addressing, since stub constants are part of
 7872     // the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7873     // Targets are free to raise that limit, but a code cache larger than 2G is unrealistic
 7874     // in practical scenarios. On the flip side, with the given cap we save a temporary
 7875     // register allocation, which in the limiting case can prevent spilling in blocks with
 7876     // high register pressure.
 7877     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7878                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7879                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7880   %}
 7881   ins_pipe( pipe_slow );
 7882 %}
 7883 
 7884 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7885   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7886             is_integral_type(Matcher::vector_element_basic_type(n)));
 7887   match(Set dst (VectorCastF2X src));
 7888   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7889   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7890   ins_encode %{
 7891     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7892     if (to_elem_bt == T_LONG) {
 7893       int vlen_enc = vector_length_encoding(this);
 7894       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7895                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7896                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7897     } else {
 7898       int vlen_enc = vector_length_encoding(this, $src);
 7899       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7900                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7901                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7902     }
 7903   %}
 7904   ins_pipe( pipe_slow );
 7905 %}
 7906 
 7907 instruct vcastDtoF_reg(vec dst, vec src) %{
 7908   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7909   match(Set dst (VectorCastD2X src));
 7910   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7911   ins_encode %{
 7912     int vlen_enc = vector_length_encoding(this, $src);
 7913     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7914   %}
 7915   ins_pipe( pipe_slow );
 7916 %}
 7917 
 7918 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7919   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7920             is_integral_type(Matcher::vector_element_basic_type(n)));
 7921   match(Set dst (VectorCastD2X src));
 7922   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7923   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7924   ins_encode %{
 7925     int vlen_enc = vector_length_encoding(this, $src);
 7926     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7927     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7928                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7929                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7930   %}
 7931   ins_pipe( pipe_slow );
 7932 %}
 7933 
 7934 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7935   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7936             is_integral_type(Matcher::vector_element_basic_type(n)));
 7937   match(Set dst (VectorCastD2X src));
 7938   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7939   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7940   ins_encode %{
 7941     int vlen_enc = vector_length_encoding(this, $src);
 7942     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7943     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7944                               ExternalAddress(vector_float_signflip());
 7945     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7946                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7947   %}
 7948   ins_pipe( pipe_slow );
 7949 %}
 7950 
 7951 instruct vucast(vec dst, vec src) %{
 7952   match(Set dst (VectorUCastB2X src));
 7953   match(Set dst (VectorUCastS2X src));
 7954   match(Set dst (VectorUCastI2X src));
 7955   format %{ "vector_ucast $dst,$src\t!" %}
 7956   ins_encode %{
 7957     assert(UseAVX > 0, "required");
 7958 
 7959     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7960     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7961     int vlen_enc = vector_length_encoding(this);
 7962     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7963   %}
 7964   ins_pipe( pipe_slow );
 7965 %}
 7966 
 7967 #ifdef _LP64
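// Vector Math.round: Math.round(x) is defined as floor(x + 0.5), so the helpers below
// temporarily install a custom MXCSR value from the constant table (selecting the
// rounding mode the algorithm needs) around the conversion and restore the standard
// MXCSR afterwards. The EnableX86ECoreOpts constant differs only in the exception
// status flag bits.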
 7968 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7969   predicate(!VM_Version::supports_avx512vl() &&
 7970             Matcher::vector_length_in_bytes(n) < 64 &&
 7971             Matcher::vector_element_basic_type(n) == T_INT);
 7972   match(Set dst (RoundVF src));
 7973   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7974   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7975   ins_encode %{
 7976     int vlen_enc = vector_length_encoding(this);
 7977     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7978     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7979                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7980                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7981   %}
 7982   ins_pipe( pipe_slow );
 7983 %}
 7984 
 7985 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7986   predicate((VM_Version::supports_avx512vl() ||
 7987              Matcher::vector_length_in_bytes(n) == 64) &&
 7988              Matcher::vector_element_basic_type(n) == T_INT);
 7989   match(Set dst (RoundVF src));
 7990   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7991   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7992   ins_encode %{
 7993     int vlen_enc = vector_length_encoding(this);
 7994     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7995     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7996                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7997                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7998   %}
 7999   ins_pipe( pipe_slow );
 8000 %}
 8001 
 8002 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 8003   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 8004   match(Set dst (RoundVD src));
 8005   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 8006   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 8007   ins_encode %{
 8008     int vlen_enc = vector_length_encoding(this);
 8009     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 8010     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 8011                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 8012                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 8013   %}
 8014   ins_pipe( pipe_slow );
 8015 %}
 8016 
 8017 #endif // _LP64
 8018 
 8019 // --------------------------------- VectorMaskCmp --------------------------------------
 8020 
 8021 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 8022   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8023             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 8024             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8025             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 8026   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8027   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 8028   ins_encode %{
 8029     int vlen_enc = vector_length_encoding(this, $src1);
 8030     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 8031     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8032       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8033     } else {
 8034       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8035     }
 8036   %}
 8037   ins_pipe( pipe_slow );
 8038 %}
 8039 
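// 512-bit FP compares produce a k-register predicate; when the IR expects a regular
// vector mask instead, the predicate is materialized by a masked (non-merging) load of
// the all-bits-set constant, leaving -1 in matching lanes and 0 elsewhere.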
 8040 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8041   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 8042             n->bottom_type()->isa_vectmask() == nullptr &&
 8043             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 8044   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8045   effect(TEMP ktmp);
 8046   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8047   ins_encode %{
 8048     int vlen_enc = Assembler::AVX_512bit;
 8049     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 8050     KRegister mask = k0; // The comparison itself is not being masked.
 8051     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8052       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8053       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 8054     } else {
 8055       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8056       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 8057     }
 8058   %}
 8059   ins_pipe( pipe_slow );
 8060 %}
 8061 
 8062 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 8063   predicate(n->bottom_type()->isa_vectmask() &&
 8064             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 8065   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8066   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8067   ins_encode %{
 8068     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8069     int vlen_enc = vector_length_encoding(this, $src1);
 8070     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 8071     KRegister mask = k0; // The comparison itself is not being masked.
 8072     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 8073       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8074     } else {
 8075       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 8076     }
 8077   %}
 8078   ins_pipe( pipe_slow );
 8079 %}
 8080 
 8081 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 8082   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8083             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8084             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8085             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8086             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8087             (n->in(2)->get_int() == BoolTest::eq ||
 8088              n->in(2)->get_int() == BoolTest::lt ||
 8089              n->in(2)->get_int() == BoolTest::gt)); // cond
 8090   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8091   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 8092   ins_encode %{
 8093     int vlen_enc = vector_length_encoding(this, $src1);
 8094     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8095     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8096     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 8097   %}
 8098   ins_pipe( pipe_slow );
 8099 %}
 8100 
 8101 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8102   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8103             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8104             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8105             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8106             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8107             (n->in(2)->get_int() == BoolTest::ne ||
 8108              n->in(2)->get_int() == BoolTest::le ||
 8109              n->in(2)->get_int() == BoolTest::ge)); // cond
 8110   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8111   effect(TEMP dst, TEMP xtmp);
 8112   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8113   ins_encode %{
 8114     int vlen_enc = vector_length_encoding(this, $src1);
 8115     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8116     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8117     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8118   %}
 8119   ins_pipe( pipe_slow );
 8120 %}
 8121 
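// Unsigned integral compares without AVX-512: SSE/AVX only provide signed packed
// compares, so both operands have their most significant bit flipped (XOR with a
// broadcast of high_bit_set for the element type), which turns the unsigned relation
// into the equivalent signed one.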
 8122 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8123   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8124             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8125             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8126             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8127             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8128   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8129   effect(TEMP dst, TEMP xtmp);
 8130   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8131   ins_encode %{
 8132     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8133     int vlen_enc = vector_length_encoding(this, $src1);
 8134     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8135     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8136 
 8137     if (vlen_enc == Assembler::AVX_128bit) {
 8138       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8139     } else {
 8140       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8141     }
 8142     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8143     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8144     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8145   %}
 8146   ins_pipe( pipe_slow );
 8147 %}
 8148 
 8149 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8150   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8151              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8152              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8153   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8154   effect(TEMP ktmp);
 8155   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8156   ins_encode %{
 8157     assert(UseAVX > 2, "required");
 8158 
 8159     int vlen_enc = vector_length_encoding(this, $src1);
 8160     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8161     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8162     KRegister mask = k0; // The comparison itself is not being masked.
 8163     bool merge = false;
 8164     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8165 
 8166     switch (src1_elem_bt) {
 8167       case T_INT: {
 8168         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8169         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8170         break;
 8171       }
 8172       case T_LONG: {
 8173         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8174         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8175         break;
 8176       }
 8177       default: assert(false, "%s", type2name(src1_elem_bt));
 8178     }
 8179   %}
 8180   ins_pipe( pipe_slow );
 8181 %}
 8182 
 8183 
 8184 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8185   predicate(n->bottom_type()->isa_vectmask() &&
 8186             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8187   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8188   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 8189   ins_encode %{
 8190     assert(UseAVX > 2, "required");
 8191     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8192 
 8193     int vlen_enc = vector_length_encoding(this, $src1);
 8194     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8195     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8196     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8197 
 8198     // Compare src1 and src2 element-wise into the destination predicate mask.
 8199     switch (src1_elem_bt) {
 8200       case T_BYTE: {
 8201         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8202         break;
 8203       }
 8204       case T_SHORT: {
 8205         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8206         break;
 8207       }
 8208       case T_INT: {
 8209         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8210         break;
 8211       }
 8212       case T_LONG: {
 8213         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8214         break;
 8215       }
 8216       default: assert(false, "%s", type2name(src1_elem_bt));
 8217     }
 8218   %}
 8219   ins_pipe( pipe_slow );
 8220 %}
 8221 
 8222 // Extract
 8223 
 8224 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8225   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8226   match(Set dst (ExtractI src idx));
 8227   match(Set dst (ExtractS src idx));
 8228 #ifdef _LP64
 8229   match(Set dst (ExtractB src idx));
 8230 #endif
 8231   format %{ "extractI $dst,$src,$idx\t!" %}
 8232   ins_encode %{
 8233     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8234 
 8235     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8236     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8237   %}
 8238   ins_pipe( pipe_slow );
 8239 %}
 8240 
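// For 256/512-bit sources the element may live above the low 128 bits, which pextr*
// cannot reach directly: get_lane first copies the 128-bit lane holding the element
// into the temporary, then get_elem extracts within that lane.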
 8241 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8242   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8243             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8244   match(Set dst (ExtractI src idx));
 8245   match(Set dst (ExtractS src idx));
 8246 #ifdef _LP64
 8247   match(Set dst (ExtractB src idx));
 8248 #endif
 8249   effect(TEMP vtmp);
 8250   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8251   ins_encode %{
 8252     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8253 
 8254     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8255     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8256     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8257   %}
 8258   ins_pipe( pipe_slow );
 8259 %}
 8260 
 8261 #ifdef _LP64
 8262 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8263   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8264   match(Set dst (ExtractL src idx));
 8265   format %{ "extractL $dst,$src,$idx\t!" %}
 8266   ins_encode %{
 8267     assert(UseSSE >= 4, "required");
 8268     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8269 
 8270     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8271   %}
 8272   ins_pipe( pipe_slow );
 8273 %}
 8274 
 8275 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8276   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8277             Matcher::vector_length(n->in(1)) == 8);  // src
 8278   match(Set dst (ExtractL src idx));
 8279   effect(TEMP vtmp);
 8280   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8281   ins_encode %{
 8282     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8283 
 8284     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8285     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8286   %}
 8287   ins_pipe( pipe_slow );
 8288 %}
 8289 #endif
 8290 
 8291 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8292   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8293   match(Set dst (ExtractF src idx));
 8294   effect(TEMP dst, TEMP vtmp);
 8295   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8296   ins_encode %{
 8297     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8298 
 8299     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8300   %}
 8301   ins_pipe( pipe_slow );
 8302 %}
 8303 
 8304 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8305   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8306             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8307   match(Set dst (ExtractF src idx));
 8308   effect(TEMP vtmp);
 8309   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8310   ins_encode %{
 8311     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8312 
 8313     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8314     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8315   %}
 8316   ins_pipe( pipe_slow );
 8317 %}
 8318 
 8319 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8320   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8321   match(Set dst (ExtractD src idx));
 8322   format %{ "extractD $dst,$src,$idx\t!" %}
 8323   ins_encode %{
 8324     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8325 
 8326     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8327   %}
 8328   ins_pipe( pipe_slow );
 8329 %}
 8330 
 8331 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8332   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8333             Matcher::vector_length(n->in(1)) == 8);  // src
 8334   match(Set dst (ExtractD src idx));
 8335   effect(TEMP vtmp);
 8336   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8337   ins_encode %{
 8338     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8339 
 8340     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8341     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8342   %}
 8343   ins_pipe( pipe_slow );
 8344 %}
 8345 
 8346 // --------------------------------- Vector Blend --------------------------------------
 8347 
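// The legacy SSE4.1 pblendvb takes its mask implicitly in xmm0, so this rule pins a
// temporary to xmm0 (rxmm0) and copies the mask there before blending.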
 8348 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8349   predicate(UseAVX == 0);
 8350   match(Set dst (VectorBlend (Binary dst src) mask));
 8351   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8352   effect(TEMP tmp);
 8353   ins_encode %{
 8354     assert(UseSSE >= 4, "required");
 8355 
 8356     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8357       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8358     }
 8359     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8360   %}
 8361   ins_pipe( pipe_slow );
 8362 %}
 8363 
 8364 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8365   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8366             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8367             Matcher::vector_length_in_bytes(n) <= 32 &&
 8368             is_integral_type(Matcher::vector_element_basic_type(n)));
 8369   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8370   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8371   ins_encode %{
 8372     int vlen_enc = vector_length_encoding(this);
 8373     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8374   %}
 8375   ins_pipe( pipe_slow );
 8376 %}
 8377 
 8378 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8379   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8380             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8381             Matcher::vector_length_in_bytes(n) <= 32 &&
 8382             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8383   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8384   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8385   ins_encode %{
 8386     int vlen_enc = vector_length_encoding(this);
 8387     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8388   %}
 8389   ins_pipe( pipe_slow );
 8390 %}
 8391 
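// With EnableX86ECoreOpts the variable blend is expanded into vpandn/vpand/vpor, which
// the E-core tuning prefers over vpblendvb; the mask lanes are all-ones or all-zeros,
// so the bitwise sequence selects src2 where the mask is set and src1 elsewhere.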
 8392 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8393   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8394             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8395             Matcher::vector_length_in_bytes(n) <= 32);
 8396   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8397   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8398   effect(TEMP vtmp, TEMP dst);
 8399   ins_encode %{
 8400     int vlen_enc = vector_length_encoding(this);
 8401     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8402     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8403     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8404   %}
 8405   ins_pipe( pipe_slow );
 8406 %}
 8407 
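// 512-bit blends need a k-register mask: the incoming vector mask (lanes of all-ones /
// all-zeros) is converted by comparing it for equality against the all-bits-set
// constant, and the resulting predicate drives a merge-masked blend.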
 8408 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8409   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8410             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8411   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8412   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8413   effect(TEMP ktmp);
 8414   ins_encode %{
 8415     int vlen_enc = Assembler::AVX_512bit;
 8416     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8417     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8418     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8423 
 8424 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8425   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8426             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8427              VM_Version::supports_avx512bw()));
 8428   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8429   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8430   ins_encode %{
 8431     int vlen_enc = vector_length_encoding(this);
 8432     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8433     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8434   %}
 8435   ins_pipe( pipe_slow );
 8436 %}
 8437 
 8438 // --------------------------------- ABS --------------------------------------
 8439 // a = |a|
 8440 instruct vabsB_reg(vec dst, vec src) %{
 8441   match(Set dst (AbsVB  src));
 8442   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8443   ins_encode %{
 8444     uint vlen = Matcher::vector_length(this);
 8445     if (vlen <= 16) {
 8446       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8447     } else {
 8448       int vlen_enc = vector_length_encoding(this);
 8449       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8450     }
 8451   %}
 8452   ins_pipe( pipe_slow );
 8453 %}
 8454 
 8455 instruct vabsS_reg(vec dst, vec src) %{
 8456   match(Set dst (AbsVS  src));
 8457   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8458   ins_encode %{
 8459     uint vlen = Matcher::vector_length(this);
 8460     if (vlen <= 8) {
 8461       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8462     } else {
 8463       int vlen_enc = vector_length_encoding(this);
 8464       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8465     }
 8466   %}
 8467   ins_pipe( pipe_slow );
 8468 %}
 8469 
 8470 instruct vabsI_reg(vec dst, vec src) %{
 8471   match(Set dst (AbsVI  src));
 8472   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8473   ins_encode %{
 8474     uint vlen = Matcher::vector_length(this);
 8475     if (vlen <= 4) {
 8476       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8477     } else {
 8478       int vlen_enc = vector_length_encoding(this);
 8479       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8480     }
 8481   %}
 8482   ins_pipe( pipe_slow );
 8483 %}
 8484 
 8485 instruct vabsL_reg(vec dst, vec src) %{
 8486   match(Set dst (AbsVL  src));
 8487   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8488   ins_encode %{
 8489     assert(UseAVX > 2, "required");
 8490     int vlen_enc = vector_length_encoding(this);
 8491     if (!VM_Version::supports_avx512vl()) {
 8492       vlen_enc = Assembler::AVX_512bit;
 8493     }
 8494     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8495   %}
 8496   ins_pipe( pipe_slow );
 8497 %}
 8498 
 8499 // --------------------------------- ABSNEG --------------------------------------
 8500 
 8501 instruct vabsnegF(vec dst, vec src) %{
 8502   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8503   match(Set dst (AbsVF src));
 8504   match(Set dst (NegVF src));
 8505   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8506   ins_cost(150);
 8507   ins_encode %{
 8508     int opcode = this->ideal_Opcode();
 8509     int vlen = Matcher::vector_length(this);
 8510     if (vlen == 2) {
 8511       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8512     } else {
 8513       assert(vlen == 8 || vlen == 16, "required");
 8514       int vlen_enc = vector_length_encoding(this);
 8515       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8516     }
 8517   %}
 8518   ins_pipe( pipe_slow );
 8519 %}
 8520 
 8521 instruct vabsneg4F(vec dst) %{
 8522   predicate(Matcher::vector_length(n) == 4);
 8523   match(Set dst (AbsVF dst));
 8524   match(Set dst (NegVF dst));
 8525   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8526   ins_cost(150);
 8527   ins_encode %{
 8528     int opcode = this->ideal_Opcode();
 8529     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8530   %}
 8531   ins_pipe( pipe_slow );
 8532 %}
 8533 
 8534 instruct vabsnegD(vec dst, vec src) %{
 8535   match(Set dst (AbsVD  src));
 8536   match(Set dst (NegVD  src));
 8537   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8538   ins_encode %{
 8539     int opcode = this->ideal_Opcode();
 8540     uint vlen = Matcher::vector_length(this);
 8541     if (vlen == 2) {
 8542       assert(UseSSE >= 2, "required");
 8543       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8544     } else {
 8545       int vlen_enc = vector_length_encoding(this);
 8546       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8547     }
 8548   %}
 8549   ins_pipe( pipe_slow );
 8550 %}
 8551 
 8552 //------------------------------------- VectorTest --------------------------------------------
 8553 
 8554 #ifdef _LP64
 8555 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8556   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8557   match(Set cr (VectorTest src1 src2));
 8558   effect(TEMP vtmp);
 8559   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8560   ins_encode %{
 8561     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8562     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8563     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8564   %}
 8565   ins_pipe( pipe_slow );
 8566 %}
 8567 
 8568 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8569   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8570   match(Set cr (VectorTest src1 src2));
 8571   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8572   ins_encode %{
 8573     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8574     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8575     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8576   %}
 8577   ins_pipe( pipe_slow );
 8578 %}
 8579 
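// Mask tests for masks with fewer than 8 live bits (or exactly 8 without AVX512DQ, which
// supplies the byte-granular ktest/kortest forms) go through a GPR: the mask is moved out
// with kmovwl and the live bits isolated; the all-true case additionally compares against
// the all-ones pattern, while for any-true the AND alone sets ZF as required.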
 8580 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8581   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8582              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8583             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8584   match(Set cr (VectorTest src1 src2));
 8585   effect(TEMP tmp);
 8586   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8587   ins_encode %{
 8588     uint masklen = Matcher::vector_length(this, $src1);
 8589     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8590     __ andl($tmp$$Register, (1 << masklen) - 1);
 8591     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8592   %}
 8593   ins_pipe( pipe_slow );
 8594 %}
 8595 
 8596 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8597   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8598              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8599             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8600   match(Set cr (VectorTest src1 src2));
 8601   effect(TEMP tmp);
 8602   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8603   ins_encode %{
 8604     uint masklen = Matcher::vector_length(this, $src1);
 8605     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8606     __ andl($tmp$$Register, (1 << masklen) - 1);
 8607   %}
 8608   ins_pipe( pipe_slow );
 8609 %}
 8610 
 8611 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8612   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8613             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8614   match(Set cr (VectorTest src1 src2));
 8615   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8616   ins_encode %{
 8617     uint masklen = Matcher::vector_length(this, $src1);
 8618     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8619   %}
 8620   ins_pipe( pipe_slow );
 8621 %}
 8622 #endif
 8623 
 8624 //------------------------------------- LoadMask --------------------------------------------
 8625 
 8626 instruct loadMask(legVec dst, legVec src) %{
 8627   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8628   match(Set dst (VectorLoadMask src));
 8629   effect(TEMP dst);
 8630   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8631   ins_encode %{
 8632     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8633     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8634     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8635   %}
 8636   ins_pipe( pipe_slow );
 8637 %}
 8638 
 8639 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8640   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8641   match(Set dst (VectorLoadMask src));
 8642   effect(TEMP xtmp);
 8643   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8644   ins_encode %{
 8645     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8646                         true, Assembler::AVX_512bit);
 8647   %}
 8648   ins_pipe( pipe_slow );
 8649 %}
 8650 
 8651 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8652   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8653   match(Set dst (VectorLoadMask src));
 8654   effect(TEMP xtmp);
 8655   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8656   ins_encode %{
 8657     int vlen_enc = vector_length_encoding(in(1));
 8658     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8659                         false, vlen_enc);
 8660   %}
 8661   ins_pipe( pipe_slow );
 8662 %}
 8663 
 8664 //------------------------------------- StoreMask --------------------------------------------
 8665 
 8666 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8667   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8668   match(Set dst (VectorStoreMask src size));
 8669   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8670   ins_encode %{
 8671     int vlen = Matcher::vector_length(this);
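    // Mask lanes hold 0 or -1; the byte absolute value turns -1 into 1,
    // yielding the boolean byte vector expected by VectorStoreMask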
 8672     if (vlen <= 16 && UseAVX <= 2) {
 8673       assert(UseSSE >= 3, "required");
 8674       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8675     } else {
 8676       assert(UseAVX > 0, "required");
 8677       int src_vlen_enc = vector_length_encoding(this, $src);
 8678       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8679     }
 8680   %}
 8681   ins_pipe( pipe_slow );
 8682 %}
 8683 
 8684 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8685   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8686   match(Set dst (VectorStoreMask src size));
 8687   effect(TEMP_DEF dst, TEMP xtmp);
 8688   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8689   ins_encode %{
 8690     int vlen_enc = Assembler::AVX_128bit;
 8691     int vlen = Matcher::vector_length(this);
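    // Narrow the 16-bit mask lanes (0/-1) to boolean bytes (0/1) by packing
    // words down to bytes and taking the absolute value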
 8692     if (vlen <= 8) {
 8693       assert(UseSSE >= 3, "required");
 8694       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8695       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8696       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8697     } else {
 8698       assert(UseAVX > 0, "required");
 8699       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8700       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8701       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8702     }
 8703   %}
 8704   ins_pipe( pipe_slow );
 8705 %}
 8706 
 8707 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8708   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8709   match(Set dst (VectorStoreMask src size));
 8710   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8711   effect(TEMP_DEF dst, TEMP xtmp);
 8712   ins_encode %{
 8713     int vlen_enc = Assembler::AVX_128bit;
 8714     int vlen = Matcher::vector_length(this);
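    // Narrow the 32-bit mask lanes (0/-1) to boolean bytes (0/1) by packing
    // dwords to words, words to bytes, and taking the absolute value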
 8715     if (vlen <= 4) {
 8716       assert(UseSSE >= 3, "required");
 8717       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8718       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8719       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8720       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8721     } else {
 8722       assert(UseAVX > 0, "required");
 8723       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8724       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8725       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8726       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8727       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8728     }
 8729   %}
 8730   ins_pipe( pipe_slow );
 8731 %}
 8732 
 8733 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8734   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8735   match(Set dst (VectorStoreMask src size));
 8736   effect(TEMP_DEF dst, TEMP xtmp);
 8737   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8738   ins_encode %{
 8739     assert(UseSSE >= 3, "required");
 8740     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8741     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8742     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8743     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8744     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8745   %}
 8746   ins_pipe( pipe_slow );
 8747 %}
 8748 
 8749 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8750   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8751   match(Set dst (VectorStoreMask src size));
 8752   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8753   effect(TEMP_DEF dst, TEMP vtmp);
 8754   ins_encode %{
 8755     int vlen_enc = Assembler::AVX_128bit;
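    // Collect the low dword of each 64-bit mask lane into the low 128 bits
    // (shuffle, extract, blend), then narrow the 0/-1 dwords to boolean bytes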
 8756     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8757     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8758     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8759     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8760     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8761     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8762     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8763   %}
 8764   ins_pipe( pipe_slow );
 8765 %}
 8766 
 8767 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8768   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8769   match(Set dst (VectorStoreMask src size));
 8770   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8771   ins_encode %{
 8772     int src_vlen_enc = vector_length_encoding(this, $src);
 8773     int dst_vlen_enc = vector_length_encoding(this);
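    // Without AVX512VL the truncating convert is only available at 512-bit width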
 8774     if (!VM_Version::supports_avx512vl()) {
 8775       src_vlen_enc = Assembler::AVX_512bit;
 8776     }
 8777     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8778     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8779   %}
 8780   ins_pipe( pipe_slow );
 8781 %}
 8782 
 8783 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8784   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8785   match(Set dst (VectorStoreMask src size));
 8786   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8787   ins_encode %{
 8788     int src_vlen_enc = vector_length_encoding(this, $src);
 8789     int dst_vlen_enc = vector_length_encoding(this);
 8790     if (!VM_Version::supports_avx512vl()) {
 8791       src_vlen_enc = Assembler::AVX_512bit;
 8792     }
 8793     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8794     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8795   %}
 8796   ins_pipe( pipe_slow );
 8797 %}
 8798 
 8799 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8800   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8801   match(Set dst (VectorStoreMask mask size));
 8802   effect(TEMP_DEF dst);
 8803   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8804   ins_encode %{
 8805     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8806     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8807                  false, Assembler::AVX_512bit, noreg);
 8808     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8809   %}
 8810   ins_pipe( pipe_slow );
 8811 %}
 8812 
 8813 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8814   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8815   match(Set dst (VectorStoreMask mask size));
 8816   effect(TEMP_DEF dst);
 8817   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8818   ins_encode %{
 8819     int dst_vlen_enc = vector_length_encoding(this);
 8820     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8821     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8822   %}
 8823   ins_pipe( pipe_slow );
 8824 %}
 8825 
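// A VectorMaskCast between masks of the same layout (opmask registers, or
// vectors with equal size in bytes) needs no code, so the encodings are empty.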
 8826 instruct vmaskcast_evex(kReg dst) %{
 8827   match(Set dst (VectorMaskCast dst));
 8828   ins_cost(0);
 8829   format %{ "vector_mask_cast $dst" %}
 8830   ins_encode %{
 8831     // empty
 8832   %}
 8833   ins_pipe(empty);
 8834 %}
 8835 
 8836 instruct vmaskcast(vec dst) %{
 8837   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8838   match(Set dst (VectorMaskCast dst));
 8839   ins_cost(0);
 8840   format %{ "vector_mask_cast $dst" %}
 8841   ins_encode %{
 8842     // empty
 8843   %}
 8844   ins_pipe(empty);
 8845 %}
 8846 
 8847 instruct vmaskcast_avx(vec dst, vec src) %{
 8848   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8849   match(Set dst (VectorMaskCast src));
 8850   format %{ "vector_mask_cast $dst, $src" %}
 8851   ins_encode %{
 8852     int vlen = Matcher::vector_length(this);
 8853     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8854     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8855     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8856   %}
 8857   ins_pipe(pipe_slow);
 8858 %}
 8859 
 8860 //-------------------------------- Load Iota Indices ----------------------------------
 8861 
 8862 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8863   match(Set dst (VectorLoadConst src));
 8864   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8865   ins_encode %{
 8866      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8867      BasicType bt = Matcher::vector_element_basic_type(this);
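     // Load the constant index sequence 0, 1, 2, ... for this vector shape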
 8868      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8869   %}
 8870   ins_pipe( pipe_slow );
 8871 %}
 8872 
 8873 #ifdef _LP64
 8874 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8875   match(Set dst (PopulateIndex src1 src2));
 8876   effect(TEMP dst, TEMP vtmp);
 8877   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8878   ins_encode %{
 8879      assert($src2$$constant == 1, "required");
 8880      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8881      int vlen_enc = vector_length_encoding(this);
 8882      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8883      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8884      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8885      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8886   %}
 8887   ins_pipe( pipe_slow );
 8888 %}
 8889 
 8890 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8891   match(Set dst (PopulateIndex src1 src2));
 8892   effect(TEMP dst, TEMP vtmp);
 8893   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8894   ins_encode %{
 8895      assert($src2$$constant == 1, "required");
 8896      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8897      int vlen_enc = vector_length_encoding(this);
 8898      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8899      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8900      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8901      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8902   %}
 8903   ins_pipe( pipe_slow );
 8904 %}
 8905 #endif
 8906 //-------------------------------- Rearrange ----------------------------------
 8907 
 8908 // LoadShuffle/Rearrange for Byte
 8909 instruct rearrangeB(vec dst, vec shuffle) %{
 8910   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8911             Matcher::vector_length(n) < 32);
 8912   match(Set dst (VectorRearrange dst shuffle));
 8913   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8914   ins_encode %{
 8915     assert(UseSSE >= 4, "required");
 8916     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8917   %}
 8918   ins_pipe( pipe_slow );
 8919 %}
 8920 
 8921 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8922   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8923             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8924   match(Set dst (VectorRearrange src shuffle));
 8925   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8926   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8927   ins_encode %{
 8928     assert(UseAVX >= 2, "required");
 8929     // Swap src into vtmp1
 8930     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 8932     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 8934     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 8936     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8937     // Perform the blend
 8938     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8939   %}
 8940   ins_pipe( pipe_slow );
 8941 %}
 8942 
 8943 
 8944 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8945   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8946             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8947   match(Set dst (VectorRearrange src shuffle));
 8948   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8949   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8950   ins_encode %{
 8951     int vlen_enc = vector_length_encoding(this);
 8952     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8953                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8954                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8955   %}
 8956   ins_pipe( pipe_slow );
 8957 %}
 8958 
 8959 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8960   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8961             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8962   match(Set dst (VectorRearrange src shuffle));
 8963   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8964   ins_encode %{
 8965     int vlen_enc = vector_length_encoding(this);
 8966     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8967   %}
 8968   ins_pipe( pipe_slow );
 8969 %}
 8970 
 8971 // LoadShuffle/Rearrange for Short
 8972 
 8973 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8974   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8975             !VM_Version::supports_avx512bw());
 8976   match(Set dst (VectorLoadShuffle src));
 8977   effect(TEMP dst, TEMP vtmp);
 8978   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8979   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms
 8982     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8983     if (UseAVX == 0) {
 8984       assert(vlen_in_bytes <= 16, "required");
 8985       // Multiply each shuffle by two to get byte index
 8986       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8987       __ psllw($vtmp$$XMMRegister, 1);
 8988 
 8989       // Duplicate to create 2 copies of byte index
 8990       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8991       __ psllw($dst$$XMMRegister, 8);
 8992       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8993 
 8994       // Add one to get alternate byte index
 8995       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8996       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8997     } else {
 8998       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8999       int vlen_enc = vector_length_encoding(this);
 9000       // Multiply each shuffle by two to get byte index
 9001       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 9002 
 9003       // Duplicate to create 2 copies of byte index
 9004       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 9005       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9006 
 9007       // Add one to get alternate byte index
 9008       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 9009     }
 9010   %}
 9011   ins_pipe( pipe_slow );
 9012 %}
 9013 
 9014 instruct rearrangeS(vec dst, vec shuffle) %{
 9015   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 9016             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 9017   match(Set dst (VectorRearrange dst shuffle));
 9018   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9019   ins_encode %{
 9020     assert(UseSSE >= 4, "required");
 9021     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9022   %}
 9023   ins_pipe( pipe_slow );
 9024 %}
 9025 
 9026 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 9027   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 9028             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 9029   match(Set dst (VectorRearrange src shuffle));
 9030   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 9031   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 9032   ins_encode %{
 9033     assert(UseAVX >= 2, "required");
 9034     // Swap src into vtmp1
 9035     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 9037     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 9039     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for shuffle entries that come from the other lane
 9041     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 9042     // Perform the blend
 9043     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 9044   %}
 9045   ins_pipe( pipe_slow );
 9046 %}
 9047 
 9048 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 9049   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 9050             VM_Version::supports_avx512bw());
 9051   match(Set dst (VectorRearrange src shuffle));
 9052   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9053   ins_encode %{
 9054     int vlen_enc = vector_length_encoding(this);
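    // Without AVX512VL, vpermw is only available at 512-bit width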
 9055     if (!VM_Version::supports_avx512vl()) {
 9056       vlen_enc = Assembler::AVX_512bit;
 9057     }
 9058     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9059   %}
 9060   ins_pipe( pipe_slow );
 9061 %}
 9062 
 9063 // LoadShuffle/Rearrange for Integer and Float
 9064 
 9065 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 9066   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9067             Matcher::vector_length(n) == 4 && UseAVX == 0);
 9068   match(Set dst (VectorLoadShuffle src));
 9069   effect(TEMP dst, TEMP vtmp);
 9070   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9071   ins_encode %{
 9072     assert(UseSSE >= 4, "required");
 9073 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms
 9076 
 9077     // Duplicate and multiply each shuffle by 4
 9078     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 9079     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9080     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 9081     __ psllw($vtmp$$XMMRegister, 2);
 9082 
 9083     // Duplicate again to create 4 copies of byte index
 9084     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 9085     __ psllw($dst$$XMMRegister, 8);
 9086     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 9087 
 9088     // Add 3,2,1,0 to get alternate byte index
 9089     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 9090     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 9091   %}
 9092   ins_pipe( pipe_slow );
 9093 %}
 9094 
 9095 instruct rearrangeI(vec dst, vec shuffle) %{
 9096   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9097             UseAVX == 0);
 9098   match(Set dst (VectorRearrange dst shuffle));
 9099   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 9100   ins_encode %{
 9101     assert(UseSSE >= 4, "required");
 9102     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 9103   %}
 9104   ins_pipe( pipe_slow );
 9105 %}
 9106 
 9107 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 9108   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 9109             UseAVX > 0);
 9110   match(Set dst (VectorRearrange src shuffle));
 9111   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9112   ins_encode %{
 9113     int vlen_enc = vector_length_encoding(this);
 9114     BasicType bt = Matcher::vector_element_basic_type(this);
 9115     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9116   %}
 9117   ins_pipe( pipe_slow );
 9118 %}
 9119 
 9120 // LoadShuffle/Rearrange for Long and Double
 9121 
 9122 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9123   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9124             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9125   match(Set dst (VectorLoadShuffle src));
 9126   effect(TEMP dst, TEMP vtmp);
 9127   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9128   ins_encode %{
 9129     assert(UseAVX >= 2, "required");
 9130 
 9131     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms
 9134 
 9135     // Multiply each shuffle by two to get double word index
 9136     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 9137 
 9138     // Duplicate each double word shuffle
 9139     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9140     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9141 
 9142     // Add one to get alternate double word index
 9143     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9144   %}
 9145   ins_pipe( pipe_slow );
 9146 %}
 9147 
 9148 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9149   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9150             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9151   match(Set dst (VectorRearrange src shuffle));
 9152   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9153   ins_encode %{
 9154     assert(UseAVX >= 2, "required");
 9155 
 9156     int vlen_enc = vector_length_encoding(this);
 9157     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9158   %}
 9159   ins_pipe( pipe_slow );
 9160 %}
 9161 
 9162 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9163   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9164             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9165   match(Set dst (VectorRearrange src shuffle));
 9166   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9167   ins_encode %{
 9168     assert(UseAVX > 2, "required");
 9169 
 9170     int vlen_enc = vector_length_encoding(this);
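    // The variable vpermq form has no 128-bit encoding, so promote to 256 bits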
 9171     if (vlen_enc == Assembler::AVX_128bit) {
 9172       vlen_enc = Assembler::AVX_256bit;
 9173     }
 9174     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9175   %}
 9176   ins_pipe( pipe_slow );
 9177 %}
 9178 
 9179 // --------------------------------- FMA --------------------------------------
 9180 // a * b + c
 9181 
 9182 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9183   match(Set c (FmaVF  c (Binary a b)));
 9184   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9185   ins_cost(150);
 9186   ins_encode %{
 9187     assert(UseFMA, "not enabled");
 9188     int vlen_enc = vector_length_encoding(this);
 9189     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9190   %}
 9191   ins_pipe( pipe_slow );
 9192 %}
 9193 
 9194 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9195   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9196   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9197   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9198   ins_cost(150);
 9199   ins_encode %{
 9200     assert(UseFMA, "not enabled");
 9201     int vlen_enc = vector_length_encoding(this);
 9202     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9203   %}
 9204   ins_pipe( pipe_slow );
 9205 %}
 9206 
 9207 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9208   match(Set c (FmaVD  c (Binary a b)));
 9209   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9210   ins_cost(150);
 9211   ins_encode %{
 9212     assert(UseFMA, "not enabled");
 9213     int vlen_enc = vector_length_encoding(this);
 9214     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9215   %}
 9216   ins_pipe( pipe_slow );
 9217 %}
 9218 
 9219 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9220   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9221   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9222   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9223   ins_cost(150);
 9224   ins_encode %{
 9225     assert(UseFMA, "not enabled");
 9226     int vlen_enc = vector_length_encoding(this);
 9227     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9228   %}
 9229   ins_pipe( pipe_slow );
 9230 %}
 9231 
 9232 // --------------------------------- Vector Multiply Add --------------------------------------
 9233 
 9234 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9235   predicate(UseAVX == 0);
 9236   match(Set dst (MulAddVS2VI dst src1));
 9237   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9238   ins_encode %{
 9239     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9240   %}
 9241   ins_pipe( pipe_slow );
 9242 %}
 9243 
 9244 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9245   predicate(UseAVX > 0);
 9246   match(Set dst (MulAddVS2VI src1 src2));
 9247   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9248   ins_encode %{
 9249     int vlen_enc = vector_length_encoding(this);
 9250     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9251   %}
 9252   ins_pipe( pipe_slow );
 9253 %}
 9254 
 9255 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9256 
 9257 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9258   predicate(VM_Version::supports_avx512_vnni());
 9259   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9260   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9261   ins_encode %{
 9262     assert(UseAVX > 2, "required");
 9263     int vlen_enc = vector_length_encoding(this);
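    // VNNI dot product: dst += src1 * src2, summing pairs of 16-bit products
    // into each 32-bit lane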
 9264     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9265   %}
 9266   ins_pipe( pipe_slow );
 9267   ins_cost(10);
 9268 %}
 9269 
 9270 // --------------------------------- PopCount --------------------------------------
 9271 
 9272 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9273   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9274   match(Set dst (PopCountVI src));
 9275   match(Set dst (PopCountVL src));
 9276   format %{ "vector_popcount_integral $dst, $src" %}
 9277   ins_encode %{
 9278     int opcode = this->ideal_Opcode();
 9279     int vlen_enc = vector_length_encoding(this, $src);
 9280     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9281     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9282   %}
 9283   ins_pipe( pipe_slow );
 9284 %}
 9285 
 9286 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9287   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9288   match(Set dst (PopCountVI src mask));
 9289   match(Set dst (PopCountVL src mask));
 9290   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9291   ins_encode %{
 9292     int vlen_enc = vector_length_encoding(this, $src);
 9293     BasicType bt = Matcher::vector_element_basic_type(this, $src);
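    // Preload dst with src so lanes not selected by $mask keep their original
    // value under merge masking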
 9294     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9295     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9296   %}
 9297   ins_pipe( pipe_slow );
 9298 %}
 9299 
 9300 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9301   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9302   match(Set dst (PopCountVI src));
 9303   match(Set dst (PopCountVL src));
 9304   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9305   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9306   ins_encode %{
 9307     int opcode = this->ideal_Opcode();
 9308     int vlen_enc = vector_length_encoding(this, $src);
 9309     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9310     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9311                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9312   %}
 9313   ins_pipe( pipe_slow );
 9314 %}
 9315 
 9316 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9317 
 9318 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9319   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9320                                               Matcher::vector_length_in_bytes(n->in(1))));
 9321   match(Set dst (CountTrailingZerosV src));
 9322   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9323   ins_cost(400);
 9324   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9325   ins_encode %{
 9326     int vlen_enc = vector_length_encoding(this, $src);
 9327     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9328     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9329                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9330   %}
 9331   ins_pipe( pipe_slow );
 9332 %}
 9333 
 9334 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9335   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9336             VM_Version::supports_avx512cd() &&
 9337             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9338   match(Set dst (CountTrailingZerosV src));
 9339   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9340   ins_cost(400);
 9341   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9342   ins_encode %{
 9343     int vlen_enc = vector_length_encoding(this, $src);
 9344     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9345     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9346                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9347   %}
 9348   ins_pipe( pipe_slow );
 9349 %}
 9350 
 9351 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9352   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9353   match(Set dst (CountTrailingZerosV src));
 9354   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9355   ins_cost(400);
 9356   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9357   ins_encode %{
 9358     int vlen_enc = vector_length_encoding(this, $src);
 9359     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9360     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9361                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9362                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9363   %}
 9364   ins_pipe( pipe_slow );
 9365 %}
 9366 
 9367 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9368   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9369   match(Set dst (CountTrailingZerosV src));
 9370   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9371   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9372   ins_encode %{
 9373     int vlen_enc = vector_length_encoding(this, $src);
 9374     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9375     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9376                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9377   %}
 9378   ins_pipe( pipe_slow );
 9379 %}
 9380 
 9381 
 9382 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9383 
 9384 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9385   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9386   effect(TEMP dst);
 9387   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9388   ins_encode %{
 9389     int vector_len = vector_length_encoding(this);
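    // $func is an 8-bit truth table: bit k of the immediate gives the result
    // for input bit combination k of (dst, src2, src3)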
 9390     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9391   %}
 9392   ins_pipe( pipe_slow );
 9393 %}
 9394 
 9395 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9396   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9397   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9398   effect(TEMP dst);
 9399   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9400   ins_encode %{
 9401     int vector_len = vector_length_encoding(this);
 9402     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9403   %}
 9404   ins_pipe( pipe_slow );
 9405 %}
 9406 
 9407 // --------------------------------- Rotation Operations ----------------------------------
 9408 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9409   match(Set dst (RotateLeftV src shift));
 9410   match(Set dst (RotateRightV src shift));
 9411   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9412   ins_encode %{
 9413     int opcode      = this->ideal_Opcode();
 9414     int vector_len  = vector_length_encoding(this);
 9415     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9416     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9417   %}
 9418   ins_pipe( pipe_slow );
 9419 %}
 9420 
 9421 instruct vprorate(vec dst, vec src, vec shift) %{
 9422   match(Set dst (RotateLeftV src shift));
 9423   match(Set dst (RotateRightV src shift));
 9424   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9425   ins_encode %{
 9426     int opcode      = this->ideal_Opcode();
 9427     int vector_len  = vector_length_encoding(this);
 9428     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9429     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9430   %}
 9431   ins_pipe( pipe_slow );
 9432 %}
 9433 
 9434 // ---------------------------------- Masked Operations ------------------------------------
 9435 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9436   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9437   match(Set dst (LoadVectorMasked mem mask));
 9438   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9439   ins_encode %{
 9440     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9441     int vlen_enc = vector_length_encoding(this);
 9442     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9443   %}
 9444   ins_pipe( pipe_slow );
 9445 %}
 9446 
 9447 
 9448 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9449   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9450   match(Set dst (LoadVectorMasked mem mask));
 9451   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9452   ins_encode %{
 9453     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9454     int vector_len = vector_length_encoding(this);
 9455     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9456   %}
 9457   ins_pipe( pipe_slow );
 9458 %}
 9459 
 9460 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9461   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9462   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9463   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9464   ins_encode %{
 9465     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9466     int vlen_enc = vector_length_encoding(src_node);
 9467     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9468     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9469   %}
 9470   ins_pipe( pipe_slow );
 9471 %}
 9472 
 9473 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9474   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9475   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9476   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9477   ins_encode %{
 9478     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9479     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9480     int vlen_enc = vector_length_encoding(src_node);
 9481     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9482   %}
 9483   ins_pipe( pipe_slow );
 9484 %}
 9485 
 9486 #ifdef _LP64
 9487 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9488   match(Set addr (VerifyVectorAlignment addr mask));
 9489   effect(KILL cr);
 9490   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9491   ins_encode %{
 9492     Label Lskip;
 9493     // check if masked bits of addr are zero
 9494     __ testq($addr$$Register, $mask$$constant);
 9495     __ jccb(Assembler::equal, Lskip);
 9496     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9497     __ bind(Lskip);
 9498   %}
 9499   ins_pipe(pipe_slow);
 9500 %}
 9501 
 9502 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9503   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9504   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9505   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9506   ins_encode %{
 9507     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9508     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9509 
 9510     Label DONE;
 9511     int vlen_enc = vector_length_encoding(this, $src1);
 9512     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9513 
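    // ktmp1 collects a bit for every lane that is selected by $mask and compares
    // equal; if OR-ing it with the inverted mask gives all ones (carry set) the
    // result stays -1, otherwise tzcnt returns the index of the first lane that
    // is not a masked match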
 9514     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9515     __ mov64($dst$$Register, -1L);
 9516     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9517     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9518     __ jccb(Assembler::carrySet, DONE);
 9519     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9520     __ notq($dst$$Register);
 9521     __ tzcntq($dst$$Register, $dst$$Register);
 9522     __ bind(DONE);
 9523   %}
 9524   ins_pipe( pipe_slow );
 9525 %}
 9526 
 9527 
 9528 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9529   match(Set dst (VectorMaskGen len));
 9530   effect(TEMP temp, KILL cr);
 9531   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9532   ins_encode %{
 9533     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9534   %}
 9535   ins_pipe( pipe_slow );
 9536 %}
 9537 
 9538 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9539   match(Set dst (VectorMaskGen len));
 9540   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9541   effect(TEMP temp);
 9542   ins_encode %{
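    // Materialize a constant with the low $len bits set and move it into the opmask register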
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9544     __ kmovql($dst$$KRegister, $temp$$Register);
 9545   %}
 9546   ins_pipe( pipe_slow );
 9547 %}
 9548 
 9549 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9550   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9551   match(Set dst (VectorMaskToLong mask));
 9552   effect(TEMP dst, KILL cr);
 9553   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9554   ins_encode %{
 9555     int opcode = this->ideal_Opcode();
 9556     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9557     int mask_len = Matcher::vector_length(this, $mask);
 9558     int mask_size = mask_len * type2aelembytes(mbt);
 9559     int vlen_enc = vector_length_encoding(this, $mask);
 9560     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9561                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9562   %}
 9563   ins_pipe( pipe_slow );
 9564 %}
 9565 
 9566 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9567   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9568   match(Set dst (VectorMaskToLong mask));
 9569   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9570   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9571   ins_encode %{
 9572     int opcode = this->ideal_Opcode();
 9573     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9574     int mask_len = Matcher::vector_length(this, $mask);
 9575     int vlen_enc = vector_length_encoding(this, $mask);
 9576     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9577                              $dst$$Register, mask_len, mbt, vlen_enc);
 9578   %}
 9579   ins_pipe( pipe_slow );
 9580 %}
 9581 
 9582 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9583   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9584   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9585   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9586   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9587   ins_encode %{
 9588     int opcode = this->ideal_Opcode();
 9589     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9590     int mask_len = Matcher::vector_length(this, $mask);
 9591     int vlen_enc = vector_length_encoding(this, $mask);
 9592     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9593                              $dst$$Register, mask_len, mbt, vlen_enc);
 9594   %}
 9595   ins_pipe( pipe_slow );
 9596 %}
 9597 
 9598 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9599   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9600   match(Set dst (VectorMaskTrueCount mask));
 9601   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9602   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9603   ins_encode %{
 9604     int opcode = this->ideal_Opcode();
 9605     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9606     int mask_len = Matcher::vector_length(this, $mask);
 9607     int mask_size = mask_len * type2aelembytes(mbt);
 9608     int vlen_enc = vector_length_encoding(this, $mask);
 9609     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9610                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9611   %}
 9612   ins_pipe( pipe_slow );
 9613 %}
 9614 
 9615 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9616   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9617   match(Set dst (VectorMaskTrueCount mask));
 9618   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9619   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9620   ins_encode %{
 9621     int opcode = this->ideal_Opcode();
 9622     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9623     int mask_len = Matcher::vector_length(this, $mask);
 9624     int vlen_enc = vector_length_encoding(this, $mask);
 9625     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9626                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9627   %}
 9628   ins_pipe( pipe_slow );
 9629 %}
 9630 
 9631 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9632   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9633   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9634   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9635   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9636   ins_encode %{
 9637     int opcode = this->ideal_Opcode();
 9638     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9639     int mask_len = Matcher::vector_length(this, $mask);
 9640     int vlen_enc = vector_length_encoding(this, $mask);
 9641     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9642                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9643   %}
 9644   ins_pipe( pipe_slow );
 9645 %}
 9646 
 9647 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9648   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9649   match(Set dst (VectorMaskFirstTrue mask));
 9650   match(Set dst (VectorMaskLastTrue mask));
 9651   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9652   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9653   ins_encode %{
 9654     int opcode = this->ideal_Opcode();
 9655     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9656     int mask_len = Matcher::vector_length(this, $mask);
 9657     int mask_size = mask_len * type2aelembytes(mbt);
 9658     int vlen_enc = vector_length_encoding(this, $mask);
 9659     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9660                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9661   %}
 9662   ins_pipe( pipe_slow );
 9663 %}
 9664 
 9665 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9666   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9667   match(Set dst (VectorMaskFirstTrue mask));
 9668   match(Set dst (VectorMaskLastTrue mask));
 9669   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9670   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9671   ins_encode %{
 9672     int opcode = this->ideal_Opcode();
 9673     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9674     int mask_len = Matcher::vector_length(this, $mask);
 9675     int vlen_enc = vector_length_encoding(this, $mask);
 9676     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9677                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9678   %}
 9679   ins_pipe( pipe_slow );
 9680 %}
 9681 
 9682 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9683   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9684   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9685   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9686   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9687   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9688   ins_encode %{
 9689     int opcode = this->ideal_Opcode();
 9690     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9691     int mask_len = Matcher::vector_length(this, $mask);
 9692     int vlen_enc = vector_length_encoding(this, $mask);
 9693     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9694                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9695   %}
 9696   ins_pipe( pipe_slow );
 9697 %}
 9698 
 9699 // --------------------------------- Compress/Expand Operations ---------------------------
 9700 #ifdef _LP64
 9701 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9702   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9703   match(Set dst (CompressV src mask));
 9704   match(Set dst (ExpandV src mask));
 9705   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9706   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9707   ins_encode %{
 9708     int opcode = this->ideal_Opcode();
 9709     int vlen_enc = vector_length_encoding(this);
 9710     BasicType bt  = Matcher::vector_element_basic_type(this);
 9711     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9712                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9713   %}
 9714   ins_pipe( pipe_slow );
 9715 %}
 9716 #endif
 9717 
 9718 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9719   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9720   match(Set dst (CompressV src mask));
 9721   match(Set dst (ExpandV src mask));
 9722   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9723   ins_encode %{
 9724     int opcode = this->ideal_Opcode();
 9725     int vector_len = vector_length_encoding(this);
 9726     BasicType bt  = Matcher::vector_element_basic_type(this);
 9727     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9728   %}
 9729   ins_pipe( pipe_slow );
 9730 %}
 9731 
 9732 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9733   match(Set dst (CompressM mask));
 9734   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9735   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9736   ins_encode %{
 9737     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9738     int mask_len = Matcher::vector_length(this);
 9739     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9740   %}
 9741   ins_pipe( pipe_slow );
 9742 %}
 9743 
 9744 #endif // _LP64
 9745 
 9746 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9747 
 9748 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9749   predicate(!VM_Version::supports_gfni());
 9750   match(Set dst (ReverseV src));
 9751   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9752   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9753   ins_encode %{
 9754     int vec_enc = vector_length_encoding(this);
 9755     BasicType bt = Matcher::vector_element_basic_type(this);
 9756     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9757                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9758   %}
 9759   ins_pipe( pipe_slow );
 9760 %}
 9761 
 9762 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9763   predicate(VM_Version::supports_gfni());
 9764   match(Set dst (ReverseV src));
 9765   effect(TEMP dst, TEMP xtmp);
 9766   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9767   ins_encode %{
 9768     int vec_enc = vector_length_encoding(this);
 9769     BasicType bt  = Matcher::vector_element_basic_type(this);
 9770     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9771     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9772                                $xtmp$$XMMRegister);
 9773   %}
 9774   ins_pipe( pipe_slow );
 9775 %}
 9776 
 9777 instruct vreverse_byte_reg(vec dst, vec src) %{
 9778   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9779   match(Set dst (ReverseBytesV src));
 9780   effect(TEMP dst);
 9781   format %{ "vector_reverse_byte $dst, $src" %}
 9782   ins_encode %{
 9783     int vec_enc = vector_length_encoding(this);
 9784     BasicType bt = Matcher::vector_element_basic_type(this);
 9785     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9786   %}
 9787   ins_pipe( pipe_slow );
 9788 %}
 9789 
 9790 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9791   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9792   match(Set dst (ReverseBytesV src));
 9793   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9794   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9795   ins_encode %{
 9796     int vec_enc = vector_length_encoding(this);
 9797     BasicType bt = Matcher::vector_element_basic_type(this);
 9798     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9799                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9800   %}
 9801   ins_pipe( pipe_slow );
 9802 %}
 9803 
 9804 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
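      // The EVEX int/long forms below need no temporaries because AVX512CD supplies a
      // native vector leading-zero count; short/byte elements require extra vector (and
      // mask) temporaries, and targets without the needed AVX-512 features fall back to
      // the vector_count_leading_zeros_avx helper instead.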
 9805 
 9806 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9807   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9808                                               Matcher::vector_length_in_bytes(n->in(1))));
 9809   match(Set dst (CountLeadingZerosV src));
 9810   format %{ "vector_count_leading_zeros $dst, $src" %}
 9811   ins_encode %{
 9812      int vlen_enc = vector_length_encoding(this, $src);
 9813      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9814      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9815                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9816   %}
 9817   ins_pipe( pipe_slow );
 9818 %}
 9819 
 9820 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9821   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9822                                               Matcher::vector_length_in_bytes(n->in(1))));
 9823   match(Set dst (CountLeadingZerosV src mask));
 9824   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9825   ins_encode %{
 9826     int vlen_enc = vector_length_encoding(this, $src);
 9827     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9828     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9829     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9830                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9831   %}
 9832   ins_pipe( pipe_slow );
 9833 %}
 9834 
 9835 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9836   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9837             VM_Version::supports_avx512cd() &&
 9838             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9839   match(Set dst (CountLeadingZerosV src));
 9840   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9841   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9842   ins_encode %{
 9843     int vlen_enc = vector_length_encoding(this, $src);
 9844     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9845     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9846                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9847   %}
 9848   ins_pipe( pipe_slow );
 9849 %}
 9850 
 9851 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9852   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9853   match(Set dst (CountLeadingZerosV src));
 9854   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9855   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9856   ins_encode %{
 9857     int vlen_enc = vector_length_encoding(this, $src);
 9858     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9859     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9860                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9861                                        $rtmp$$Register, true, vlen_enc);
 9862   %}
 9863   ins_pipe( pipe_slow );
 9864 %}
 9865 
 9866 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9867   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9868             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9869   match(Set dst (CountLeadingZerosV src));
 9870   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9871   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9872   ins_encode %{
 9873     int vlen_enc = vector_length_encoding(this, $src);
 9874     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9875     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9876                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9877   %}
 9878   ins_pipe( pipe_slow );
 9879 %}
 9880 
 9881 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9882   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9883             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9884   match(Set dst (CountLeadingZerosV src));
 9885   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9886   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9887   ins_encode %{
 9888     int vlen_enc = vector_length_encoding(this, $src);
 9889     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9890     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9891                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9892   %}
 9893   ins_pipe( pipe_slow );
 9894 %}
 9895 
 9896 // ---------------------------------- Vector Masked Operations ------------------------------------
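      // The masked patterns below rely on AVX-512 write masking; the boolean handed to
      // evmasked_op is the merge flag, so lanes whose mask bit is clear keep the prior
      // destination value (the rearrange pattern alone passes false, i.e. zero-masking).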
 9897 
 9898 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9899   match(Set dst (AddVB (Binary dst src2) mask));
 9900   match(Set dst (AddVS (Binary dst src2) mask));
 9901   match(Set dst (AddVI (Binary dst src2) mask));
 9902   match(Set dst (AddVL (Binary dst src2) mask));
 9903   match(Set dst (AddVF (Binary dst src2) mask));
 9904   match(Set dst (AddVD (Binary dst src2) mask));
 9905   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9906   ins_encode %{
 9907     int vlen_enc = vector_length_encoding(this);
 9908     BasicType bt = Matcher::vector_element_basic_type(this);
 9909     int opc = this->ideal_Opcode();
 9910     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9911                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9912   %}
 9913   ins_pipe( pipe_slow );
 9914 %}
 9915 
 9916 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9917   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9918   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9919   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9920   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9921   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9922   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9923   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9924   ins_encode %{
 9925     int vlen_enc = vector_length_encoding(this);
 9926     BasicType bt = Matcher::vector_element_basic_type(this);
 9927     int opc = this->ideal_Opcode();
 9928     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9929                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9930   %}
 9931   ins_pipe( pipe_slow );
 9932 %}
 9933 
 9934 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9935   match(Set dst (XorV (Binary dst src2) mask));
 9936   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9937   ins_encode %{
 9938     int vlen_enc = vector_length_encoding(this);
 9939     BasicType bt = Matcher::vector_element_basic_type(this);
 9940     int opc = this->ideal_Opcode();
 9941     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9942                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9943   %}
 9944   ins_pipe( pipe_slow );
 9945 %}
 9946 
 9947 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9948   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9949   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9950   ins_encode %{
 9951     int vlen_enc = vector_length_encoding(this);
 9952     BasicType bt = Matcher::vector_element_basic_type(this);
 9953     int opc = this->ideal_Opcode();
 9954     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9955                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9956   %}
 9957   ins_pipe( pipe_slow );
 9958 %}
 9959 
 9960 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9961   match(Set dst (OrV (Binary dst src2) mask));
 9962   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9963   ins_encode %{
 9964     int vlen_enc = vector_length_encoding(this);
 9965     BasicType bt = Matcher::vector_element_basic_type(this);
 9966     int opc = this->ideal_Opcode();
 9967     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9968                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9969   %}
 9970   ins_pipe( pipe_slow );
 9971 %}
 9972 
 9973 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9974   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9975   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9976   ins_encode %{
 9977     int vlen_enc = vector_length_encoding(this);
 9978     BasicType bt = Matcher::vector_element_basic_type(this);
 9979     int opc = this->ideal_Opcode();
 9980     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9981                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9982   %}
 9983   ins_pipe( pipe_slow );
 9984 %}
 9985 
 9986 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9987   match(Set dst (AndV (Binary dst src2) mask));
 9988   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9989   ins_encode %{
 9990     int vlen_enc = vector_length_encoding(this);
 9991     BasicType bt = Matcher::vector_element_basic_type(this);
 9992     int opc = this->ideal_Opcode();
 9993     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9994                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9995   %}
 9996   ins_pipe( pipe_slow );
 9997 %}
 9998 
 9999 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
10000   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
10001   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
10002   ins_encode %{
10003     int vlen_enc = vector_length_encoding(this);
10004     BasicType bt = Matcher::vector_element_basic_type(this);
10005     int opc = this->ideal_Opcode();
10006     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10007                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10008   %}
10009   ins_pipe( pipe_slow );
10010 %}
10011 
10012 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
10013   match(Set dst (SubVB (Binary dst src2) mask));
10014   match(Set dst (SubVS (Binary dst src2) mask));
10015   match(Set dst (SubVI (Binary dst src2) mask));
10016   match(Set dst (SubVL (Binary dst src2) mask));
10017   match(Set dst (SubVF (Binary dst src2) mask));
10018   match(Set dst (SubVD (Binary dst src2) mask));
10019   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
10020   ins_encode %{
10021     int vlen_enc = vector_length_encoding(this);
10022     BasicType bt = Matcher::vector_element_basic_type(this);
10023     int opc = this->ideal_Opcode();
10024     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10025                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10026   %}
10027   ins_pipe( pipe_slow );
10028 %}
10029 
10030 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
10031   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
10032   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
10033   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
10034   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
10035   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
10036   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
10037   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
10038   ins_encode %{
10039     int vlen_enc = vector_length_encoding(this);
10040     BasicType bt = Matcher::vector_element_basic_type(this);
10041     int opc = this->ideal_Opcode();
10042     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10043                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
10049   match(Set dst (MulVS (Binary dst src2) mask));
10050   match(Set dst (MulVI (Binary dst src2) mask));
10051   match(Set dst (MulVL (Binary dst src2) mask));
10052   match(Set dst (MulVF (Binary dst src2) mask));
10053   match(Set dst (MulVD (Binary dst src2) mask));
10054   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
10055   ins_encode %{
10056     int vlen_enc = vector_length_encoding(this);
10057     BasicType bt = Matcher::vector_element_basic_type(this);
10058     int opc = this->ideal_Opcode();
10059     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10060                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10061   %}
10062   ins_pipe( pipe_slow );
10063 %}
10064 
10065 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
10066   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
10067   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
10068   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
10069   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
10070   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
10071   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
10072   ins_encode %{
10073     int vlen_enc = vector_length_encoding(this);
10074     BasicType bt = Matcher::vector_element_basic_type(this);
10075     int opc = this->ideal_Opcode();
10076     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10077                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10078   %}
10079   ins_pipe( pipe_slow );
10080 %}
10081 
10082 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
10083   match(Set dst (SqrtVF dst mask));
10084   match(Set dst (SqrtVD dst mask));
10085   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
10086   ins_encode %{
10087     int vlen_enc = vector_length_encoding(this);
10088     BasicType bt = Matcher::vector_element_basic_type(this);
10089     int opc = this->ideal_Opcode();
10090     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10091                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10092   %}
10093   ins_pipe( pipe_slow );
10094 %}
10095 
10096 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
10097   match(Set dst (DivVF (Binary dst src2) mask));
10098   match(Set dst (DivVD (Binary dst src2) mask));
10099   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10100   ins_encode %{
10101     int vlen_enc = vector_length_encoding(this);
10102     BasicType bt = Matcher::vector_element_basic_type(this);
10103     int opc = this->ideal_Opcode();
10104     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10105                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10106   %}
10107   ins_pipe( pipe_slow );
10108 %}
10109 
10110 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
10111   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
10112   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
10113   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
10114   ins_encode %{
10115     int vlen_enc = vector_length_encoding(this);
10116     BasicType bt = Matcher::vector_element_basic_type(this);
10117     int opc = this->ideal_Opcode();
10118     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10119                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10120   %}
10121   ins_pipe( pipe_slow );
10122 %}
10123 
10124 
10125 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10126   match(Set dst (RotateLeftV (Binary dst shift) mask));
10127   match(Set dst (RotateRightV (Binary dst shift) mask));
10128   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10129   ins_encode %{
10130     int vlen_enc = vector_length_encoding(this);
10131     BasicType bt = Matcher::vector_element_basic_type(this);
10132     int opc = this->ideal_Opcode();
10133     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10134                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10135   %}
10136   ins_pipe( pipe_slow );
10137 %}
10138 
10139 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10140   match(Set dst (RotateLeftV (Binary dst src2) mask));
10141   match(Set dst (RotateRightV (Binary dst src2) mask));
10142   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10143   ins_encode %{
10144     int vlen_enc = vector_length_encoding(this);
10145     BasicType bt = Matcher::vector_element_basic_type(this);
10146     int opc = this->ideal_Opcode();
10147     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10148                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10149   %}
10150   ins_pipe( pipe_slow );
10151 %}
10152 
10153 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10154   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10155   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10156   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10157   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10158   ins_encode %{
10159     int vlen_enc = vector_length_encoding(this);
10160     BasicType bt = Matcher::vector_element_basic_type(this);
10161     int opc = this->ideal_Opcode();
10162     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10163                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10164   %}
10165   ins_pipe( pipe_slow );
10166 %}
10167 
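      // Shift-by-register comes in two flavours, split on is_var_shift(): the trailing
      // boolean passed to evmasked_op tells the helper whether the count is a uniform
      // (broadcast) shift or a per-element variable shift.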
10168 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10169   predicate(!n->as_ShiftV()->is_var_shift());
10170   match(Set dst (LShiftVS (Binary dst src2) mask));
10171   match(Set dst (LShiftVI (Binary dst src2) mask));
10172   match(Set dst (LShiftVL (Binary dst src2) mask));
10173   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10174   ins_encode %{
10175     int vlen_enc = vector_length_encoding(this);
10176     BasicType bt = Matcher::vector_element_basic_type(this);
10177     int opc = this->ideal_Opcode();
10178     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10179                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10180   %}
10181   ins_pipe( pipe_slow );
10182 %}
10183 
10184 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10185   predicate(n->as_ShiftV()->is_var_shift());
10186   match(Set dst (LShiftVS (Binary dst src2) mask));
10187   match(Set dst (LShiftVI (Binary dst src2) mask));
10188   match(Set dst (LShiftVL (Binary dst src2) mask));
10189   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10190   ins_encode %{
10191     int vlen_enc = vector_length_encoding(this);
10192     BasicType bt = Matcher::vector_element_basic_type(this);
10193     int opc = this->ideal_Opcode();
10194     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10195                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10196   %}
10197   ins_pipe( pipe_slow );
10198 %}
10199 
10200 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10201   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10202   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10203   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10204   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10205   ins_encode %{
10206     int vlen_enc = vector_length_encoding(this);
10207     BasicType bt = Matcher::vector_element_basic_type(this);
10208     int opc = this->ideal_Opcode();
10209     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10210                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10211   %}
10212   ins_pipe( pipe_slow );
10213 %}
10214 
10215 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10216   predicate(!n->as_ShiftV()->is_var_shift());
10217   match(Set dst (RShiftVS (Binary dst src2) mask));
10218   match(Set dst (RShiftVI (Binary dst src2) mask));
10219   match(Set dst (RShiftVL (Binary dst src2) mask));
10220   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10221   ins_encode %{
10222     int vlen_enc = vector_length_encoding(this);
10223     BasicType bt = Matcher::vector_element_basic_type(this);
10224     int opc = this->ideal_Opcode();
10225     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10226                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10227   %}
10228   ins_pipe( pipe_slow );
10229 %}
10230 
10231 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10232   predicate(n->as_ShiftV()->is_var_shift());
10233   match(Set dst (RShiftVS (Binary dst src2) mask));
10234   match(Set dst (RShiftVI (Binary dst src2) mask));
10235   match(Set dst (RShiftVL (Binary dst src2) mask));
10236   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10237   ins_encode %{
10238     int vlen_enc = vector_length_encoding(this);
10239     BasicType bt = Matcher::vector_element_basic_type(this);
10240     int opc = this->ideal_Opcode();
10241     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10242                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10243   %}
10244   ins_pipe( pipe_slow );
10245 %}
10246 
10247 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10248   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10249   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10250   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10251   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10252   ins_encode %{
10253     int vlen_enc = vector_length_encoding(this);
10254     BasicType bt = Matcher::vector_element_basic_type(this);
10255     int opc = this->ideal_Opcode();
10256     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10257                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10258   %}
10259   ins_pipe( pipe_slow );
10260 %}
10261 
10262 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10263   predicate(!n->as_ShiftV()->is_var_shift());
10264   match(Set dst (URShiftVS (Binary dst src2) mask));
10265   match(Set dst (URShiftVI (Binary dst src2) mask));
10266   match(Set dst (URShiftVL (Binary dst src2) mask));
10267   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10268   ins_encode %{
10269     int vlen_enc = vector_length_encoding(this);
10270     BasicType bt = Matcher::vector_element_basic_type(this);
10271     int opc = this->ideal_Opcode();
10272     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10273                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10274   %}
10275   ins_pipe( pipe_slow );
10276 %}
10277 
10278 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10279   predicate(n->as_ShiftV()->is_var_shift());
10280   match(Set dst (URShiftVS (Binary dst src2) mask));
10281   match(Set dst (URShiftVI (Binary dst src2) mask));
10282   match(Set dst (URShiftVL (Binary dst src2) mask));
10283   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10284   ins_encode %{
10285     int vlen_enc = vector_length_encoding(this);
10286     BasicType bt = Matcher::vector_element_basic_type(this);
10287     int opc = this->ideal_Opcode();
10288     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10289                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10290   %}
10291   ins_pipe( pipe_slow );
10292 %}
10293 
10294 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10295   match(Set dst (MaxV (Binary dst src2) mask));
10296   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10297   ins_encode %{
10298     int vlen_enc = vector_length_encoding(this);
10299     BasicType bt = Matcher::vector_element_basic_type(this);
10300     int opc = this->ideal_Opcode();
10301     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10302                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10303   %}
10304   ins_pipe( pipe_slow );
10305 %}
10306 
10307 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10308   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10309   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10310   ins_encode %{
10311     int vlen_enc = vector_length_encoding(this);
10312     BasicType bt = Matcher::vector_element_basic_type(this);
10313     int opc = this->ideal_Opcode();
10314     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10315                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10316   %}
10317   ins_pipe( pipe_slow );
10318 %}
10319 
10320 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10321   match(Set dst (MinV (Binary dst src2) mask));
10322   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10323   ins_encode %{
10324     int vlen_enc = vector_length_encoding(this);
10325     BasicType bt = Matcher::vector_element_basic_type(this);
10326     int opc = this->ideal_Opcode();
10327     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10328                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10329   %}
10330   ins_pipe( pipe_slow );
10331 %}
10332 
10333 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10334   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10335   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10336   ins_encode %{
10337     int vlen_enc = vector_length_encoding(this);
10338     BasicType bt = Matcher::vector_element_basic_type(this);
10339     int opc = this->ideal_Opcode();
10340     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10341                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10342   %}
10343   ins_pipe( pipe_slow );
10344 %}
10345 
10346 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10347   match(Set dst (VectorRearrange (Binary dst src2) mask));
10348   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10349   ins_encode %{
10350     int vlen_enc = vector_length_encoding(this);
10351     BasicType bt = Matcher::vector_element_basic_type(this);
10352     int opc = this->ideal_Opcode();
10353     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10354                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10355   %}
10356   ins_pipe( pipe_slow );
10357 %}
10358 
10359 instruct vabs_masked(vec dst, kReg mask) %{
10360   match(Set dst (AbsVB dst mask));
10361   match(Set dst (AbsVS dst mask));
10362   match(Set dst (AbsVI dst mask));
10363   match(Set dst (AbsVL dst mask));
10364   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10365   ins_encode %{
10366     int vlen_enc = vector_length_encoding(this);
10367     BasicType bt = Matcher::vector_element_basic_type(this);
10368     int opc = this->ideal_Opcode();
10369     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10370                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10371   %}
10372   ins_pipe( pipe_slow );
10373 %}
10374 
10375 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10376   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10377   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10378   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10379   ins_encode %{
10380     assert(UseFMA, "Needs FMA instruction support.");
10381     int vlen_enc = vector_length_encoding(this);
10382     BasicType bt = Matcher::vector_element_basic_type(this);
10383     int opc = this->ideal_Opcode();
10384     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10385                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10386   %}
10387   ins_pipe( pipe_slow );
10388 %}
10389 
10390 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10391   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10392   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10393   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10394   ins_encode %{
10395     assert(UseFMA, "Needs FMA instruction support.");
10396     int vlen_enc = vector_length_encoding(this);
10397     BasicType bt = Matcher::vector_element_basic_type(this);
10398     int opc = this->ideal_Opcode();
10399     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10400                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10401   %}
10402   ins_pipe( pipe_slow );
10403 %}
10404 
10405 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10406   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10407   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10408   ins_encode %{
10409     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10410     int vlen_enc = vector_length_encoding(this, $src1);
10411     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10412 
10413     // Dispatch the masked comparison on the element type of the first source vector.
10414     switch (src1_elem_bt) {
10415       case T_BYTE: {
10416         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10417         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10418         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10419         break;
10420       }
10421       case T_SHORT: {
10422         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10423         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10424         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10425         break;
10426       }
10427       case T_INT: {
10428         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10429         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10430         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10431         break;
10432       }
10433       case T_LONG: {
10434         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10435         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10436         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10437         break;
10438       }
10439       case T_FLOAT: {
10440         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10441         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10442         break;
10443       }
10444       case T_DOUBLE: {
10445         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10446         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10447         break;
10448       }
10449       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10450     }
10451   %}
10452   ins_pipe( pipe_slow );
10453 %}
10454 
10455 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10456   predicate(Matcher::vector_length(n) <= 32);
10457   match(Set dst (MaskAll src));
10458   format %{ "mask_all_evexI_LE32 $dst, $src" %}
10459   ins_encode %{
10460     int mask_len = Matcher::vector_length(this);
10461     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10462   %}
10463   ins_pipe( pipe_slow );
10464 %}
10465 
10466 #ifdef _LP64
10467 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10468   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10469   match(Set dst (XorVMask src (MaskAll cnt)));
10470   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10471   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10472   ins_encode %{
10473     uint masklen = Matcher::vector_length(this);
10474     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10475   %}
10476   ins_pipe( pipe_slow );
10477 %}
10478 
10479 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10480   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10481             (Matcher::vector_length(n) == 16) ||
10482             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10483   match(Set dst (XorVMask src (MaskAll cnt)));
10484   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10485   ins_encode %{
10486     uint masklen = Matcher::vector_length(this);
10487     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10488   %}
10489   ins_pipe( pipe_slow );
10490 %}
10491 
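      // VectorLongToMask: when the mask type lives in a k-register a plain kmov is
      // enough (see long_to_mask_evex below); otherwise the long value is expanded into
      // a byte-vector mask, which needs scalar and vector temporaries.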
10492 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10493   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10494   match(Set dst (VectorLongToMask src));
10495   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10496   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10497   ins_encode %{
10498     int mask_len = Matcher::vector_length(this);
10499     int vec_enc  = vector_length_encoding(mask_len);
10500     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10501                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10502   %}
10503   ins_pipe( pipe_slow );
10504 %}
10505 
10506 
10507 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10508   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10509   match(Set dst (VectorLongToMask src));
10510   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10511   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2 and $xtmp1 as TEMP" %}
10512   ins_encode %{
10513     int mask_len = Matcher::vector_length(this);
10514     assert(mask_len <= 32, "invalid mask length");
10515     int vec_enc  = vector_length_encoding(mask_len);
10516     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10517                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10518   %}
10519   ins_pipe( pipe_slow );
10520 %}
10521 
10522 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10523   predicate(n->bottom_type()->isa_vectmask());
10524   match(Set dst (VectorLongToMask src));
10525   format %{ "long_to_mask_evex $dst, $src" %}
10526   ins_encode %{
10527     __ kmov($dst$$KRegister, $src$$Register);
10528   %}
10529   ins_pipe( pipe_slow );
10530 %}
10531 #endif
10532 
10533 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10534   match(Set dst (AndVMask src1 src2));
10535   match(Set dst (OrVMask src1 src2));
10536   match(Set dst (XorVMask src1 src2));
10537   effect(TEMP kscratch);
10538   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10539   ins_encode %{
10540     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10541     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10542     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10543     uint masklen = Matcher::vector_length(this);
10544     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10545     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10546   %}
10547   ins_pipe( pipe_slow );
10548 %}
10549 
10550 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10551   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10552   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10553   ins_encode %{
10554     int vlen_enc = vector_length_encoding(this);
10555     BasicType bt = Matcher::vector_element_basic_type(this);
10556     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10557                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10558   %}
10559   ins_pipe( pipe_slow );
10560 %}
10561 
10562 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10563   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10564   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10565   ins_encode %{
10566     int vlen_enc = vector_length_encoding(this);
10567     BasicType bt = Matcher::vector_element_basic_type(this);
10568     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10569                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10570   %}
10571   ins_pipe( pipe_slow );
10572 %}
10573 
10574 instruct castMM(kReg dst)
10575 %{
10576   match(Set dst (CastVV dst));
10577 
10578   size(0);
10579   format %{ "# castVV of $dst" %}
10580   ins_encode(/* empty encoding */);
10581   ins_cost(0);
10582   ins_pipe(empty);
10583 %}
10584 
10585 instruct castVV(vec dst)
10586 %{
10587   match(Set dst (CastVV dst));
10588 
10589   size(0);
10590   format %{ "# castVV of $dst" %}
10591   ins_encode(/* empty encoding */);
10592   ins_cost(0);
10593   ins_pipe(empty);
10594 %}
10595 
10596 instruct castVVLeg(legVec dst)
10597 %{
10598   match(Set dst (CastVV dst));
10599 
10600   size(0);
10601   format %{ "# castVV of $dst" %}
10602   ins_encode(/* empty encoding */);
10603   ins_cost(0);
10604   ins_pipe(empty);
10605 %}
10606 
10607 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10608 %{
10609   match(Set dst (IsInfiniteF src));
10610   effect(TEMP ktmp, KILL cr);
10611   format %{ "float_class_check $dst, $src" %}
10612   ins_encode %{
10613     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10614     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10615   %}
10616   ins_pipe(pipe_slow);
10617 %}
10618 
10619 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10620 %{
10621   match(Set dst (IsInfiniteD src));
10622   effect(TEMP ktmp, KILL cr);
10623   format %{ "double_class_check $dst, $src" %}
10624   ins_encode %{
10625     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10626     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10627   %}
10628   ins_pipe(pipe_slow);
10629 %}
10630 
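      // Saturating add/sub: byte/short lanes use the packed saturating instructions
      // directly (the boolean selects the unsigned forms); int/long lanes have no such
      // instructions, so the dq helpers below emulate saturation with temporaries.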
10631 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10632 %{
10633   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10634             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10635   match(Set dst (SaturatingAddV src1 src2));
10636   match(Set dst (SaturatingSubV src1 src2));
10637   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10638   ins_encode %{
10639     int vlen_enc = vector_length_encoding(this);
10640     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10641     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10642                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10643   %}
10644   ins_pipe(pipe_slow);
10645 %}
10646 
10647 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10648 %{
10649   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10650             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10651   match(Set dst (SaturatingAddV src1 src2));
10652   match(Set dst (SaturatingSubV src1 src2));
10653   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10654   ins_encode %{
10655     int vlen_enc = vector_length_encoding(this);
10656     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10657     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10658                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10659   %}
10660   ins_pipe(pipe_slow);
10661 %}
10662 
10663 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10664 %{
10665   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10666             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10667             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10668   match(Set dst (SaturatingAddV src1 src2));
10669   match(Set dst (SaturatingSubV src1 src2));
10670   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10671   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10672   ins_encode %{
10673     int vlen_enc = vector_length_encoding(this);
10674     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10675     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10676                                         $src1$$XMMRegister, $src2$$XMMRegister,
10677                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10678                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10679   %}
10680   ins_pipe(pipe_slow);
10681 %}
10682 
10683 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10684 %{
10685   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10686             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10687             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10688   match(Set dst (SaturatingAddV src1 src2));
10689   match(Set dst (SaturatingSubV src1 src2));
10690   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10691   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10692   ins_encode %{
10693     int vlen_enc = vector_length_encoding(this);
10694     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10695     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10696                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10697                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10698   %}
10699   ins_pipe(pipe_slow);
10700 %}
10701 
10702 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10703 %{
10704   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10705             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10706             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10707   match(Set dst (SaturatingAddV src1 src2));
10708   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10709   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10710   ins_encode %{
10711     int vlen_enc = vector_length_encoding(this);
10712     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10713     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10714                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10715   %}
10716   ins_pipe(pipe_slow);
10717 %}
10718 
10719 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10720 %{
10721   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10722             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10723             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10724   match(Set dst (SaturatingAddV src1 src2));
10725   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10726   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10727   ins_encode %{
10728     int vlen_enc = vector_length_encoding(this);
10729     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10730     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10731                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10732   %}
10733   ins_pipe(pipe_slow);
10734 %}
10735 
10736 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10737 %{
10738   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10739             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10740             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10741   match(Set dst (SaturatingSubV src1 src2));
10742   effect(TEMP ktmp);
10743   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10744   ins_encode %{
10745     int vlen_enc = vector_length_encoding(this);
10746     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10747     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10748                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10749   %}
10750   ins_pipe(pipe_slow);
10751 %}
10752 
10753 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10754 %{
10755   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10756             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10757             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10758   match(Set dst (SaturatingSubV src1 src2));
10759   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10760   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10761   ins_encode %{
10762     int vlen_enc = vector_length_encoding(this);
10763     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10764     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10765                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10766   %}
10767   ins_pipe(pipe_slow);
10768 %}
10769 
10770 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10771 %{
10772   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10773             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10774   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10775   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10776   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10777   ins_encode %{
10778     int vlen_enc = vector_length_encoding(this);
10779     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10780     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10781                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10782   %}
10783   ins_pipe(pipe_slow);
10784 %}
10785 
10786 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10787 %{
10788   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10789             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10790   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10791   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10792   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10793   ins_encode %{
10794     int vlen_enc = vector_length_encoding(this);
10795     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10796     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10797                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10798   %}
10799   ins_pipe(pipe_slow);
10800 %}
10801 
10802 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10803   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10804             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10805   match(Set dst (SaturatingAddV (Binary dst src) mask));
10806   match(Set dst (SaturatingSubV (Binary dst src) mask));
10807   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10808   ins_encode %{
10809     int vlen_enc = vector_length_encoding(this);
10810     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10811     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10812                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10813   %}
10814   ins_pipe( pipe_slow );
10815 %}
10816 
10817 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10818   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10819             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10820   match(Set dst (SaturatingAddV (Binary dst src) mask));
10821   match(Set dst (SaturatingSubV (Binary dst src) mask));
10822   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10823   ins_encode %{
10824     int vlen_enc = vector_length_encoding(this);
10825     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10826     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10827                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10828   %}
10829   ins_pipe( pipe_slow );
10830 %}
10831 
10832 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10833   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10834             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10835   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10836   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10837   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10838   ins_encode %{
10839     int vlen_enc = vector_length_encoding(this);
10840     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10841     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10842                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10843   %}
10844   ins_pipe( pipe_slow );
10845 %}
10846 
10847 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10848   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10849             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10850   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10851   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10852   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10853   ins_encode %{
10854     int vlen_enc = vector_length_encoding(this);
10855     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10856     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10857                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10858   %}
10859   ins_pipe( pipe_slow );
10860 %}
10861 
10862 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10863 %{
10864   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10865   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10866   ins_encode %{
10867     int vlen_enc = vector_length_encoding(this);
10868     BasicType bt = Matcher::vector_element_basic_type(this);
10869     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10870   %}
10871   ins_pipe(pipe_slow);
10872 %}