//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding, VM register );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

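// As an illustration of the format above (a reading of the first entry
// further below, not an additional definition):
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the 32-bit slot XMM0 as save-on-call for both the register
// allocator and the C calling convention, spilled/reloaded as a float
// (Op_RegF), with hardware encoding 0, backed by the VMReg for xmm0.
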
// XMM registers.  512-bit registers, divided into sixteen 32-bit words each,
// labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The full registers are used by the SSE4.2 intrinsics, the array copy
// stubs and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   no XMM registers are preserved across function calls;
//              XMM0-XMM7 may hold parameters.
// Windows ABI: XMM6-XMM15 are preserved across function calls;
//              XMM0-XMM3 may hold parameters.

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
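// Note: k0 is intentionally not listed. In the EVEX encoding an opmask value
// of 0 means "no masking", so k0 cannot be allocated as a general write-mask
// register.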
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// The flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

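// The reg_class_dynamic entries below (and throughout this file) select the
// first (EVEX) register class when the given predicate holds, and fall back
// to the second (legacy) class otherwise.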
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  930 // Class for pre evex 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
  948 // Class for evex 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for pre evex 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
 1003 // Class for evex 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ ... %} and source_hpp %{ ... %} blocks freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
 1141     // NativeCall instruction size is the same as NativeJump.
 1142     // The exception handler starts out as a jump and can be patched to
 1143     // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
 1150     // three 5 byte instructions plus one move for unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
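      // Map a vector length in bytes to the assembler's AVX vector-length
      // encoding; lengths of 4 and 8 bytes still use the 128-bit (XMM) encoding.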
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
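      // Platform-dependent node flags, allocated above the last shared Node flag.
      // Flag_intel_jcc_erratum marks machine nodes that may need padding to work
      // around the Intel JCC erratum (see MachNode::compute_padding below); the
      // remaining flags record which EFLAGS condition bits an instruction sets
      // or clears.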
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
 1275   // Push the value of "the_pc" on the stack without destroying any
 1276   // registers, as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
 1586 // Identify extra cases that we might want to provide match rules for vector nodes and
 1587 // other intrinsics guarded with vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
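        // For example, a 32-element BYTE vector (256 bits) needs at least AVX2,
        // and a 64-element BYTE vector (512 bits) additionally needs AVX512BW.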
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
 1613       }
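            // fallthrough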
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
 1728       if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
 1765         // As per JLS section 5.1.3, narrowing conversions to sub-word types
 1766         // happen after an intermediate conversion to integer, and the special
 1767         // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128) {
 1835          return false;
 1836        }
 1837        if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1838          return false;
 1839        }
 1840        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1841          return false;
 1842        }
 1843        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1844          return false;
 1845        }
 1846        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1847          return false;
 1848        }
 1849        break;
 1850     case Op_MaskAll:
 1851       if (!VM_Version::supports_evex()) {
 1852         return false;
 1853       }
 1854       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1855         return false;
 1856       }
 1857       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1858         return false;
 1859       }
 1860       break;
 1861     case Op_VectorMaskCmp:
 1862       if (vlen < 2 || size_in_bits < 32) {
 1863         return false;
 1864       }
 1865       break;
 1866     case Op_CompressM:
 1867       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_CompressV:
 1872     case Op_ExpandV:
 1873       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1874         return false;
 1875       }
 1876       if (size_in_bits < 128) {
 1877         return false;
 1878       }
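            // fallthrough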
 1879     case Op_VectorLongToMask:
 1880       if (UseAVX < 1) {
 1881         return false;
 1882       }
 1883       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1884         return false;
 1885       }
 1886       break;
 1887     case Op_SignumVD:
 1888     case Op_SignumVF:
 1889       if (UseAVX < 1) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_PopCountVI:
 1894     case Op_PopCountVL: {
 1895         if (!is_pop_count_instr_target(bt) &&
 1896             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1897           return false;
 1898         }
 1899       }
 1900       break;
 1901     case Op_ReverseV:
 1902     case Op_ReverseBytesV:
 1903       if (UseAVX < 2) {
 1904         return false;
 1905       }
 1906       break;
 1907     case Op_CountTrailingZerosV:
 1908     case Op_CountLeadingZerosV:
 1909       if (UseAVX < 2) {
 1910         return false;
 1911       }
 1912       break;
 1913   }
 1914   return true;  // Match rules are supported by default.
 1915 }
 1916 
 1917 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
 1918   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
 1919   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
 1920   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
 1921   // This routine does a strict check on the existence of masked operation patterns
 1922   // by returning a default false value for all the other opcodes apart from the
 1923   // ones whose masked instruction patterns are defined in this file.
 1924   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1925     return false;
 1926   }
 1927 
 1928   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
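        // Masked operations on vector sizes other than 512 bits additionally
        // require AVX512VL.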
 1929   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1930     return false;
 1931   }
 1932   switch(opcode) {
 1933     // Unary masked operations
 1934     case Op_AbsVB:
 1935     case Op_AbsVS:
 1936       if (!VM_Version::supports_avx512bw()) {
 1937         return false;  // Implementation limitation
 1938       }
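            // fallthrough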
 1939     case Op_AbsVI:
 1940     case Op_AbsVL:
 1941       return true;
 1942 
 1943     // Ternary masked operations
 1944     case Op_FmaVF:
 1945     case Op_FmaVD:
 1946       return true;
 1947 
 1948     case Op_MacroLogicV:
 1949       if (bt != T_INT && bt != T_LONG) {
 1950         return false;
 1951       }
 1952       return true;
 1953 
 1954     // Binary masked operations
 1955     case Op_AddVB:
 1956     case Op_AddVS:
 1957     case Op_SubVB:
 1958     case Op_SubVS:
 1959     case Op_MulVS:
 1960     case Op_LShiftVS:
 1961     case Op_RShiftVS:
 1962     case Op_URShiftVS:
 1963       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1964       if (!VM_Version::supports_avx512bw()) {
 1965         return false;  // Implementation limitation
 1966       }
 1967       return true;
 1968 
 1969     case Op_MulVL:
 1970       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1971       if (!VM_Version::supports_avx512dq()) {
 1972         return false;  // Implementation limitation
 1973       }
 1974       return true;
 1975 
 1976     case Op_AndV:
 1977     case Op_OrV:
 1978     case Op_XorV:
 1979     case Op_RotateRightV:
 1980     case Op_RotateLeftV:
 1981       if (bt != T_INT && bt != T_LONG) {
 1982         return false; // Implementation limitation
 1983       }
 1984       return true;
 1985 
 1986     case Op_VectorLoadMask:
 1987       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1988       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1989         return false;
 1990       }
 1991       return true;
 1992 
 1993     case Op_AddVI:
 1994     case Op_AddVL:
 1995     case Op_AddVF:
 1996     case Op_AddVD:
 1997     case Op_SubVI:
 1998     case Op_SubVL:
 1999     case Op_SubVF:
 2000     case Op_SubVD:
 2001     case Op_MulVI:
 2002     case Op_MulVF:
 2003     case Op_MulVD:
 2004     case Op_DivVF:
 2005     case Op_DivVD:
 2006     case Op_SqrtVF:
 2007     case Op_SqrtVD:
 2008     case Op_LShiftVI:
 2009     case Op_LShiftVL:
 2010     case Op_RShiftVI:
 2011     case Op_RShiftVL:
 2012     case Op_URShiftVI:
 2013     case Op_URShiftVL:
 2014     case Op_LoadVectorMasked:
 2015     case Op_StoreVectorMasked:
 2016     case Op_LoadVectorGatherMasked:
 2017     case Op_StoreVectorScatterMasked:
 2018       return true;
 2019 
 2020     case Op_UMinV:
 2021     case Op_UMaxV:
 2022       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2023         return false;
 2024       } // fallthrough
 2025     case Op_MaxV:
 2026     case Op_MinV:
 2027       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2031         return false; // Implementation limitation
 2032       }
 2033       return true;
 2034     case Op_SaturatingAddV:
 2035     case Op_SaturatingSubV:
 2036       if (!is_subword_type(bt)) {
 2037         return false;
 2038       }
 2039       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2040         return false; // Implementation limitation
 2041       }
 2042       return true;
 2043 
 2044     case Op_VectorMaskCmp:
 2045       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2046         return false; // Implementation limitation
 2047       }
 2048       return true;
 2049 
 2050     case Op_VectorRearrange:
 2051       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2052         return false; // Implementation limitation
 2053       }
 2054       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2055         return false; // Implementation limitation
 2056       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2057         return false; // Implementation limitation
 2058       }
 2059       return true;
 2060 
 2061     // Binary Logical operations
 2062     case Op_AndVMask:
 2063     case Op_OrVMask:
 2064     case Op_XorVMask:
 2065       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_PopCountVI:
 2071     case Op_PopCountVL:
 2072       if (!is_pop_count_instr_target(bt)) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_MaskAll:
 2078       return true;
 2079 
 2080     case Op_CountLeadingZerosV:
 2081       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2082         return true;
 2083       }
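            // fallthrough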
 2084     default:
 2085       return false;
 2086   }
 2087 }
 2088 
 2089 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2090   return false;
 2091 }
 2092 
 2093 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2094 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2095   switch (elem_bt) {
 2096     case T_BYTE:  return false;
 2097     case T_SHORT: return !VM_Version::supports_avx512bw();
 2098     case T_INT:   return !VM_Version::supports_avx();
 2099     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2100     default:
 2101       ShouldNotReachHere();
 2102       return false;
 2103   }
 2104 }
 2105 
 2106 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2107   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2108   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2109   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2110       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2111     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2112     return new legVecZOper();
 2113   }
 2114   if (legacy) {
 2115     switch (ideal_reg) {
 2116       case Op_VecS: return new legVecSOper();
 2117       case Op_VecD: return new legVecDOper();
 2118       case Op_VecX: return new legVecXOper();
 2119       case Op_VecY: return new legVecYOper();
 2120       case Op_VecZ: return new legVecZOper();
 2121     }
 2122   } else {
 2123     switch (ideal_reg) {
 2124       case Op_VecS: return new vecSOper();
 2125       case Op_VecD: return new vecDOper();
 2126       case Op_VecX: return new vecXOper();
 2127       case Op_VecY: return new vecYOper();
 2128       case Op_VecZ: return new vecZOper();
 2129     }
 2130   }
 2131   ShouldNotReachHere();
 2132   return nullptr;
 2133 }
 2134 
 2135 bool Matcher::is_reg2reg_move(MachNode* m) {
 2136   switch (m->rule()) {
 2137     case MoveVec2Leg_rule:
 2138     case MoveLeg2Vec_rule:
 2139     case MoveF2VL_rule:
 2140     case MoveF2LEG_rule:
 2141     case MoveVL2F_rule:
 2142     case MoveLEG2F_rule:
 2143     case MoveD2VL_rule:
 2144     case MoveD2LEG_rule:
 2145     case MoveVL2D_rule:
 2146     case MoveLEG2D_rule:
 2147       return true;
 2148     default:
 2149       return false;
 2150   }
 2151 }
 2152 
 2153 bool Matcher::is_generic_vector(MachOper* opnd) {
 2154   switch (opnd->opcode()) {
 2155     case VEC:
 2156     case LEGVEC:
 2157       return true;
 2158     default:
 2159       return false;
 2160   }
 2161 }
 2162 
 2163 //------------------------------------------------------------------------
 2164 
 2165 const RegMask* Matcher::predicate_reg_mask(void) {
 2166   return &_VECTMASK_REG_mask;
 2167 }
 2168 
 2169 // Max vector size in bytes. 0 if not supported.
 2170 int Matcher::vector_width_in_bytes(BasicType bt) {
 2171   assert(is_java_primitive(bt), "only primitive type vectors");
 2172   // SSE2 supports 128bit vectors for all types.
 2173   // AVX2 supports 256bit vectors for all types.
 2174   // EVEX (AVX-512) supports 512bit vectors for all types.
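        // (1 << UseAVX) * 8 yields 32 bytes for UseAVX == 2 and 64 bytes for
        // UseAVX == 3; below AVX2 the SSE2 baseline of 16 bytes is used.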
 2175   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2176   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2177   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2178     size = (UseAVX > 2) ? 64 : 32;
 2179   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2180     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2181   // Use flag to limit vector size.
 2182   size = MIN2(size,(int)MaxVectorSize);
 2183   // Minimum 2 values in vector (or 4 for bytes).
 2184   switch (bt) {
 2185   case T_DOUBLE:
 2186   case T_LONG:
 2187     if (size < 16) return 0;
 2188     break;
 2189   case T_FLOAT:
 2190   case T_INT:
 2191     if (size < 8) return 0;
 2192     break;
 2193   case T_BOOLEAN:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_CHAR:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_BYTE:
 2200     if (size < 4) return 0;
 2201     break;
 2202   case T_SHORT:
 2203     if (size < 4) return 0;
 2204     break;
 2205   default:
 2206     ShouldNotReachHere();
 2207   }
 2208   return size;
 2209 }
 2210 
 2211 // Limits on vector size (number of elements) loaded into vector.
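      // For example, with a 32-byte vector width this gives 8 T_INT elements or
      // 32 T_BYTE elements.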
 2212 int Matcher::max_vector_size(const BasicType bt) {
 2213   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2214 }
 2215 int Matcher::min_vector_size(const BasicType bt) {
 2216   int max_size = max_vector_size(bt);
 2217   // Min size which can be loaded into vector is 4 bytes.
 2218   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2219   // Support for calling svml double64 vectors
 2220   if (bt == T_DOUBLE) {
 2221     size = 1;
 2222   }
 2223   return MIN2(size,max_size);
 2224 }
 2225 
 2226 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2227   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2228   // by default on Cascade Lake
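        // (e.g. for T_INT this caps auto-vectorization at 32 / 4 = 8 elements).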
 2229   if (VM_Version::is_default_intel_cascade_lake()) {
 2230     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2231   }
 2232   return Matcher::max_vector_size(bt);
 2233 }
 2234 
 2235 int Matcher::scalable_vector_reg_size(const BasicType bt) {
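        // x86 has no scalable (variable-length) vector registers.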
 2236   return -1;
 2237 }
 2238 
 2239 // Vector ideal reg corresponding to specified size in bytes
 2240 uint Matcher::vector_ideal_reg(int size) {
 2241   assert(MaxVectorSize >= size, "");
 2242   switch(size) {
 2243     case  4: return Op_VecS;
 2244     case  8: return Op_VecD;
 2245     case 16: return Op_VecX;
 2246     case 32: return Op_VecY;
 2247     case 64: return Op_VecZ;
 2248   }
 2249   ShouldNotReachHere();
 2250   return 0;
 2251 }
 2252 
 2253 // Also check for a shift by a small constant (0..3) that can be folded into an addressing-mode scale (1/2/4/8).
 2254 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2255   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2256       shift->in(2)->get_int() <= 3 &&
 2257       // Are there other uses besides address expressions?
 2258       !matcher->is_visited(shift)) {
 2259     address_visited.set(shift->_idx); // Flag as address_visited
 2260     mstack.push(shift->in(2), Matcher::Visit);
 2261     Node *conv = shift->in(1);
 2262     // Allow the Matcher to match the rule that bypasses the
 2263     // ConvI2L operation for an array index on LP64
 2264     // if the index value is positive.
 2265     if (conv->Opcode() == Op_ConvI2L &&
 2266         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2267         // Are there other uses besides address expressions?
 2268         !matcher->is_visited(conv)) {
 2269       address_visited.set(conv->_idx); // Flag as address_visited
 2270       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2271     } else {
 2272       mstack.push(conv, Matcher::Pre_Visit);
 2273     }
 2274     return true;
 2275   }
 2276   return false;
 2277 }
 2278 
 2279 // The code below identifies sub-graphs in which a 'load' node is
 2280 // input to two different nodes, such that the sub-graph can be matched
 2281 // with BMI instructions like blsi, blsr, etc.
 2282 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2283 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2284 // refers to the same node.
 2285 //
 2286 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2287 // This is a temporary solution until we make DAGs expressible in ADL.
 2288 template<typename ConType>
 2289 class FusedPatternMatcher {
 2290   Node* _op1_node;
 2291   Node* _mop_node;
 2292   int _con_op;
 2293 
 2294   static int match_next(Node* n, int next_op, int next_op_idx) {
 2295     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2296       return -1;
 2297     }
 2298 
 2299     if (next_op_idx == -1) { // n is commutative, try rotations
 2300       if (n->in(1)->Opcode() == next_op) {
 2301         return 1;
 2302       } else if (n->in(2)->Opcode() == next_op) {
 2303         return 2;
 2304       }
 2305     } else {
 2306       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2307       if (n->in(next_op_idx)->Opcode() == next_op) {
 2308         return next_op_idx;
 2309       }
 2310     }
 2311     return -1;
 2312   }
 2313 
 2314  public:
 2315   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2316     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2317 
 2318   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2319              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2320              typename ConType::NativeType con_value) {
 2321     if (_op1_node->Opcode() != op1) {
 2322       return false;
 2323     }
 2324     if (_mop_node->outcnt() > 2) {
 2325       return false;
 2326     }
 2327     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2328     if (op1_op2_idx == -1) {
 2329       return false;
 2330     }
 2331     // Memory operation must be the other edge
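          // ((op1_op2_idx & 1) + 1 maps 1 -> 2 and 2 -> 1).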
 2332     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2333 
 2334     // Check that the mop node is really what we want
 2335     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2336       Node* op2_node = _op1_node->in(op1_op2_idx);
 2337       if (op2_node->outcnt() > 1) {
 2338         return false;
 2339       }
 2340       assert(op2_node->Opcode() == op2, "Should be");
 2341       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2342       if (op2_con_idx == -1) {
 2343         return false;
 2344       }
 2345       // Memory operation must be the other edge
 2346       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2347       // Check that the memory operation is the same node
 2348       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2349         // Now check the constant
 2350         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2351         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2352           return true;
 2353         }
 2354       }
 2355     }
 2356     return false;
 2357   }
 2358 };
 2359 
 2360 static bool is_bmi_pattern(Node* n, Node* m) {
 2361   assert(UseBMI1Instructions, "sanity");
 2362   if (n != nullptr && m != nullptr) {
 2363     if (m->Opcode() == Op_LoadI) {
 2364       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
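            // blsi computes x & -x, blsr computes x & (x - 1), blsmsk computes x ^ (x - 1).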
 2365       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2366              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2367              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2368     } else if (m->Opcode() == Op_LoadL) {
 2369       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2370       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2371              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2372              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2373     }
 2374   }
 2375   return false;
 2376 }
 2377 
 2378 // Should the matcher clone input 'm' of node 'n'?
 2379 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2380   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2381   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2382     mstack.push(m, Visit);
 2383     return true;
 2384   }
 2385   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2386     mstack.push(m, Visit);           // m = ShiftCntV
 2387     return true;
 2388   }
 2389   if (is_encode_and_store_pattern(n, m)) {
 2390     mstack.push(m, Visit);
 2391     return true;
 2392   }
 2393   return false;
 2394 }
 2395 
 2396 // Should the Matcher clone shifts on addressing modes, expecting them
 2397 // to be subsumed into complex addressing expressions, or should it
 2398 // compute them into registers?
 2399 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2400   Node *off = m->in(AddPNode::Offset);
 2401   if (off->is_Con()) {
 2402     address_visited.test_set(m->_idx); // Flag as address_visited
 2403     Node *adr = m->in(AddPNode::Address);
 2404 
 2405     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2406     // AtomicAdd is not an addressing expression.
 2407     // Cheap to find it by looking for screwy base.
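          // Typical shape: (AddP (AddP base (LShiftX idx con)) off), which folds into [base + idx << con + off].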
 2408     if (adr->is_AddP() &&
 2409         !adr->in(AddPNode::Base)->is_top() &&
 2410         !adr->in(AddPNode::Offset)->is_Con() &&
 2411         off->get_long() == (int) (off->get_long()) && // immL32
 2412         // Are there other uses besides address expressions?
 2413         !is_visited(adr)) {
 2414       address_visited.set(adr->_idx); // Flag as address_visited
 2415       Node *shift = adr->in(AddPNode::Offset);
 2416       if (!clone_shift(shift, this, mstack, address_visited)) {
 2417         mstack.push(shift, Pre_Visit);
 2418       }
 2419       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2420       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2421     } else {
 2422       mstack.push(adr, Pre_Visit);
 2423     }
 2424 
 2425     // Clone X+offset as it also folds into most addressing expressions
 2426     mstack.push(off, Visit);
 2427     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2428     return true;
 2429   } else if (clone_shift(off, this, mstack, address_visited)) {
 2430     address_visited.test_set(m->_idx); // Flag as address_visited
 2431     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2432     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2433     return true;
 2434   }
 2435   return false;
 2436 }
 2437 
 2438 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2439   switch (bt) {
 2440     case BoolTest::eq:
 2441       return Assembler::eq;
 2442     case BoolTest::ne:
 2443       return Assembler::neq;
 2444     case BoolTest::le:
 2445     case BoolTest::ule:
 2446       return Assembler::le;
 2447     case BoolTest::ge:
 2448     case BoolTest::uge:
 2449       return Assembler::nlt;
 2450     case BoolTest::lt:
 2451     case BoolTest::ult:
 2452       return Assembler::lt;
 2453     case BoolTest::gt:
 2454     case BoolTest::ugt:
 2455       return Assembler::nle;
 2456     default : ShouldNotReachHere(); return Assembler::_false;
 2457   }
 2458 }
 2459 
 2460 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2461   switch (bt) {
 2462   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2463   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2464   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2465   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2466   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2467   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2468   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2469   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2470   }
 2471 }
 2472 
 2473 // Helper methods for MachSpillCopyNode::implementation().
 2474 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2475                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2476   assert(ireg == Op_VecS || // 32bit vector
 2477          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2478           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2479          "no non-adjacent vector moves" );
 2480   if (masm) {
 2481     switch (ireg) {
 2482     case Op_VecS: // copy whole register
 2483     case Op_VecD:
 2484     case Op_VecX:
 2485       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2486         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2487       } else {
 2488         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2489       }
 2490       break;
 2491     case Op_VecY:
 2492       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2493         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2494       } else {
 2495         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2496       }
 2497       break;
 2498     case Op_VecZ:
 2499       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2500       break;
 2501     default:
 2502       ShouldNotReachHere();
 2503     }
 2504 #ifndef PRODUCT
 2505   } else {
 2506     switch (ireg) {
 2507     case Op_VecS:
 2508     case Op_VecD:
 2509     case Op_VecX:
 2510       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2511       break;
 2512     case Op_VecY:
 2513     case Op_VecZ:
 2514       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2515       break;
 2516     default:
 2517       ShouldNotReachHere();
 2518     }
 2519 #endif
 2520   }
 2521 }
 2522 
 2523 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2524                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2525   if (masm) {
 2526     if (is_load) {
 2527       switch (ireg) {
 2528       case Op_VecS:
 2529         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecD:
 2532         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2533         break;
 2534       case Op_VecX:
 2535         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2536           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2537         } else {
 2538           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2539           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2540         }
 2541         break;
 2542       case Op_VecY:
 2543         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2544           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2545         } else {
 2546           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2547           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2548         }
 2549         break;
 2550       case Op_VecZ:
 2551         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2552         break;
 2553       default:
 2554         ShouldNotReachHere();
 2555       }
 2556     } else { // store
 2557       switch (ireg) {
 2558       case Op_VecS:
 2559         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecD:
 2562         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2563         break;
 2564       case Op_VecX:
 2565         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2566           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2567         }
 2568         else {
 2569           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2570         }
 2571         break;
 2572       case Op_VecY:
 2573         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2574           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2575         }
 2576         else {
 2577           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2578         }
 2579         break;
 2580       case Op_VecZ:
 2581         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2582         break;
 2583       default:
 2584         ShouldNotReachHere();
 2585       }
 2586     }
 2587 #ifndef PRODUCT
 2588   } else {
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594       case Op_VecD:
 2595         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
 2597       case Op_VecX:
 2598         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2599         break;
 2600       case Op_VecY:
 2601       case Op_VecZ:
 2602         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2603         break;
 2604       default:
 2605         ShouldNotReachHere();
 2606       }
 2607     } else { // store
 2608       switch (ireg) {
 2609       case Op_VecS:
 2610         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612       case Op_VecD:
 2613         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
 2615       case Op_VecX:
 2616         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2617         break;
 2618       case Op_VecY:
 2619       case Op_VecZ:
 2620         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2621         break;
 2622       default:
 2623         ShouldNotReachHere();
 2624       }
 2625     }
 2626 #endif
 2627   }
 2628 }
 2629 
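      // Builds the byte image of constant 'con' replicated 'len' times for element type 'bt',
      // for materializing replicated vector constants (e.g. in the constant table).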
 2630 template <class T>
 2631 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2632   int size = type2aelembytes(bt) * len;
 2633   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2634   for (int i = 0; i < len; i++) {
 2635     int offset = i * type2aelembytes(bt);
 2636     switch (bt) {
 2637       case T_BYTE: val->at(i) = con; break;
 2638       case T_SHORT: {
 2639         jshort c = con;
 2640         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2641         break;
 2642       }
 2643       case T_INT: {
 2644         jint c = con;
 2645         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2646         break;
 2647       }
 2648       case T_LONG: {
 2649         jlong c = con;
 2650         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2651         break;
 2652       }
 2653       case T_FLOAT: {
 2654         jfloat c = con;
 2655         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2656         break;
 2657       }
 2658       case T_DOUBLE: {
 2659         jdouble c = con;
 2660         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2661         break;
 2662       }
 2663       default: assert(false, "%s", type2name(bt));
 2664     }
 2665   }
 2666   return val;
 2667 }
 2668 
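      // Returns a 64-bit pattern with the sign bit of each 'bt'-sized lane set.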
 2669 static inline jlong high_bit_set(BasicType bt) {
 2670   switch (bt) {
 2671     case T_BYTE:  return 0x8080808080808080;
 2672     case T_SHORT: return 0x8000800080008000;
 2673     case T_INT:   return 0x8000000080000000;
 2674     case T_LONG:  return 0x8000000000000000;
 2675     default:
 2676       ShouldNotReachHere();
 2677       return 0;
 2678   }
 2679 }
 2680 
 2681 #ifndef PRODUCT
 2682   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2683     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2684   }
 2685 #endif
 2686 
 2687   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2688     __ nop(_count);
 2689   }
 2690 
 2691   uint MachNopNode::size(PhaseRegAlloc*) const {
 2692     return _count;
 2693   }
 2694 
 2695 #ifndef PRODUCT
 2696   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2697     st->print("# breakpoint");
 2698   }
 2699 #endif
 2700 
 2701   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2702     __ int3();
 2703   }
 2704 
 2705   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2706     return MachNode::size(ra_);
 2707   }
 2708 
 2709 %}
 2710 
 2711 encode %{
 2712 
 2713   enc_class call_epilog %{
 2714     if (VerifyStackAtCalls) {
 2715       // Check that stack depth is unchanged: find majik cookie on stack
 2716       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2717       Label L;
 2718       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2719       __ jccb(Assembler::equal, L);
 2720       // Die if stack mismatch
 2721       __ int3();
 2722       __ bind(L);
 2723     }
 2724     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic() && _method->return_type()->is_loaded()) {
 2725       // The last return value is not set by the callee but is used to pass the null marker to compiled code.
 2726       // Search for the corresponding projection, get the register, and emit code that initializes it.
 2727       uint con = (tf()->range_cc()->cnt() - 1);
 2728       for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2729         ProjNode* proj = fast_out(i)->as_Proj();
 2730         if (proj->_con == con) {
 2731           // Set null marker if rax is non-null (a non-null value is returned buffered or scalarized)
 2732           OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2733           VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2734           Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2735           __ testq(rax, rax);
 2736           __ setb(Assembler::notZero, toReg);
 2737           __ movzbl(toReg, toReg);
 2738           if (reg->is_stack()) {
 2739             int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2740             __ movq(Address(rsp, st_off), toReg);
 2741           }
 2742           break;
 2743         }
 2744       }
 2745       if (return_value_is_used()) {
 2746         // An inline type is returned as fields in multiple registers.
 2747         // Rax either contains an oop if the inline type is buffered or a pointer
 2748         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2749         // if the lowest bit is set, so that C2 can use the oop after null checking.
 2750         // rax &= (rax & 1) - 1
 2751         __ movptr(rscratch1, rax);
 2752         __ andptr(rscratch1, 0x1);
 2753         __ subptr(rscratch1, 0x1);
 2754         __ andptr(rax, rscratch1);
 2755       }
 2756     }
 2757   %}
 2758 
 2759 %}
 2760 
 2761 // Operands for bound floating-point register arguments
 2762 operand rxmm0() %{
 2763   constraint(ALLOC_IN_RC(xmm0_reg));
 2764   match(VecX);
 2765   format %{ %}
 2766   interface(REG_INTER);
 2767 %}
 2768 
 2769 //----------OPERANDS-----------------------------------------------------------
 2770 // Operand definitions must precede instruction definitions for correct parsing
 2771 // in the ADLC because operands constitute user defined types which are used in
 2772 // instruction definitions.
 2773 
 2774 // Vectors
 2775 
 2776 // Dummy generic vector class. Should be used for all vector operands.
 2777 // Replaced with vec[SDXYZ] during post-selection pass.
 2778 operand vec() %{
 2779   constraint(ALLOC_IN_RC(dynamic));
 2780   match(VecX);
 2781   match(VecY);
 2782   match(VecZ);
 2783   match(VecS);
 2784   match(VecD);
 2785 
 2786   format %{ %}
 2787   interface(REG_INTER);
 2788 %}
 2789 
 2790 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2791 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2792 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2793 // runtime code generation via reg_class_dynamic.
 2794 operand legVec() %{
 2795   constraint(ALLOC_IN_RC(dynamic));
 2796   match(VecX);
 2797   match(VecY);
 2798   match(VecZ);
 2799   match(VecS);
 2800   match(VecD);
 2801 
 2802   format %{ %}
 2803   interface(REG_INTER);
 2804 %}
 2805 
 2806 // Replaces vec during post-selection cleanup. See above.
 2807 operand vecS() %{
 2808   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2809   match(VecS);
 2810 
 2811   format %{ %}
 2812   interface(REG_INTER);
 2813 %}
 2814 
 2815 // Replaces legVec during post-selection cleanup. See above.
 2816 operand legVecS() %{
 2817   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2818   match(VecS);
 2819 
 2820   format %{ %}
 2821   interface(REG_INTER);
 2822 %}
 2823 
 2824 // Replaces vec during post-selection cleanup. See above.
 2825 operand vecD() %{
 2826   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2827   match(VecD);
 2828 
 2829   format %{ %}
 2830   interface(REG_INTER);
 2831 %}
 2832 
 2833 // Replaces legVec during post-selection cleanup. See above.
 2834 operand legVecD() %{
 2835   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2836   match(VecD);
 2837 
 2838   format %{ %}
 2839   interface(REG_INTER);
 2840 %}
 2841 
 2842 // Replaces vec during post-selection cleanup. See above.
 2843 operand vecX() %{
 2844   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2845   match(VecX);
 2846 
 2847   format %{ %}
 2848   interface(REG_INTER);
 2849 %}
 2850 
 2851 // Replaces legVec during post-selection cleanup. See above.
 2852 operand legVecX() %{
 2853   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2854   match(VecX);
 2855 
 2856   format %{ %}
 2857   interface(REG_INTER);
 2858 %}
 2859 
 2860 // Replaces vec during post-selection cleanup. See above.
 2861 operand vecY() %{
 2862   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2863   match(VecY);
 2864 
 2865   format %{ %}
 2866   interface(REG_INTER);
 2867 %}
 2868 
 2869 // Replaces legVec during post-selection cleanup. See above.
 2870 operand legVecY() %{
 2871   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2872   match(VecY);
 2873 
 2874   format %{ %}
 2875   interface(REG_INTER);
 2876 %}
 2877 
 2878 // Replaces vec during post-selection cleanup. See above.
 2879 operand vecZ() %{
 2880   constraint(ALLOC_IN_RC(vectorz_reg));
 2881   match(VecZ);
 2882 
 2883   format %{ %}
 2884   interface(REG_INTER);
 2885 %}
 2886 
 2887 // Replaces legVec during post-selection cleanup. See above.
 2888 operand legVecZ() %{
 2889   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2890   match(VecZ);
 2891 
 2892   format %{ %}
 2893   interface(REG_INTER);
 2894 %}
 2895 
 2896 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2897 
 2898 // ============================================================================
 2899 
 2900 instruct ShouldNotReachHere() %{
 2901   match(Halt);
 2902   format %{ "stop\t# ShouldNotReachHere" %}
 2903   ins_encode %{
 2904     if (is_reachable()) {
 2905       const char* str = __ code_string(_halt_reason);
 2906       __ stop(str);
 2907     }
 2908   %}
 2909   ins_pipe(pipe_slow);
 2910 %}
 2911 
 2912 // ============================================================================
 2913 
 2914 instruct addF_reg(regF dst, regF src) %{
 2915   predicate(UseAVX == 0);
 2916   match(Set dst (AddF dst src));
 2917 
 2918   format %{ "addss   $dst, $src" %}
 2919   ins_cost(150);
 2920   ins_encode %{
 2921     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2922   %}
 2923   ins_pipe(pipe_slow);
 2924 %}
 2925 
 2926 instruct addF_mem(regF dst, memory src) %{
 2927   predicate(UseAVX == 0);
 2928   match(Set dst (AddF dst (LoadF src)));
 2929 
 2930   format %{ "addss   $dst, $src" %}
 2931   ins_cost(150);
 2932   ins_encode %{
 2933     __ addss($dst$$XMMRegister, $src$$Address);
 2934   %}
 2935   ins_pipe(pipe_slow);
 2936 %}
 2937 
 2938 instruct addF_imm(regF dst, immF con) %{
 2939   predicate(UseAVX == 0);
 2940   match(Set dst (AddF dst con));
 2941   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2942   ins_cost(150);
 2943   ins_encode %{
 2944     __ addss($dst$$XMMRegister, $constantaddress($con));
 2945   %}
 2946   ins_pipe(pipe_slow);
 2947 %}
 2948 
 2949 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2950   predicate(UseAVX > 0);
 2951   match(Set dst (AddF src1 src2));
 2952 
 2953   format %{ "vaddss  $dst, $src1, $src2" %}
 2954   ins_cost(150);
 2955   ins_encode %{
 2956     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2957   %}
 2958   ins_pipe(pipe_slow);
 2959 %}
 2960 
 2961 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2962   predicate(UseAVX > 0);
 2963   match(Set dst (AddF src1 (LoadF src2)));
 2964 
 2965   format %{ "vaddss  $dst, $src1, $src2" %}
 2966   ins_cost(150);
 2967   ins_encode %{
 2968     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2969   %}
 2970   ins_pipe(pipe_slow);
 2971 %}
 2972 
 2973 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2974   predicate(UseAVX > 0);
 2975   match(Set dst (AddF src con));
 2976 
 2977   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2978   ins_cost(150);
 2979   ins_encode %{
 2980     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2981   %}
 2982   ins_pipe(pipe_slow);
 2983 %}
 2984 
 2985 instruct addD_reg(regD dst, regD src) %{
 2986   predicate(UseAVX == 0);
 2987   match(Set dst (AddD dst src));
 2988 
 2989   format %{ "addsd   $dst, $src" %}
 2990   ins_cost(150);
 2991   ins_encode %{
 2992     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2993   %}
 2994   ins_pipe(pipe_slow);
 2995 %}
 2996 
 2997 instruct addD_mem(regD dst, memory src) %{
 2998   predicate(UseAVX == 0);
 2999   match(Set dst (AddD dst (LoadD src)));
 3000 
 3001   format %{ "addsd   $dst, $src" %}
 3002   ins_cost(150);
 3003   ins_encode %{
 3004     __ addsd($dst$$XMMRegister, $src$$Address);
 3005   %}
 3006   ins_pipe(pipe_slow);
 3007 %}
 3008 
 3009 instruct addD_imm(regD dst, immD con) %{
 3010   predicate(UseAVX == 0);
 3011   match(Set dst (AddD dst con));
 3012   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3013   ins_cost(150);
 3014   ins_encode %{
 3015     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3016   %}
 3017   ins_pipe(pipe_slow);
 3018 %}
 3019 
 3020 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3021   predicate(UseAVX > 0);
 3022   match(Set dst (AddD src1 src2));
 3023 
 3024   format %{ "vaddsd  $dst, $src1, $src2" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3033   predicate(UseAVX > 0);
 3034   match(Set dst (AddD src1 (LoadD src2)));
 3035 
 3036   format %{ "vaddsd  $dst, $src1, $src2" %}
 3037   ins_cost(150);
 3038   ins_encode %{
 3039     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3040   %}
 3041   ins_pipe(pipe_slow);
 3042 %}
 3043 
 3044 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3045   predicate(UseAVX > 0);
 3046   match(Set dst (AddD src con));
 3047 
 3048   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3049   ins_cost(150);
 3050   ins_encode %{
 3051     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3052   %}
 3053   ins_pipe(pipe_slow);
 3054 %}
 3055 
 3056 instruct subF_reg(regF dst, regF src) %{
 3057   predicate(UseAVX == 0);
 3058   match(Set dst (SubF dst src));
 3059 
 3060   format %{ "subss   $dst, $src" %}
 3061   ins_cost(150);
 3062   ins_encode %{
 3063     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3064   %}
 3065   ins_pipe(pipe_slow);
 3066 %}
 3067 
 3068 instruct subF_mem(regF dst, memory src) %{
 3069   predicate(UseAVX == 0);
 3070   match(Set dst (SubF dst (LoadF src)));
 3071 
 3072   format %{ "subss   $dst, $src" %}
 3073   ins_cost(150);
 3074   ins_encode %{
 3075     __ subss($dst$$XMMRegister, $src$$Address);
 3076   %}
 3077   ins_pipe(pipe_slow);
 3078 %}
 3079 
 3080 instruct subF_imm(regF dst, immF con) %{
 3081   predicate(UseAVX == 0);
 3082   match(Set dst (SubF dst con));
 3083   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3084   ins_cost(150);
 3085   ins_encode %{
 3086     __ subss($dst$$XMMRegister, $constantaddress($con));
 3087   %}
 3088   ins_pipe(pipe_slow);
 3089 %}
 3090 
 3091 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3092   predicate(UseAVX > 0);
 3093   match(Set dst (SubF src1 src2));
 3094 
 3095   format %{ "vsubss  $dst, $src1, $src2" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3104   predicate(UseAVX > 0);
 3105   match(Set dst (SubF src1 (LoadF src2)));
 3106 
 3107   format %{ "vsubss  $dst, $src1, $src2" %}
 3108   ins_cost(150);
 3109   ins_encode %{
 3110     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3111   %}
 3112   ins_pipe(pipe_slow);
 3113 %}
 3114 
 3115 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3116   predicate(UseAVX > 0);
 3117   match(Set dst (SubF src con));
 3118 
 3119   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3120   ins_cost(150);
 3121   ins_encode %{
 3122     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3123   %}
 3124   ins_pipe(pipe_slow);
 3125 %}
 3126 
 3127 instruct subD_reg(regD dst, regD src) %{
 3128   predicate(UseAVX == 0);
 3129   match(Set dst (SubD dst src));
 3130 
 3131   format %{ "subsd   $dst, $src" %}
 3132   ins_cost(150);
 3133   ins_encode %{
 3134     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3135   %}
 3136   ins_pipe(pipe_slow);
 3137 %}
 3138 
 3139 instruct subD_mem(regD dst, memory src) %{
 3140   predicate(UseAVX == 0);
 3141   match(Set dst (SubD dst (LoadD src)));
 3142 
 3143   format %{ "subsd   $dst, $src" %}
 3144   ins_cost(150);
 3145   ins_encode %{
 3146     __ subsd($dst$$XMMRegister, $src$$Address);
 3147   %}
 3148   ins_pipe(pipe_slow);
 3149 %}
 3150 
 3151 instruct subD_imm(regD dst, immD con) %{
 3152   predicate(UseAVX == 0);
 3153   match(Set dst (SubD dst con));
 3154   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3155   ins_cost(150);
 3156   ins_encode %{
 3157     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3158   %}
 3159   ins_pipe(pipe_slow);
 3160 %}
 3161 
 3162 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3163   predicate(UseAVX > 0);
 3164   match(Set dst (SubD src1 src2));
 3165 
 3166   format %{ "vsubsd  $dst, $src1, $src2" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3175   predicate(UseAVX > 0);
 3176   match(Set dst (SubD src1 (LoadD src2)));
 3177 
 3178   format %{ "vsubsd  $dst, $src1, $src2" %}
 3179   ins_cost(150);
 3180   ins_encode %{
 3181     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3182   %}
 3183   ins_pipe(pipe_slow);
 3184 %}
 3185 
 3186 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3187   predicate(UseAVX > 0);
 3188   match(Set dst (SubD src con));
 3189 
 3190   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3191   ins_cost(150);
 3192   ins_encode %{
 3193     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3194   %}
 3195   ins_pipe(pipe_slow);
 3196 %}
 3197 
 3198 instruct mulF_reg(regF dst, regF src) %{
 3199   predicate(UseAVX == 0);
 3200   match(Set dst (MulF dst src));
 3201 
 3202   format %{ "mulss   $dst, $src" %}
 3203   ins_cost(150);
 3204   ins_encode %{
 3205     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3206   %}
 3207   ins_pipe(pipe_slow);
 3208 %}
 3209 
 3210 instruct mulF_mem(regF dst, memory src) %{
 3211   predicate(UseAVX == 0);
 3212   match(Set dst (MulF dst (LoadF src)));
 3213 
 3214   format %{ "mulss   $dst, $src" %}
 3215   ins_cost(150);
 3216   ins_encode %{
 3217     __ mulss($dst$$XMMRegister, $src$$Address);
 3218   %}
 3219   ins_pipe(pipe_slow);
 3220 %}
 3221 
 3222 instruct mulF_imm(regF dst, immF con) %{
 3223   predicate(UseAVX == 0);
 3224   match(Set dst (MulF dst con));
 3225   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3226   ins_cost(150);
 3227   ins_encode %{
 3228     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3229   %}
 3230   ins_pipe(pipe_slow);
 3231 %}
 3232 
 3233 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3234   predicate(UseAVX > 0);
 3235   match(Set dst (MulF src1 src2));
 3236 
 3237   format %{ "vmulss  $dst, $src1, $src2" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3246   predicate(UseAVX > 0);
 3247   match(Set dst (MulF src1 (LoadF src2)));
 3248 
 3249   format %{ "vmulss  $dst, $src1, $src2" %}
 3250   ins_cost(150);
 3251   ins_encode %{
 3252     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3253   %}
 3254   ins_pipe(pipe_slow);
 3255 %}
 3256 
 3257 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3258   predicate(UseAVX > 0);
 3259   match(Set dst (MulF src con));
 3260 
 3261   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3262   ins_cost(150);
 3263   ins_encode %{
 3264     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3265   %}
 3266   ins_pipe(pipe_slow);
 3267 %}
 3268 
 3269 instruct mulD_reg(regD dst, regD src) %{
 3270   predicate(UseAVX == 0);
 3271   match(Set dst (MulD dst src));
 3272 
 3273   format %{ "mulsd   $dst, $src" %}
 3274   ins_cost(150);
 3275   ins_encode %{
 3276     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3277   %}
 3278   ins_pipe(pipe_slow);
 3279 %}
 3280 
 3281 instruct mulD_mem(regD dst, memory src) %{
 3282   predicate(UseAVX == 0);
 3283   match(Set dst (MulD dst (LoadD src)));
 3284 
 3285   format %{ "mulsd   $dst, $src" %}
 3286   ins_cost(150);
 3287   ins_encode %{
 3288     __ mulsd($dst$$XMMRegister, $src$$Address);
 3289   %}
 3290   ins_pipe(pipe_slow);
 3291 %}
 3292 
 3293 instruct mulD_imm(regD dst, immD con) %{
 3294   predicate(UseAVX == 0);
 3295   match(Set dst (MulD dst con));
 3296   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3297   ins_cost(150);
 3298   ins_encode %{
 3299     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3300   %}
 3301   ins_pipe(pipe_slow);
 3302 %}
 3303 
 3304 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3305   predicate(UseAVX > 0);
 3306   match(Set dst (MulD src1 src2));
 3307 
 3308   format %{ "vmulsd  $dst, $src1, $src2" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3317   predicate(UseAVX > 0);
 3318   match(Set dst (MulD src1 (LoadD src2)));
 3319 
 3320   format %{ "vmulsd  $dst, $src1, $src2" %}
 3321   ins_cost(150);
 3322   ins_encode %{
 3323     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3324   %}
 3325   ins_pipe(pipe_slow);
 3326 %}
 3327 
 3328 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3329   predicate(UseAVX > 0);
 3330   match(Set dst (MulD src con));
 3331 
 3332   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3333   ins_cost(150);
 3334   ins_encode %{
 3335     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3336   %}
 3337   ins_pipe(pipe_slow);
 3338 %}
 3339 
 3340 instruct divF_reg(regF dst, regF src) %{
 3341   predicate(UseAVX == 0);
 3342   match(Set dst (DivF dst src));
 3343 
 3344   format %{ "divss   $dst, $src" %}
 3345   ins_cost(150);
 3346   ins_encode %{
 3347     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3348   %}
 3349   ins_pipe(pipe_slow);
 3350 %}
 3351 
 3352 instruct divF_mem(regF dst, memory src) %{
 3353   predicate(UseAVX == 0);
 3354   match(Set dst (DivF dst (LoadF src)));
 3355 
 3356   format %{ "divss   $dst, $src" %}
 3357   ins_cost(150);
 3358   ins_encode %{
 3359     __ divss($dst$$XMMRegister, $src$$Address);
 3360   %}
 3361   ins_pipe(pipe_slow);
 3362 %}
 3363 
 3364 instruct divF_imm(regF dst, immF con) %{
 3365   predicate(UseAVX == 0);
 3366   match(Set dst (DivF dst con));
 3367   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3368   ins_cost(150);
 3369   ins_encode %{
 3370     __ divss($dst$$XMMRegister, $constantaddress($con));
 3371   %}
 3372   ins_pipe(pipe_slow);
 3373 %}
 3374 
 3375 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3376   predicate(UseAVX > 0);
 3377   match(Set dst (DivF src1 src2));
 3378 
 3379   format %{ "vdivss  $dst, $src1, $src2" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3388   predicate(UseAVX > 0);
 3389   match(Set dst (DivF src1 (LoadF src2)));
 3390 
 3391   format %{ "vdivss  $dst, $src1, $src2" %}
 3392   ins_cost(150);
 3393   ins_encode %{
 3394     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3395   %}
 3396   ins_pipe(pipe_slow);
 3397 %}
 3398 
 3399 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3400   predicate(UseAVX > 0);
 3401   match(Set dst (DivF src con));
 3402 
 3403   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3404   ins_cost(150);
 3405   ins_encode %{
 3406     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3407   %}
 3408   ins_pipe(pipe_slow);
 3409 %}
 3410 
 3411 instruct divD_reg(regD dst, regD src) %{
 3412   predicate(UseAVX == 0);
 3413   match(Set dst (DivD dst src));
 3414 
 3415   format %{ "divsd   $dst, $src" %}
 3416   ins_cost(150);
 3417   ins_encode %{
 3418     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3419   %}
 3420   ins_pipe(pipe_slow);
 3421 %}
 3422 
 3423 instruct divD_mem(regD dst, memory src) %{
 3424   predicate(UseAVX == 0);
 3425   match(Set dst (DivD dst (LoadD src)));
 3426 
 3427   format %{ "divsd   $dst, $src" %}
 3428   ins_cost(150);
 3429   ins_encode %{
 3430     __ divsd($dst$$XMMRegister, $src$$Address);
 3431   %}
 3432   ins_pipe(pipe_slow);
 3433 %}
 3434 
 3435 instruct divD_imm(regD dst, immD con) %{
 3436   predicate(UseAVX == 0);
 3437   match(Set dst (DivD dst con));
 3438   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3439   ins_cost(150);
 3440   ins_encode %{
 3441     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3442   %}
 3443   ins_pipe(pipe_slow);
 3444 %}
 3445 
 3446 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3447   predicate(UseAVX > 0);
 3448   match(Set dst (DivD src1 src2));
 3449 
 3450   format %{ "vdivsd  $dst, $src1, $src2" %}
 3451   ins_cost(150);
 3452   ins_encode %{
 3453     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3454   %}
 3455   ins_pipe(pipe_slow);
 3456 %}
 3457 
 3458 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3459   predicate(UseAVX > 0);
 3460   match(Set dst (DivD src1 (LoadD src2)));
 3461 
 3462   format %{ "vdivsd  $dst, $src1, $src2" %}
 3463   ins_cost(150);
 3464   ins_encode %{
 3465     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3466   %}
 3467   ins_pipe(pipe_slow);
 3468 %}
 3469 
 3470 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3471   predicate(UseAVX > 0);
 3472   match(Set dst (DivD src con));
 3473 
 3474   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3475   ins_cost(150);
 3476   ins_encode %{
 3477     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3478   %}
 3479   ins_pipe(pipe_slow);
 3480 %}
 3481 
 3482 instruct absF_reg(regF dst) %{
 3483   predicate(UseAVX == 0);
 3484   match(Set dst (AbsF dst));
 3485   ins_cost(150);
 3486   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3487   ins_encode %{
 3488     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3489   %}
 3490   ins_pipe(pipe_slow);
 3491 %}
 3492 
 3493 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3494   predicate(UseAVX > 0);
 3495   match(Set dst (AbsF src));
 3496   ins_cost(150);
 3497   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3498   ins_encode %{
 3499     int vlen_enc = Assembler::AVX_128bit;
 3500     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3501               ExternalAddress(float_signmask()), vlen_enc);
 3502   %}
 3503   ins_pipe(pipe_slow);
 3504 %}
 3505 
 3506 instruct absD_reg(regD dst) %{
 3507   predicate(UseAVX == 0);
 3508   match(Set dst (AbsD dst));
 3509   ins_cost(150);
 3510   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3511             "# abs double by sign masking" %}
 3512   ins_encode %{
 3513     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3514   %}
 3515   ins_pipe(pipe_slow);
 3516 %}
 3517 
 3518 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3519   predicate(UseAVX > 0);
 3520   match(Set dst (AbsD src));
 3521   ins_cost(150);
 3522   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3523             "# abs double by sign masking" %}
 3524   ins_encode %{
 3525     int vlen_enc = Assembler::AVX_128bit;
 3526     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3527               ExternalAddress(double_signmask()), vlen_enc);
 3528   %}
 3529   ins_pipe(pipe_slow);
 3530 %}
 3531 
 3532 instruct negF_reg(regF dst) %{
 3533   predicate(UseAVX == 0);
 3534   match(Set dst (NegF dst));
 3535   ins_cost(150);
 3536   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3537   ins_encode %{
 3538     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3539   %}
 3540   ins_pipe(pipe_slow);
 3541 %}
 3542 
 3543 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3544   predicate(UseAVX > 0);
 3545   match(Set dst (NegF src));
 3546   ins_cost(150);
 3547   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3548   ins_encode %{
 3549     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3550                  ExternalAddress(float_signflip()));
 3551   %}
 3552   ins_pipe(pipe_slow);
 3553 %}
 3554 
 3555 instruct negD_reg(regD dst) %{
 3556   predicate(UseAVX == 0);
 3557   match(Set dst (NegD dst));
 3558   ins_cost(150);
 3559   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3560             "# neg double by sign flipping" %}
 3561   ins_encode %{
 3562     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3563   %}
 3564   ins_pipe(pipe_slow);
 3565 %}
 3566 
 3567 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3568   predicate(UseAVX > 0);
 3569   match(Set dst (NegD src));
 3570   ins_cost(150);
 3571   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3572             "# neg double by sign flipping" %}
 3573   ins_encode %{
 3574     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3575                  ExternalAddress(double_signflip()));
 3576   %}
 3577   ins_pipe(pipe_slow);
 3578 %}
 3579 
 3580 // The sqrtss instruction needs the destination register to be pre-initialized for best performance.
 3581 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3582 instruct sqrtF_reg(regF dst) %{
 3583   match(Set dst (SqrtF dst));
 3584   format %{ "sqrtss  $dst, $dst" %}
 3585   ins_encode %{
 3586     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3587   %}
 3588   ins_pipe(pipe_slow);
 3589 %}
 3590 
 3591 // The sqrtsd instruction needs the destination register to be pre-initialized for best performance.
 3592 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3593 instruct sqrtD_reg(regD dst) %{
 3594   match(Set dst (SqrtD dst));
 3595   format %{ "sqrtsd  $dst, $dst" %}
 3596   ins_encode %{
 3597     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3598   %}
 3599   ins_pipe(pipe_slow);
 3600 %}
 3601 
 3602 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3603   effect(TEMP tmp);
 3604   match(Set dst (ConvF2HF src));
 3605   ins_cost(125);
 3606   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP"%}
 3607   ins_encode %{
 3608     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3609   %}
 3610   ins_pipe( pipe_slow );
 3611 %}
 3612 
 3613 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3614   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3615   effect(TEMP ktmp, TEMP rtmp);
 3616   match(Set mem (StoreC mem (ConvF2HF src)));
 3617   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3618   ins_encode %{
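          // Build a one-element write mask so evcvtps2ph stores only the low 16-bit result.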
 3619     __ movl($rtmp$$Register, 0x1);
 3620     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3621     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3622   %}
 3623   ins_pipe( pipe_slow );
 3624 %}
 3625 
 3626 instruct vconvF2HF(vec dst, vec src) %{
 3627   match(Set dst (VectorCastF2HF src));
 3628   format %{ "vector_conv_F2HF $dst $src" %}
 3629   ins_encode %{
 3630     int vlen_enc = vector_length_encoding(this, $src);
 3631     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3632   %}
 3633   ins_pipe( pipe_slow );
 3634 %}
 3635 
 3636 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3637   predicate(n->as_StoreVector()->memory_size() >= 16);
 3638   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3639   format %{ "vcvtps2ph $mem,$src" %}
 3640   ins_encode %{
 3641     int vlen_enc = vector_length_encoding(this, $src);
 3642     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3643   %}
 3644   ins_pipe( pipe_slow );
 3645 %}
 3646 
 3647 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3648   match(Set dst (ConvHF2F src));
 3649   format %{ "vcvtph2ps $dst,$src" %}
 3650   ins_encode %{
 3651     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3652   %}
 3653   ins_pipe( pipe_slow );
 3654 %}
 3655 
 3656 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3657   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3658   format %{ "vcvtph2ps $dst,$mem" %}
 3659   ins_encode %{
 3660     int vlen_enc = vector_length_encoding(this);
 3661     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3662   %}
 3663   ins_pipe( pipe_slow );
 3664 %}
 3665 
 3666 instruct vconvHF2F(vec dst, vec src) %{
 3667   match(Set dst (VectorCastHF2F src));
 3668   ins_cost(125);
 3669   format %{ "vector_conv_HF2F $dst,$src" %}
 3670   ins_encode %{
 3671     int vlen_enc = vector_length_encoding(this);
 3672     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3673   %}
 3674   ins_pipe( pipe_slow );
 3675 %}
 3676 
 3677 // ---------------------------------------- VectorReinterpret ------------------------------------
 3678 instruct reinterpret_mask(kReg dst) %{
 3679   predicate(n->bottom_type()->isa_vectmask() &&
 3680             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3681   match(Set dst (VectorReinterpret dst));
 3682   ins_cost(125);
 3683   format %{ "vector_reinterpret $dst\t!" %}
 3684   ins_encode %{
 3685     // empty
 3686   %}
 3687   ins_pipe( pipe_slow );
 3688 %}
 3689 
 3690 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3691   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3692             n->bottom_type()->isa_vectmask() &&
 3693             n->in(1)->bottom_type()->isa_vectmask() &&
 3694             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3695             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst mask sizes in bytes must match
 3696   match(Set dst (VectorReinterpret src));
 3697   effect(TEMP xtmp);
 3698   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3699   ins_encode %{
 3700      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3701      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3702      assert(src_sz == dst_sz , "src and dst size mismatch");
 3703      int vlen_enc = vector_length_encoding(src_sz);
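           // Expand the word mask into vector lanes, then re-read the lanes as a byte mask.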
 3704      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3705      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3706   %}
 3707   ins_pipe( pipe_slow );
 3708 %}
 3709 
 3710 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3711   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3712             n->bottom_type()->isa_vectmask() &&
 3713             n->in(1)->bottom_type()->isa_vectmask() &&
 3714             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3715              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3716             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst mask sizes in bytes must match
 3717   match(Set dst (VectorReinterpret src));
 3718   effect(TEMP xtmp);
 3719   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3720   ins_encode %{
 3721      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3722      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3723      assert(src_sz == dst_sz , "src and dst size mismatch");
 3724      int vlen_enc = vector_length_encoding(src_sz);
 3725      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3726      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3732   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3733             n->bottom_type()->isa_vectmask() &&
 3734             n->in(1)->bottom_type()->isa_vectmask() &&
 3735             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3736              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3737             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst mask sizes in bytes must match
 3738   match(Set dst (VectorReinterpret src));
 3739   effect(TEMP xtmp);
 3740   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3741   ins_encode %{
 3742      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3743      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3744      assert(src_sz == dst_sz , "src and dst size mismatch");
 3745      int vlen_enc = vector_length_encoding(src_sz);
 3746      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3747      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3748   %}
 3749   ins_pipe( pipe_slow );
 3750 %}
 3751 
 3752 instruct reinterpret(vec dst) %{
 3753   predicate(!n->bottom_type()->isa_vectmask() &&
 3754             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3755   match(Set dst (VectorReinterpret dst));
 3756   ins_cost(125);
 3757   format %{ "vector_reinterpret $dst\t!" %}
 3758   ins_encode %{
 3759     // empty
 3760   %}
 3761   ins_pipe( pipe_slow );
 3762 %}
 3763 
 3764 instruct reinterpret_expand(vec dst, vec src) %{
 3765   predicate(UseAVX == 0 &&
 3766             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3767   match(Set dst (VectorReinterpret src));
 3768   ins_cost(125);
 3769   effect(TEMP dst);
 3770   format %{ "vector_reinterpret_expand $dst,$src" %}
 3771   ins_encode %{
 3772     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3773     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3774 
 3775     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3776     if (src_vlen_in_bytes == 4) {
 3777       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3778     } else {
 3779       assert(src_vlen_in_bytes == 8, "");
 3780       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3781     }
 3782     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3783   %}
 3784   ins_pipe( pipe_slow );
 3785 %}
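
// Worked example for the expand above (illustrative, assuming vector_64_bit_mask() is the
// 128-bit constant with only its low 64 bits set, as its use here implies): expanding an
// 8-byte source into a 16-byte destination must guarantee zeros in the upper half, so
//   movdqu dst, [vector_64_bit_mask]   ; dst = 0x00000000_00000000_FFFFFFFF_FFFFFFFF
//   pand   dst, src                    ; dst = low 8 bytes of src, upper 8 bytes zero
// The 4-byte case uses vector_32_bit_mask() and keeps only the low dword.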
 3786 
 3787 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3788   predicate(UseAVX > 0 &&
 3789             !n->bottom_type()->isa_vectmask() &&
 3790             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3791             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3792   match(Set dst (VectorReinterpret src));
 3793   ins_cost(125);
 3794   format %{ "vector_reinterpret_expand $dst,$src" %}
 3795   ins_encode %{
 3796     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3797   %}
 3798   ins_pipe( pipe_slow );
 3799 %}
 3800 
 3801 
 3802 instruct vreinterpret_expand(legVec dst, vec src) %{
 3803   predicate(UseAVX > 0 &&
 3804             !n->bottom_type()->isa_vectmask() &&
 3805             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3806             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3807   match(Set dst (VectorReinterpret src));
 3808   ins_cost(125);
 3809   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3810   ins_encode %{
 3811     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3812       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3813       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3814       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3815       default: ShouldNotReachHere();
 3816     }
 3817   %}
 3818   ins_pipe( pipe_slow );
 3819 %}
 3820 
 3821 instruct reinterpret_shrink(vec dst, legVec src) %{
 3822   predicate(!n->bottom_type()->isa_vectmask() &&
 3823             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3824   match(Set dst (VectorReinterpret src));
 3825   ins_cost(125);
 3826   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3827   ins_encode %{
 3828     switch (Matcher::vector_length_in_bytes(this)) {
 3829       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3830       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3831       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3832       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3833       default: ShouldNotReachHere();
 3834     }
 3835   %}
 3836   ins_pipe( pipe_slow );
 3837 %}
 3838 
 3839 // ----------------------------------------------------------------------------------------------------
 3840 
 3841 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3842   match(Set dst (RoundDoubleMode src rmode));
 3843   format %{ "roundsd $dst,$src" %}
 3844   ins_cost(150);
 3845   ins_encode %{
 3846     assert(UseSSE >= 4, "required");
 3847     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3848       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3849     }
 3850     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3851   %}
 3852   ins_pipe(pipe_slow);
 3853 %}
 3854 
 3855 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3856   match(Set dst (RoundDoubleMode con rmode));
 3857   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3858   ins_cost(150);
 3859   ins_encode %{
 3860     assert(UseSSE >= 4, "required");
 3861     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3862   %}
 3863   ins_pipe(pipe_slow);
 3864 %}
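
// Note on $rmode (a sketch of the encoding, not a normative reference): the immediate is
// passed straight through to the ROUNDSD rules above and the VROUNDPD/VRNDSCALEPD rules
// below, whose low two bits select the rounding mode: 0 = round to nearest even
// (Math.rint), 1 = round toward negative infinity (Math.floor), 2 = round toward positive
// infinity (Math.ceil), 3 = truncate toward zero. For example, rounding 2.5 with rmode 0
// yields 2.0, while rmode 2 yields 3.0.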
 3865 
 3866 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3867   predicate(Matcher::vector_length(n) < 8);
 3868   match(Set dst (RoundDoubleModeV src rmode));
 3869   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3870   ins_encode %{
 3871     assert(UseAVX > 0, "required");
 3872     int vlen_enc = vector_length_encoding(this);
 3873     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3874   %}
 3875   ins_pipe( pipe_slow );
 3876 %}
 3877 
 3878 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3879   predicate(Matcher::vector_length(n) == 8);
 3880   match(Set dst (RoundDoubleModeV src rmode));
 3881   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3882   ins_encode %{
 3883     assert(UseAVX > 2, "required");
 3884     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3885   %}
 3886   ins_pipe( pipe_slow );
 3887 %}
 3888 
 3889 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3890   predicate(Matcher::vector_length(n) < 8);
 3891   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3892   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3893   ins_encode %{
 3894     assert(UseAVX > 0, "required");
 3895     int vlen_enc = vector_length_encoding(this);
 3896     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3897   %}
 3898   ins_pipe( pipe_slow );
 3899 %}
 3900 
 3901 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3902   predicate(Matcher::vector_length(n) == 8);
 3903   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3904   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3905   ins_encode %{
 3906     assert(UseAVX > 2, "required");
 3907     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3908   %}
 3909   ins_pipe( pipe_slow );
 3910 %}
 3911 
 3912 instruct onspinwait() %{
 3913   match(OnSpinWait);
 3914   ins_cost(200);
 3915 
 3916   format %{
 3917     $$template
 3918     $$emit$$"pause\t! membar_onspinwait"
 3919   %}
 3920   ins_encode %{
 3921     __ pause();
 3922   %}
 3923   ins_pipe(pipe_slow);
 3924 %}
 3925 
 3926 // a * b + c
 3927 instruct fmaD_reg(regD a, regD b, regD c) %{
 3928   match(Set c (FmaD  c (Binary a b)));
 3929   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3930   ins_cost(150);
 3931   ins_encode %{
 3932     assert(UseFMA, "Needs FMA instructions support.");
 3933     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3934   %}
 3935   ins_pipe( pipe_slow );
 3936 %}
 3937 
 3938 // a * b + c
 3939 instruct fmaF_reg(regF a, regF b, regF c) %{
 3940   match(Set c (FmaF  c (Binary a b)));
 3941   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3942   ins_cost(150);
 3943   ins_encode %{
 3944     assert(UseFMA, "Needs FMA instructions support.");
 3945     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3946   %}
 3947   ins_pipe( pipe_slow );
 3948 %}
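
// The FmaD/FmaF rules above back the Math.fma intrinsics; the point of a fused form is
// that a * b + c is evaluated with a single rounding step. Illustrative note: with
// doubles a = b = 2^27 + 1 and c = -(2^54 + 2^28), fma(a, b, c) == 1.0, whereas a * b + c
// computed with a separate multiply and add rounds the product first and yields 0.0.
// (The exact VFMADD variant emitted is left to MacroAssembler::fmad/fmaf.)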
 3949 
 3950 // ====================VECTOR INSTRUCTIONS=====================================
 3951 
 3952 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3953 instruct MoveVec2Leg(legVec dst, vec src) %{
 3954   match(Set dst src);
 3955   format %{ "" %}
 3956   ins_encode %{
 3957     ShouldNotReachHere();
 3958   %}
 3959   ins_pipe( fpu_reg_reg );
 3960 %}
 3961 
 3962 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3963   match(Set dst src);
 3964   format %{ "" %}
 3965   ins_encode %{
 3966     ShouldNotReachHere();
 3967   %}
 3968   ins_pipe( fpu_reg_reg );
 3969 %}
 3970 
 3971 // ============================================================================
 3972 
// Generic vector load operand pattern
 3974 instruct loadV(vec dst, memory mem) %{
 3975   match(Set dst (LoadVector mem));
 3976   ins_cost(125);
 3977   format %{ "load_vector $dst,$mem" %}
 3978   ins_encode %{
 3979     BasicType bt = Matcher::vector_element_basic_type(this);
 3980     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3981   %}
 3982   ins_pipe( pipe_slow );
 3983 %}
 3984 
// Generic vector store operand pattern.
 3986 instruct storeV(memory mem, vec src) %{
 3987   match(Set mem (StoreVector mem src));
 3988   ins_cost(145);
  format %{ "store_vector $mem,$src" %}
 3990   ins_encode %{
 3991     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3992       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3993       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3994       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3995       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3996       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3997       default: ShouldNotReachHere();
 3998     }
 3999   %}
 4000   ins_pipe( pipe_slow );
 4001 %}
 4002 
 4003 // ---------------------------------------- Gather ------------------------------------
 4004 
 4005 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 4006 
 4007 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4008   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 4009             Matcher::vector_length_in_bytes(n) <= 32);
 4010   match(Set dst (LoadVectorGather mem idx));
 4011   effect(TEMP dst, TEMP tmp, TEMP mask);
 4012   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4013   ins_encode %{
 4014     int vlen_enc = vector_length_encoding(this);
 4015     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4016     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4017     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4018     __ lea($tmp$$Register, $mem$$Address);
 4019     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4020   %}
 4021   ins_pipe( pipe_slow );
 4022 %}
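
// Sketch of the AVX2 gather used above (illustrative per-element behaviour of the
// vpgatherdd/vgatherdps family, not generated code):
//   for (i = 0; i < lanes; i++)
//     if (mask[i] has its sign bit set) { dst[i] = mem[tmp + idx[i]*scale]; mask[i] = 0; }
// This rule gathers every lane, so $mask is simply filled with all-ones via vpcmpeqd.
// Both $dst and $mask are TEMPs because the instruction destroys the mask as it completes
// elements and requires the destination, index and mask registers to be distinct.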
 4023 
 4024 
 4025 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4026   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4027             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4028   match(Set dst (LoadVectorGather mem idx));
 4029   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4031   ins_encode %{
 4032     int vlen_enc = vector_length_encoding(this);
 4033     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4034     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4035     __ lea($tmp$$Register, $mem$$Address);
 4036     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4037   %}
 4038   ins_pipe( pipe_slow );
 4039 %}
 4040 
 4041 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4042   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4043             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4044   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4045   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4047   ins_encode %{
 4048     assert(UseAVX > 2, "sanity");
 4049     int vlen_enc = vector_length_encoding(this);
 4050     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4051     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary register
    // (see the sketch after this rule).
 4054     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4055     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4056     __ lea($tmp$$Register, $mem$$Address);
 4057     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4058   %}
 4059   ins_pipe( pipe_slow );
 4060 %}
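
// A sketch of why the masked gather above needs both temporaries (illustrative pseudo
// behaviour of the EVEX vpgatherd*/vpgatherq* forms with {k} predication):
//   for (i = 0; i < lanes; i++)
//     if (k[i]) { dst[i] = mem[tmp + idx[i]*scale]; k[i] = 0; }   // the opmask is consumed
//     else      { /* lane merges, i.e. dst[i] is left unchanged */ }
// Because the hardware clears k as elements complete, the incoming $mask is copied into
// $ktmp first, and because masked-off lanes merge rather than zero, $dst is pre-zeroed so
// the node's "masked-off lanes are zero" contract still holds.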
 4061 
 4062 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{
 4063   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4064   match(Set dst (LoadVectorGather mem idx_base));
 4065   effect(TEMP tmp, TEMP rtmp);
 4066   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4067   ins_encode %{
 4068     int vlen_enc = vector_length_encoding(this);
 4069     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc);
 4072   %}
 4073   ins_pipe( pipe_slow );
 4074 %}
 4075 
 4076 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp,
 4077                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4078   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4079   match(Set dst (LoadVectorGather mem idx_base));
 4080   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4081   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4082   ins_encode %{
 4083     int vlen_enc = vector_length_encoding(this);
 4084     int vector_len = Matcher::vector_length(this);
 4085     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4086     __ lea($tmp$$Register, $mem$$Address);
 4087     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4088     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister,
 4089                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4090   %}
 4091   ins_pipe( pipe_slow );
 4092 %}
 4093 
 4094 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4095   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4096   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4097   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4098   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4099   ins_encode %{
 4100     int vlen_enc = vector_length_encoding(this);
 4101     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4102     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4103     __ lea($tmp$$Register, $mem$$Address);
 4104     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4105     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4106   %}
 4107   ins_pipe( pipe_slow );
 4108 %}
 4109 
 4110 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4111                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4112   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4113   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4114   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4115   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4116   ins_encode %{
 4117     int vlen_enc = vector_length_encoding(this);
 4118     int vector_len = Matcher::vector_length(this);
 4119     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4120     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4121     __ lea($tmp$$Register, $mem$$Address);
 4122     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4123     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4124     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4125                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4126   %}
 4127   ins_pipe( pipe_slow );
 4128 %}
 4129 
 4130 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4131   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4132   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4133   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4134   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4135   ins_encode %{
 4136     int vlen_enc = vector_length_encoding(this);
 4137     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4138     __ lea($tmp$$Register, $mem$$Address);
 4139     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4140     if (elem_bt == T_SHORT) {
 4141       __ movl($mask_idx$$Register, 0x55555555);
 4142       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4143     }
 4144     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4145     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4146   %}
 4147   ins_pipe( pipe_slow );
 4148 %}
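
// Worked example of the T_SHORT mask fix-up above (and in the GT8B rule below):
// vpmovmskb yields one bit per byte, so each short lane contributes two identical bits.
// pext with the pattern 0x55555555 keeps every even-numbered bit, compressing the result
// back to one bit per lane. E.g. for four short lanes with mask {1, 0, 1, 1}:
//   vpmovmskb        -> 0b11110011   (bytes 0-1, 4-5 and 6-7 set)
//   pext with 0x55   -> 0b1101       (lanes 0, 2 and 3 set)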
 4149 
 4150 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp,
 4151                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4152   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4153   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4154   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4155   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4156   ins_encode %{
 4157     int vlen_enc = vector_length_encoding(this);
 4158     int vector_len = Matcher::vector_length(this);
 4159     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4160     __ lea($tmp$$Register, $mem$$Address);
 4161     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4162     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4163     if (elem_bt == T_SHORT) {
 4164       __ movl($mask_idx$$Register, 0x55555555);
 4165       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4166     }
 4167     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4168     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4169                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4170   %}
 4171   ins_pipe( pipe_slow );
 4172 %}
 4173 
 4174 // ====================Scatter=======================================
 4175 
 4176 // Scatter INT, LONG, FLOAT, DOUBLE
 4177 
 4178 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4179   predicate(UseAVX > 2);
 4180   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4181   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4183   ins_encode %{
 4184     int vlen_enc = vector_length_encoding(this, $src);
 4185     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4186 
 4187     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4188     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4189 
 4190     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4191     __ lea($tmp$$Register, $mem$$Address);
 4192     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4193   %}
 4194   ins_pipe( pipe_slow );
 4195 %}
 4196 
 4197 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4198   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4199   effect(TEMP tmp, TEMP ktmp);
 4200   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4201   ins_encode %{
 4202     int vlen_enc = vector_length_encoding(this, $src);
 4203     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4204     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4205     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is first copied to a temporary register.
 4208     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4209     __ lea($tmp$$Register, $mem$$Address);
 4210     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4211   %}
 4212   ins_pipe( pipe_slow );
 4213 %}
 4214 
 4215 // ====================REPLICATE=======================================
 4216 
// Replicate byte scalar into a vector
 4218 instruct vReplB_reg(vec dst, rRegI src) %{
 4219   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4220   match(Set dst (Replicate src));
 4221   format %{ "replicateB $dst,$src" %}
 4222   ins_encode %{
 4223     uint vlen = Matcher::vector_length(this);
 4224     if (UseAVX >= 2) {
 4225       int vlen_enc = vector_length_encoding(this);
 4226       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4227         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4228         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4229       } else {
 4230         __ movdl($dst$$XMMRegister, $src$$Register);
 4231         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4232       }
 4233     } else {
      assert(UseAVX < 2, "");
 4235       __ movdl($dst$$XMMRegister, $src$$Register);
 4236       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4237       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4238       if (vlen >= 16) {
 4239         assert(vlen == 16, "");
 4240         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4241       }
 4242     }
 4243   %}
 4244   ins_pipe( pipe_slow );
 4245 %}
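
// Worked example of the pre-AVX2 fallback above: broadcasting byte value b with plain SSE
// takes a cascade of duplications (illustrative register contents, not generated code):
//   movdl      xmm, r        ; byte 0 = b (the upper bytes of the dword do not matter)
//   punpcklbw  xmm, xmm      ; word 0 = (b, b)
//   pshuflw    xmm, xmm, 0   ; bytes 0-7 = b
//   punpcklqdq xmm, xmm      ; bytes 0-15 = b (only emitted for 16-byte vectors)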
 4246 
 4247 instruct ReplB_mem(vec dst, memory mem) %{
 4248   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4249   match(Set dst (Replicate (LoadB mem)));
 4250   format %{ "replicateB $dst,$mem" %}
 4251   ins_encode %{
 4252     int vlen_enc = vector_length_encoding(this);
 4253     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4254   %}
 4255   ins_pipe( pipe_slow );
 4256 %}
 4257 
 4258 // ====================ReplicateS=======================================
 4259 
 4260 instruct vReplS_reg(vec dst, rRegI src) %{
 4261   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4262   match(Set dst (Replicate src));
 4263   format %{ "replicateS $dst,$src" %}
 4264   ins_encode %{
 4265     uint vlen = Matcher::vector_length(this);
 4266     int vlen_enc = vector_length_encoding(this);
 4267     if (UseAVX >= 2) {
 4268       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4269         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4270         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4271       } else {
 4272         __ movdl($dst$$XMMRegister, $src$$Register);
 4273         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4274       }
 4275     } else {
 4276       assert(UseAVX < 2, "");
 4277       __ movdl($dst$$XMMRegister, $src$$Register);
 4278       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4279       if (vlen >= 8) {
 4280         assert(vlen == 8, "");
 4281         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4282       }
 4283     }
 4284   %}
 4285   ins_pipe( pipe_slow );
 4286 %}
 4287 
 4288 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4289   match(Set dst (Replicate con));
 4290   effect(TEMP rtmp);
 4291   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4292   ins_encode %{
 4293     int vlen_enc = vector_length_encoding(this);
 4294     BasicType bt = Matcher::vector_element_basic_type(this);
 4295     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4296     __ movl($rtmp$$Register, $con$$constant);
 4297     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4298   %}
 4299   ins_pipe( pipe_slow );
 4300 %}
 4301 
 4302 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4303   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4304   match(Set dst (Replicate src));
 4305   effect(TEMP rtmp);
 4306   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4307   ins_encode %{
 4308     int vlen_enc = vector_length_encoding(this);
 4309     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4310     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct ReplS_mem(vec dst, memory mem) %{
 4316   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4317   match(Set dst (Replicate (LoadS mem)));
 4318   format %{ "replicateS $dst,$mem" %}
 4319   ins_encode %{
 4320     int vlen_enc = vector_length_encoding(this);
 4321     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4322   %}
 4323   ins_pipe( pipe_slow );
 4324 %}
 4325 
 4326 // ====================ReplicateI=======================================
 4327 
 4328 instruct ReplI_reg(vec dst, rRegI src) %{
 4329   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4330   match(Set dst (Replicate src));
 4331   format %{ "replicateI $dst,$src" %}
 4332   ins_encode %{
 4333     uint vlen = Matcher::vector_length(this);
 4334     int vlen_enc = vector_length_encoding(this);
 4335     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4336       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4337     } else if (VM_Version::supports_avx2()) {
 4338       __ movdl($dst$$XMMRegister, $src$$Register);
 4339       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4340     } else {
 4341       __ movdl($dst$$XMMRegister, $src$$Register);
 4342       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4343     }
 4344   %}
 4345   ins_pipe( pipe_slow );
 4346 %}
 4347 
 4348 instruct ReplI_mem(vec dst, memory mem) %{
 4349   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4350   match(Set dst (Replicate (LoadI mem)));
 4351   format %{ "replicateI $dst,$mem" %}
 4352   ins_encode %{
 4353     int vlen_enc = vector_length_encoding(this);
 4354     if (VM_Version::supports_avx2()) {
 4355       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4356     } else if (VM_Version::supports_avx()) {
 4357       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4358     } else {
 4359       __ movdl($dst$$XMMRegister, $mem$$Address);
 4360       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4361     }
 4362   %}
 4363   ins_pipe( pipe_slow );
 4364 %}
 4365 
 4366 instruct ReplI_imm(vec dst, immI con) %{
 4367   predicate(Matcher::is_non_long_integral_vector(n));
 4368   match(Set dst (Replicate con));
 4369   format %{ "replicateI $dst,$con" %}
 4370   ins_encode %{
 4371     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4372                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4373                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4374     BasicType bt = Matcher::vector_element_basic_type(this);
 4375     int vlen = Matcher::vector_length_in_bytes(this);
 4376     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4377   %}
 4378   ins_pipe( pipe_slow );
 4379 %}
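
// Worked example of the replication count passed to vreplicate_imm above (a sketch of the
// intent, inferred from the cpu-feature split): the constant table only needs the
// smallest chunk that the constant-vector load can broadcast from memory - 4 bytes with
// AVX, 8 bytes with SSE3 (movddup), otherwise a full 16 bytes. So a T_BYTE immediate on
// an AVX machine materializes 4 / 1 = 4 copies, while a T_INT immediate without SSE3
// materializes 16 / 4 = 4 copies filling an entire 128-bit table entry.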
 4380 
// Replicate scalar zero into a vector
 4382 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4383   predicate(Matcher::is_non_long_integral_vector(n));
 4384   match(Set dst (Replicate zero));
 4385   format %{ "replicateI $dst,$zero" %}
 4386   ins_encode %{
 4387     int vlen_enc = vector_length_encoding(this);
 4388     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4389       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4390     } else {
 4391       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4392     }
 4393   %}
 4394   ins_pipe( fpu_reg_reg );
 4395 %}
 4396 
 4397 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4398   predicate(Matcher::is_non_long_integral_vector(n));
 4399   match(Set dst (Replicate con));
 4400   format %{ "vallones $dst" %}
 4401   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vlen_enc);
 4404   %}
 4405   ins_pipe( pipe_slow );
 4406 %}
 4407 
 4408 // ====================ReplicateL=======================================
 4409 
// Replicate long (8 byte) scalar into a vector
 4411 instruct ReplL_reg(vec dst, rRegL src) %{
 4412   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4413   match(Set dst (Replicate src));
 4414   format %{ "replicateL $dst,$src" %}
 4415   ins_encode %{
 4416     int vlen = Matcher::vector_length(this);
 4417     int vlen_enc = vector_length_encoding(this);
 4418     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4419       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4420     } else if (VM_Version::supports_avx2()) {
 4421       __ movdq($dst$$XMMRegister, $src$$Register);
 4422       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4423     } else {
 4424       __ movdq($dst$$XMMRegister, $src$$Register);
 4425       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4426     }
 4427   %}
 4428   ins_pipe( pipe_slow );
 4429 %}
 4430 
 4431 instruct ReplL_mem(vec dst, memory mem) %{
 4432   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4433   match(Set dst (Replicate (LoadL mem)));
 4434   format %{ "replicateL $dst,$mem" %}
 4435   ins_encode %{
 4436     int vlen_enc = vector_length_encoding(this);
 4437     if (VM_Version::supports_avx2()) {
 4438       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4439     } else if (VM_Version::supports_sse3()) {
 4440       __ movddup($dst$$XMMRegister, $mem$$Address);
 4441     } else {
 4442       __ movq($dst$$XMMRegister, $mem$$Address);
 4443       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4444     }
 4445   %}
 4446   ins_pipe( pipe_slow );
 4447 %}
 4448 
// Replicate long (8 byte) scalar immediate into a vector by loading it from the constant table.
 4450 instruct ReplL_imm(vec dst, immL con) %{
 4451   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4452   match(Set dst (Replicate con));
 4453   format %{ "replicateL $dst,$con" %}
 4454   ins_encode %{
 4455     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4456     int vlen = Matcher::vector_length_in_bytes(this);
 4457     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4458   %}
 4459   ins_pipe( pipe_slow );
 4460 %}
 4461 
 4462 instruct ReplL_zero(vec dst, immL0 zero) %{
 4463   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4464   match(Set dst (Replicate zero));
 4465   format %{ "replicateL $dst,$zero" %}
 4466   ins_encode %{
 4467     int vlen_enc = vector_length_encoding(this);
 4468     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4469       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4470     } else {
 4471       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4472     }
 4473   %}
 4474   ins_pipe( fpu_reg_reg );
 4475 %}
 4476 
 4477 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4478   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4479   match(Set dst (Replicate con));
 4480   format %{ "vallones $dst" %}
 4481   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    __ vallones($dst$$XMMRegister, vlen_enc);
 4484   %}
 4485   ins_pipe( pipe_slow );
 4486 %}
 4487 
 4488 // ====================ReplicateF=======================================
 4489 
 4490 instruct vReplF_reg(vec dst, vlRegF src) %{
 4491   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4492   match(Set dst (Replicate src));
 4493   format %{ "replicateF $dst,$src" %}
 4494   ins_encode %{
 4495     uint vlen = Matcher::vector_length(this);
 4496     int vlen_enc = vector_length_encoding(this);
 4497     if (vlen <= 4) {
 4498       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4499     } else if (VM_Version::supports_avx2()) {
 4500       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4501     } else {
 4502       assert(vlen == 8, "sanity");
 4503       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4504       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4505     }
 4506   %}
 4507   ins_pipe( pipe_slow );
 4508 %}
 4509 
 4510 instruct ReplF_reg(vec dst, vlRegF src) %{
 4511   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4512   match(Set dst (Replicate src));
 4513   format %{ "replicateF $dst,$src" %}
 4514   ins_encode %{
 4515     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4516   %}
 4517   ins_pipe( pipe_slow );
 4518 %}
 4519 
 4520 instruct ReplF_mem(vec dst, memory mem) %{
 4521   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4522   match(Set dst (Replicate (LoadF mem)));
 4523   format %{ "replicateF $dst,$mem" %}
 4524   ins_encode %{
 4525     int vlen_enc = vector_length_encoding(this);
 4526     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4527   %}
 4528   ins_pipe( pipe_slow );
 4529 %}
 4530 
// Replicate float scalar immediate into a vector by loading it from the constant table.
 4532 instruct ReplF_imm(vec dst, immF con) %{
 4533   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4534   match(Set dst (Replicate con));
 4535   format %{ "replicateF $dst,$con" %}
 4536   ins_encode %{
 4537     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4538                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4539     int vlen = Matcher::vector_length_in_bytes(this);
 4540     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4541   %}
 4542   ins_pipe( pipe_slow );
 4543 %}
 4544 
 4545 instruct ReplF_zero(vec dst, immF0 zero) %{
 4546   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4547   match(Set dst (Replicate zero));
 4548   format %{ "replicateF $dst,$zero" %}
 4549   ins_encode %{
 4550     int vlen_enc = vector_length_encoding(this);
 4551     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4552       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4553     } else {
 4554       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4555     }
 4556   %}
 4557   ins_pipe( fpu_reg_reg );
 4558 %}
 4559 
 4560 // ====================ReplicateD=======================================
 4561 
// Replicate double (8 byte) scalar into a vector
 4563 instruct vReplD_reg(vec dst, vlRegD src) %{
 4564   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4565   match(Set dst (Replicate src));
 4566   format %{ "replicateD $dst,$src" %}
 4567   ins_encode %{
 4568     uint vlen = Matcher::vector_length(this);
 4569     int vlen_enc = vector_length_encoding(this);
 4570     if (vlen <= 2) {
 4571       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4572     } else if (VM_Version::supports_avx2()) {
 4573       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4574     } else {
 4575       assert(vlen == 4, "sanity");
 4576       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4577       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4578     }
 4579   %}
 4580   ins_pipe( pipe_slow );
 4581 %}
 4582 
 4583 instruct ReplD_reg(vec dst, vlRegD src) %{
 4584   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4585   match(Set dst (Replicate src));
 4586   format %{ "replicateD $dst,$src" %}
 4587   ins_encode %{
 4588     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4589   %}
 4590   ins_pipe( pipe_slow );
 4591 %}
 4592 
 4593 instruct ReplD_mem(vec dst, memory mem) %{
 4594   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4595   match(Set dst (Replicate (LoadD mem)));
 4596   format %{ "replicateD $dst,$mem" %}
 4597   ins_encode %{
 4598     if (Matcher::vector_length(this) >= 4) {
 4599       int vlen_enc = vector_length_encoding(this);
 4600       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4601     } else {
 4602       __ movddup($dst$$XMMRegister, $mem$$Address);
 4603     }
 4604   %}
 4605   ins_pipe( pipe_slow );
 4606 %}
 4607 
// Replicate double (8 byte) scalar immediate into a vector by loading it from the constant table.
 4609 instruct ReplD_imm(vec dst, immD con) %{
 4610   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4611   match(Set dst (Replicate con));
 4612   format %{ "replicateD $dst,$con" %}
 4613   ins_encode %{
 4614     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4615     int vlen = Matcher::vector_length_in_bytes(this);
 4616     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4617   %}
 4618   ins_pipe( pipe_slow );
 4619 %}
 4620 
 4621 instruct ReplD_zero(vec dst, immD0 zero) %{
 4622   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4623   match(Set dst (Replicate zero));
 4624   format %{ "replicateD $dst,$zero" %}
 4625   ins_encode %{
 4626     int vlen_enc = vector_length_encoding(this);
 4627     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4628       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4629     } else {
 4630       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4631     }
 4632   %}
 4633   ins_pipe( fpu_reg_reg );
 4634 %}
 4635 
 4636 // ====================VECTOR INSERT=======================================
 4637 
 4638 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4639   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4640   match(Set dst (VectorInsert (Binary dst val) idx));
 4641   format %{ "vector_insert $dst,$val,$idx" %}
 4642   ins_encode %{
 4643     assert(UseSSE >= 4, "required");
 4644     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4645 
 4646     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4647 
 4648     assert(is_integral_type(elem_bt), "");
 4649     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4650 
 4651     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4652   %}
 4653   ins_pipe( pipe_slow );
 4654 %}
 4655 
 4656 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4657   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4658   match(Set dst (VectorInsert (Binary src val) idx));
 4659   effect(TEMP vtmp);
 4660   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4661   ins_encode %{
 4662     int vlen_enc = Assembler::AVX_256bit;
 4663     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4664     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4665     int log2epr = log2(elem_per_lane);
 4666 
 4667     assert(is_integral_type(elem_bt), "sanity");
 4668     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4669 
 4670     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4671     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4672     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4673     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4674     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4675   %}
 4676   ins_pipe( pipe_slow );
 4677 %}
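
// Worked example of the lane arithmetic above (T_SHORT elements in a 32-byte vector, so
// elem_per_lane = 16 / 2 = 8 and log2epr = 3): inserting at idx = 11 gives
//   x_idx = 11 & 0b111    = 3   // element position within the 128-bit lane
//   y_idx = (11 >> 3) & 1 = 1   // which 128-bit lane of the YMM register
// so lane 1 is extracted into $vtmp, short element 3 of it is overwritten with $val, and
// the patched lane is inserted back into $dst.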
 4678 
 4679 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4680   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4681   match(Set dst (VectorInsert (Binary src val) idx));
 4682   effect(TEMP vtmp);
 4683   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4684   ins_encode %{
 4685     assert(UseAVX > 2, "sanity");
 4686 
 4687     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4688     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4689     int log2epr = log2(elem_per_lane);
 4690 
 4691     assert(is_integral_type(elem_bt), "");
 4692     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4693 
 4694     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4695     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4696     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4697     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4698     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4699   %}
 4700   ins_pipe( pipe_slow );
 4701 %}
 4702 
 4703 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4704   predicate(Matcher::vector_length(n) == 2);
 4705   match(Set dst (VectorInsert (Binary dst val) idx));
 4706   format %{ "vector_insert $dst,$val,$idx" %}
 4707   ins_encode %{
 4708     assert(UseSSE >= 4, "required");
 4709     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4710     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4711 
 4712     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4713   %}
 4714   ins_pipe( pipe_slow );
 4715 %}
 4716 
 4717 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4718   predicate(Matcher::vector_length(n) == 4);
 4719   match(Set dst (VectorInsert (Binary src val) idx));
 4720   effect(TEMP vtmp);
 4721   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4722   ins_encode %{
 4723     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4724     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4725 
 4726     uint x_idx = $idx$$constant & right_n_bits(1);
 4727     uint y_idx = ($idx$$constant >> 1) & 1;
 4728     int vlen_enc = Assembler::AVX_256bit;
 4729     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4730     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4731     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4732   %}
 4733   ins_pipe( pipe_slow );
 4734 %}
 4735 
 4736 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4737   predicate(Matcher::vector_length(n) == 8);
 4738   match(Set dst (VectorInsert (Binary src val) idx));
 4739   effect(TEMP vtmp);
 4740   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4741   ins_encode %{
 4742     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4743     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4744 
 4745     uint x_idx = $idx$$constant & right_n_bits(1);
 4746     uint y_idx = ($idx$$constant >> 1) & 3;
 4747     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4748     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4749     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4750   %}
 4751   ins_pipe( pipe_slow );
 4752 %}
 4753 
 4754 instruct insertF(vec dst, regF val, immU8 idx) %{
 4755   predicate(Matcher::vector_length(n) < 8);
 4756   match(Set dst (VectorInsert (Binary dst val) idx));
 4757   format %{ "vector_insert $dst,$val,$idx" %}
 4758   ins_encode %{
 4759     assert(UseSSE >= 4, "sanity");
 4760 
 4761     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4762     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4763 
 4764     uint x_idx = $idx$$constant & right_n_bits(2);
 4765     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4766   %}
 4767   ins_pipe( pipe_slow );
 4768 %}
 4769 
 4770 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4771   predicate(Matcher::vector_length(n) >= 8);
 4772   match(Set dst (VectorInsert (Binary src val) idx));
 4773   effect(TEMP vtmp);
 4774   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4775   ins_encode %{
 4776     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4777     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4778 
 4779     int vlen = Matcher::vector_length(this);
 4780     uint x_idx = $idx$$constant & right_n_bits(2);
 4781     if (vlen == 8) {
 4782       uint y_idx = ($idx$$constant >> 2) & 1;
 4783       int vlen_enc = Assembler::AVX_256bit;
 4784       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4785       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4786       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4787     } else {
 4788       assert(vlen == 16, "sanity");
 4789       uint y_idx = ($idx$$constant >> 2) & 3;
 4790       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4791       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4792       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4793     }
 4794   %}
 4795   ins_pipe( pipe_slow );
 4796 %}
 4797 
 4798 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4799   predicate(Matcher::vector_length(n) == 2);
 4800   match(Set dst (VectorInsert (Binary dst val) idx));
 4801   effect(TEMP tmp);
 4802   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4803   ins_encode %{
 4804     assert(UseSSE >= 4, "sanity");
 4805     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4806     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4807 
 4808     __ movq($tmp$$Register, $val$$XMMRegister);
 4809     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4810   %}
 4811   ins_pipe( pipe_slow );
 4812 %}
 4813 
 4814 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4815   predicate(Matcher::vector_length(n) == 4);
 4816   match(Set dst (VectorInsert (Binary src val) idx));
 4817   effect(TEMP vtmp, TEMP tmp);
 4818   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4819   ins_encode %{
 4820     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4821     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4822 
 4823     uint x_idx = $idx$$constant & right_n_bits(1);
 4824     uint y_idx = ($idx$$constant >> 1) & 1;
 4825     int vlen_enc = Assembler::AVX_256bit;
 4826     __ movq($tmp$$Register, $val$$XMMRegister);
 4827     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4828     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4829     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4830   %}
 4831   ins_pipe( pipe_slow );
 4832 %}
 4833 
 4834 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4835   predicate(Matcher::vector_length(n) == 8);
 4836   match(Set dst (VectorInsert (Binary src val) idx));
 4837   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4839   ins_encode %{
 4840     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4841     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4842 
 4843     uint x_idx = $idx$$constant & right_n_bits(1);
 4844     uint y_idx = ($idx$$constant >> 1) & 3;
 4845     __ movq($tmp$$Register, $val$$XMMRegister);
 4846     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4847     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4848     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4849   %}
 4850   ins_pipe( pipe_slow );
 4851 %}
 4852 
 4853 // ====================REDUCTION ARITHMETIC=======================================
 4854 
 4855 // =======================Int Reduction==========================================
 4856 
 4857 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4858   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4859   match(Set dst (AddReductionVI src1 src2));
 4860   match(Set dst (MulReductionVI src1 src2));
 4861   match(Set dst (AndReductionV  src1 src2));
 4862   match(Set dst ( OrReductionV  src1 src2));
 4863   match(Set dst (XorReductionV  src1 src2));
 4864   match(Set dst (MinReductionV  src1 src2));
 4865   match(Set dst (MaxReductionV  src1 src2));
 4866   effect(TEMP vtmp1, TEMP vtmp2);
 4867   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4868   ins_encode %{
 4869     int opcode = this->ideal_Opcode();
 4870     int vlen = Matcher::vector_length(this, $src2);
 4871     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4872   %}
 4873   ins_pipe( pipe_slow );
 4874 %}
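
// A small worked example of the int reduction contract above (illustrative): src1 is the
// scalar accumulator, so for an AddReductionVI over a 4-lane vector
//   dst = src1 + (src2[0] + src2[1] + src2[2] + src2[3])
// and the same shape holds for mul/and/or/xor/min/max with the respective operator;
// reduceI performs the lane folding with shuffle-and-combine steps in the temporaries.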
 4875 
 4876 // =======================Long Reduction==========================================
 4877 
 4878 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4879   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4880   match(Set dst (AddReductionVL src1 src2));
 4881   match(Set dst (MulReductionVL src1 src2));
 4882   match(Set dst (AndReductionV  src1 src2));
 4883   match(Set dst ( OrReductionV  src1 src2));
 4884   match(Set dst (XorReductionV  src1 src2));
 4885   match(Set dst (MinReductionV  src1 src2));
 4886   match(Set dst (MaxReductionV  src1 src2));
 4887   effect(TEMP vtmp1, TEMP vtmp2);
 4888   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4889   ins_encode %{
 4890     int opcode = this->ideal_Opcode();
 4891     int vlen = Matcher::vector_length(this, $src2);
 4892     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4893   %}
 4894   ins_pipe( pipe_slow );
 4895 %}
 4896 
 4897 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4898   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4899   match(Set dst (AddReductionVL src1 src2));
 4900   match(Set dst (MulReductionVL src1 src2));
 4901   match(Set dst (AndReductionV  src1 src2));
 4902   match(Set dst ( OrReductionV  src1 src2));
 4903   match(Set dst (XorReductionV  src1 src2));
 4904   match(Set dst (MinReductionV  src1 src2));
 4905   match(Set dst (MaxReductionV  src1 src2));
 4906   effect(TEMP vtmp1, TEMP vtmp2);
 4907   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4908   ins_encode %{
 4909     int opcode = this->ideal_Opcode();
 4910     int vlen = Matcher::vector_length(this, $src2);
 4911     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4912   %}
 4913   ins_pipe( pipe_slow );
 4914 %}
 4915 
 4916 // =======================Float Reduction==========================================
 4917 
 4918 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4919   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 4920   match(Set dst (AddReductionVF dst src));
 4921   match(Set dst (MulReductionVF dst src));
 4922   effect(TEMP dst, TEMP vtmp);
 4923   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4924   ins_encode %{
 4925     int opcode = this->ideal_Opcode();
 4926     int vlen = Matcher::vector_length(this, $src);
 4927     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4928   %}
 4929   ins_pipe( pipe_slow );
 4930 %}
 4931 
 4932 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4933   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 4934   match(Set dst (AddReductionVF dst src));
 4935   match(Set dst (MulReductionVF dst src));
 4936   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4937   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4938   ins_encode %{
 4939     int opcode = this->ideal_Opcode();
 4940     int vlen = Matcher::vector_length(this, $src);
 4941     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4942   %}
 4943   ins_pipe( pipe_slow );
 4944 %}
 4945 
 4946 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4947   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 4948   match(Set dst (AddReductionVF dst src));
 4949   match(Set dst (MulReductionVF dst src));
 4950   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4951   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4952   ins_encode %{
 4953     int opcode = this->ideal_Opcode();
 4954     int vlen = Matcher::vector_length(this, $src);
 4955     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4956   %}
 4957   ins_pipe( pipe_slow );
 4958 %}
 4959 
 4960 
 4961 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 4962   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4963   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4964   // src1 contains reduction identity
 4965   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 4966   match(Set dst (AddReductionVF src1 src2));
 4967   match(Set dst (MulReductionVF src1 src2));
 4968   effect(TEMP dst);
 4969   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 4970   ins_encode %{
 4971     int opcode = this->ideal_Opcode();
 4972     int vlen = Matcher::vector_length(this, $src2);
 4973     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 4974   %}
 4975   ins_pipe( pipe_slow );
 4976 %}
 4977 
 4978 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 4979   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4980   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4981   // src1 contains reduction identity
 4982   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 4983   match(Set dst (AddReductionVF src1 src2));
 4984   match(Set dst (MulReductionVF src1 src2));
 4985   effect(TEMP dst, TEMP vtmp);
 4986   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 4987   ins_encode %{
 4988     int opcode = this->ideal_Opcode();
 4989     int vlen = Matcher::vector_length(this, $src2);
 4990     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 4991   %}
 4992   ins_pipe( pipe_slow );
 4993 %}
 4994 
 4995 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 4996   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4997   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4998   // src1 contains reduction identity
 4999   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5000   match(Set dst (AddReductionVF src1 src2));
 5001   match(Set dst (MulReductionVF src1 src2));
 5002   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5003   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5004   ins_encode %{
 5005     int opcode = this->ideal_Opcode();
 5006     int vlen = Matcher::vector_length(this, $src2);
 5007     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5008   %}
 5009   ins_pipe( pipe_slow );
 5010 %}
 5011 
 5012 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5013   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 5014   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5015   // src1 contains reduction identity
 5016   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 5017   match(Set dst (AddReductionVF src1 src2));
 5018   match(Set dst (MulReductionVF src1 src2));
 5019   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5020   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5021   ins_encode %{
 5022     int opcode = this->ideal_Opcode();
 5023     int vlen = Matcher::vector_length(this, $src2);
 5024     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5025   %}
 5026   ins_pipe( pipe_slow );
 5027 %}
 5028 
 5029 // =======================Double Reduction==========================================
 5030 
 5031 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5032   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5033   match(Set dst (AddReductionVD dst src));
 5034   match(Set dst (MulReductionVD dst src));
 5035   effect(TEMP dst, TEMP vtmp);
 5036   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5037   ins_encode %{
 5038     int opcode = this->ideal_Opcode();
 5039     int vlen = Matcher::vector_length(this, $src);
 5040     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5041   %}
 5042   ins_pipe( pipe_slow );
 5043 %}
 5044 
 5045 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5046   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5047   match(Set dst (AddReductionVD dst src));
 5048   match(Set dst (MulReductionVD dst src));
 5049   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5050   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5051   ins_encode %{
 5052     int opcode = this->ideal_Opcode();
 5053     int vlen = Matcher::vector_length(this, $src);
 5054     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5055   %}
 5056   ins_pipe( pipe_slow );
 5057 %}
 5058 
 5059 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5060   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5061   match(Set dst (AddReductionVD dst src));
 5062   match(Set dst (MulReductionVD dst src));
 5063   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5064   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5065   ins_encode %{
 5066     int opcode = this->ideal_Opcode();
 5067     int vlen = Matcher::vector_length(this, $src);
 5068     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5069   %}
 5070   ins_pipe( pipe_slow );
 5071 %}
 5072 
 5073 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5074   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5075   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5076   // src1 contains reduction identity
 5077   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5078   match(Set dst (AddReductionVD src1 src2));
 5079   match(Set dst (MulReductionVD src1 src2));
 5080   effect(TEMP dst);
 5081   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5082   ins_encode %{
 5083     int opcode = this->ideal_Opcode();
 5084     int vlen = Matcher::vector_length(this, $src2);
 5085     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5086   %}
 5087   ins_pipe( pipe_slow );
 5088 %}
 5089 
 5090 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5091   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5092   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5093   // src1 contains reduction identity
 5094   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5095   match(Set dst (AddReductionVD src1 src2));
 5096   match(Set dst (MulReductionVD src1 src2));
 5097   effect(TEMP dst, TEMP vtmp);
 5098   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5099   ins_encode %{
 5100     int opcode = this->ideal_Opcode();
 5101     int vlen = Matcher::vector_length(this, $src2);
 5102     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5103   %}
 5104   ins_pipe( pipe_slow );
 5105 %}
 5106 
 5107 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5108   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5109   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5110   // src1 contains reduction identity
 5111   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5112   match(Set dst (AddReductionVD src1 src2));
 5113   match(Set dst (MulReductionVD src1 src2));
 5114   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5115   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5116   ins_encode %{
 5117     int opcode = this->ideal_Opcode();
 5118     int vlen = Matcher::vector_length(this, $src2);
 5119     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5120   %}
 5121   ins_pipe( pipe_slow );
 5122 %}
 5123 
 5124 // =======================Byte Reduction==========================================
 5125 
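      // Two flavors: without AVX512BW the EVEX byte/word forms (and with them the upper
      // register bank and 512-bit widths) are unavailable, so that rule is restricted to
      // legVec (legacy xmm0-xmm15) operands; with AVX512BW the plain vec operands can use
      // the full register file.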
 5126 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5127   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5128   match(Set dst (AddReductionVI src1 src2));
 5129   match(Set dst (AndReductionV  src1 src2));
 5130   match(Set dst ( OrReductionV  src1 src2));
 5131   match(Set dst (XorReductionV  src1 src2));
 5132   match(Set dst (MinReductionV  src1 src2));
 5133   match(Set dst (MaxReductionV  src1 src2));
 5134   effect(TEMP vtmp1, TEMP vtmp2);
 5135   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5136   ins_encode %{
 5137     int opcode = this->ideal_Opcode();
 5138     int vlen = Matcher::vector_length(this, $src2);
 5139     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5140   %}
 5141   ins_pipe( pipe_slow );
 5142 %}
 5143 
 5144 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5145   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5146   match(Set dst (AddReductionVI src1 src2));
 5147   match(Set dst (AndReductionV  src1 src2));
 5148   match(Set dst ( OrReductionV  src1 src2));
 5149   match(Set dst (XorReductionV  src1 src2));
 5150   match(Set dst (MinReductionV  src1 src2));
 5151   match(Set dst (MaxReductionV  src1 src2));
 5152   effect(TEMP vtmp1, TEMP vtmp2);
 5153   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5154   ins_encode %{
 5155     int opcode = this->ideal_Opcode();
 5156     int vlen = Matcher::vector_length(this, $src2);
 5157     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5158   %}
 5159   ins_pipe( pipe_slow );
 5160 %}
 5161 
 5162 // =======================Short Reduction==========================================
 5163 
 5164 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5165   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5166   match(Set dst (AddReductionVI src1 src2));
 5167   match(Set dst (MulReductionVI src1 src2));
 5168   match(Set dst (AndReductionV  src1 src2));
 5169   match(Set dst ( OrReductionV  src1 src2));
 5170   match(Set dst (XorReductionV  src1 src2));
 5171   match(Set dst (MinReductionV  src1 src2));
 5172   match(Set dst (MaxReductionV  src1 src2));
 5173   effect(TEMP vtmp1, TEMP vtmp2);
 5174   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5175   ins_encode %{
 5176     int opcode = this->ideal_Opcode();
 5177     int vlen = Matcher::vector_length(this, $src2);
 5178     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5179   %}
 5180   ins_pipe( pipe_slow );
 5181 %}
 5182 
 5183 // =======================Mul Reduction==========================================
 5184 
 5185 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5186   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5187             Matcher::vector_length(n->in(2)) <= 32); // src2
 5188   match(Set dst (MulReductionVI src1 src2));
 5189   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5190   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5191   ins_encode %{
 5192     int opcode = this->ideal_Opcode();
 5193     int vlen = Matcher::vector_length(this, $src2);
 5194     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5195   %}
 5196   ins_pipe( pipe_slow );
 5197 %}
 5198 
 5199 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5200   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5201             Matcher::vector_length(n->in(2)) == 64); // src2
 5202   match(Set dst (MulReductionVI src1 src2));
 5203   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5204   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5205   ins_encode %{
 5206     int opcode = this->ideal_Opcode();
 5207     int vlen = Matcher::vector_length(this, $src2);
 5208     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5209   %}
 5210   ins_pipe( pipe_slow );
 5211 %}
 5212 
 5213 //--------------------Min/Max Float Reduction --------------------
 5214 // Float Min/Max Reduction
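      // Java Math.min/max semantics differ from raw (v)minps/(v)maxps: NaN must propagate
      // and -0.0f must order below +0.0f, so the legacy rules below need extra temporaries
      // for the compare/blend fix-ups. The *_avx10 rules rely on the AVX10.2 min/max
      // instructions, which implement these semantics directly and need fewer temporaries.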
 5215 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5216                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5217   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5218             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5219              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5220             Matcher::vector_length(n->in(2)) == 2);
 5221   match(Set dst (MinReductionV src1 src2));
 5222   match(Set dst (MaxReductionV src1 src2));
 5223   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5224   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5225   ins_encode %{
 5226     assert(UseAVX > 0, "sanity");
 5227 
 5228     int opcode = this->ideal_Opcode();
 5229     int vlen = Matcher::vector_length(this, $src2);
 5230     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5231                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5237                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5238   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5239             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5240              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5241             Matcher::vector_length(n->in(2)) >= 4);
 5242   match(Set dst (MinReductionV src1 src2));
 5243   match(Set dst (MaxReductionV src1 src2));
 5244   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5245   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5246   ins_encode %{
 5247     assert(UseAVX > 0, "sanity");
 5248 
 5249     int opcode = this->ideal_Opcode();
 5250     int vlen = Matcher::vector_length(this, $src2);
 5251     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5252                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5253   %}
 5254   ins_pipe( pipe_slow );
 5255 %}
 5256 
 5257 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5258                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5259   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5260             Matcher::vector_length(n->in(2)) == 2);
 5261   match(Set dst (MinReductionV dst src));
 5262   match(Set dst (MaxReductionV dst src));
 5263   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5264   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5265   ins_encode %{
 5266     assert(UseAVX > 0, "sanity");
 5267 
 5268     int opcode = this->ideal_Opcode();
 5269     int vlen = Matcher::vector_length(this, $src);
 5270     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5271                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5272   %}
 5273   ins_pipe( pipe_slow );
 5274 %}
 5275 
 5276 
 5277 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5278                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5279   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5280             Matcher::vector_length(n->in(2)) >= 4);
 5281   match(Set dst (MinReductionV dst src));
 5282   match(Set dst (MaxReductionV dst src));
 5283   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5284   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5285   ins_encode %{
 5286     assert(UseAVX > 0, "sanity");
 5287 
 5288     int opcode = this->ideal_Opcode();
 5289     int vlen = Matcher::vector_length(this, $src);
 5290     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5291                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5292   %}
 5293   ins_pipe( pipe_slow );
 5294 %}
 5295 
 5296 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5297   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5298             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5299              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5300             Matcher::vector_length(n->in(2)) == 2);
 5301   match(Set dst (MinReductionV src1 src2));
 5302   match(Set dst (MaxReductionV src1 src2));
 5303   effect(TEMP dst, TEMP xtmp1);
 5304   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5305   ins_encode %{
 5306     int opcode = this->ideal_Opcode();
 5307     int vlen = Matcher::vector_length(this, $src2);
 5308     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5309                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5310   %}
 5311   ins_pipe( pipe_slow );
 5312 %}
 5313 
 5314 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5315   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5316             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5317              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5318             Matcher::vector_length(n->in(2)) >= 4);
 5319   match(Set dst (MinReductionV src1 src2));
 5320   match(Set dst (MaxReductionV src1 src2));
 5321   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5322   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5323   ins_encode %{
 5324     int opcode = this->ideal_Opcode();
 5325     int vlen = Matcher::vector_length(this, $src2);
 5326     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5327                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5328   %}
 5329   ins_pipe( pipe_slow );
 5330 %}
 5331 
 5332 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5333   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5334             Matcher::vector_length(n->in(2)) == 2);
 5335   match(Set dst (MinReductionV dst src));
 5336   match(Set dst (MaxReductionV dst src));
 5337   effect(TEMP dst, TEMP xtmp1);
 5338   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5339   ins_encode %{
 5340     int opcode = this->ideal_Opcode();
 5341     int vlen = Matcher::vector_length(this, $src);
 5342     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5343                          $xtmp1$$XMMRegister);
 5344   %}
 5345   ins_pipe( pipe_slow );
 5346 %}
 5347 
 5348 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5349   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5350             Matcher::vector_length(n->in(2)) >= 4);
 5351   match(Set dst (MinReductionV dst src));
 5352   match(Set dst (MaxReductionV dst src));
 5353   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5354   format %{ "vector_minmaxF_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5355   ins_encode %{
 5356     int opcode = this->ideal_Opcode();
 5357     int vlen = Matcher::vector_length(this, $src);
 5358     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5359                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5360   %}
 5361   ins_pipe( pipe_slow );
 5362 %}
 5363 
 5364 //--------------------Min/Max Double Reduction --------------------
 5365 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5366                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5367   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5368             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5369              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5370             Matcher::vector_length(n->in(2)) == 2);
 5371   match(Set dst (MinReductionV src1 src2));
 5372   match(Set dst (MaxReductionV src1 src2));
 5373   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5374   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5375   ins_encode %{
 5376     assert(UseAVX > 0, "sanity");
 5377 
 5378     int opcode = this->ideal_Opcode();
 5379     int vlen = Matcher::vector_length(this, $src2);
 5380     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5381                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5382   %}
 5383   ins_pipe( pipe_slow );
 5384 %}
 5385 
 5386 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5387                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5388   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5389             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5390              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5391             Matcher::vector_length(n->in(2)) >= 4);
 5392   match(Set dst (MinReductionV src1 src2));
 5393   match(Set dst (MaxReductionV src1 src2));
 5394   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5395   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5396   ins_encode %{
 5397     assert(UseAVX > 0, "sanity");
 5398 
 5399     int opcode = this->ideal_Opcode();
 5400     int vlen = Matcher::vector_length(this, $src2);
 5401     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5402                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5403   %}
 5404   ins_pipe( pipe_slow );
 5405 %}
 5406 
 5407 
 5408 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5409                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5410   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5411             Matcher::vector_length(n->in(2)) == 2);
 5412   match(Set dst (MinReductionV dst src));
 5413   match(Set dst (MaxReductionV dst src));
 5414   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5415   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5416   ins_encode %{
 5417     assert(UseAVX > 0, "sanity");
 5418 
 5419     int opcode = this->ideal_Opcode();
 5420     int vlen = Matcher::vector_length(this, $src);
 5421     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5422                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5423   %}
 5424   ins_pipe( pipe_slow );
 5425 %}
 5426 
 5427 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5428                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5429   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5430             Matcher::vector_length(n->in(2)) >= 4);
 5431   match(Set dst (MinReductionV dst src));
 5432   match(Set dst (MaxReductionV dst src));
 5433   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5434   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5435   ins_encode %{
 5436     assert(UseAVX > 0, "sanity");
 5437 
 5438     int opcode = this->ideal_Opcode();
 5439     int vlen = Matcher::vector_length(this, $src);
 5440     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5441                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5442   %}
 5443   ins_pipe( pipe_slow );
 5444 %}
 5445 
 5446 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5447   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5448             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5449              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5450             Matcher::vector_length(n->in(2)) == 2);
 5451   match(Set dst (MinReductionV src1 src2));
 5452   match(Set dst (MaxReductionV src1 src2));
 5453   effect(TEMP dst, TEMP xtmp1);
 5454   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5455   ins_encode %{
 5456     int opcode = this->ideal_Opcode();
 5457     int vlen = Matcher::vector_length(this, $src2);
 5458     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5459                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5460   %}
 5461   ins_pipe( pipe_slow );
 5462 %}
 5463 
 5464 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5465   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5466             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5467              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5468             Matcher::vector_length(n->in(2)) >= 4);
 5469   match(Set dst (MinReductionV src1 src2));
 5470   match(Set dst (MaxReductionV src1 src2));
 5471   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5472   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5473   ins_encode %{
 5474     int opcode = this->ideal_Opcode();
 5475     int vlen = Matcher::vector_length(this, $src2);
 5476     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5477                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5478   %}
 5479   ins_pipe( pipe_slow );
 5480 %}
 5481 
 5482 
 5483 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5484   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5485             Matcher::vector_length(n->in(2)) == 2);
 5486   match(Set dst (MinReductionV dst src));
 5487   match(Set dst (MaxReductionV dst src));
 5488   effect(TEMP dst, TEMP xtmp1);
 5489   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5490   ins_encode %{
 5491     int opcode = this->ideal_Opcode();
 5492     int vlen = Matcher::vector_length(this, $src);
 5493     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5494                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5495   %}
 5496   ins_pipe( pipe_slow );
 5497 %}
 5498 
 5499 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5500   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5501             Matcher::vector_length(n->in(2)) >= 4);
 5502   match(Set dst (MinReductionV dst src));
 5503   match(Set dst (MaxReductionV dst src));
 5504   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5505   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5506   ins_encode %{
 5507     int opcode = this->ideal_Opcode();
 5508     int vlen = Matcher::vector_length(this, $src);
 5509     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5510                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5511   %}
 5512   ins_pipe( pipe_slow );
 5513 %}
 5514 
 5515 // ====================VECTOR ARITHMETIC=======================================
 5516 
 5517 // --------------------------------- ADD --------------------------------------
 5518 
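      // Each element type below comes in three flavors: a destructive two-operand SSE rule
      // (UseAVX == 0, dst op= src), a three-operand AVX register rule, and an AVX rule that
      // folds the second operand straight from memory (only used for vectors wider than
      // 8 bytes). The SUB, MUL and DIV sections below largely follow the same layout.
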
 5519 // Bytes vector add
 5520 instruct vaddB(vec dst, vec src) %{
 5521   predicate(UseAVX == 0);
 5522   match(Set dst (AddVB dst src));
 5523   format %{ "paddb   $dst,$src\t! add packedB" %}
 5524   ins_encode %{
 5525     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5526   %}
 5527   ins_pipe( pipe_slow );
 5528 %}
 5529 
 5530 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5531   predicate(UseAVX > 0);
 5532   match(Set dst (AddVB src1 src2));
 5533   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5534   ins_encode %{
 5535     int vlen_enc = vector_length_encoding(this);
 5536     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5537   %}
 5538   ins_pipe( pipe_slow );
 5539 %}
 5540 
 5541 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5542   predicate((UseAVX > 0) &&
 5543             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5544   match(Set dst (AddVB src (LoadVector mem)));
 5545   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5546   ins_encode %{
 5547     int vlen_enc = vector_length_encoding(this);
 5548     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5549   %}
 5550   ins_pipe( pipe_slow );
 5551 %}
 5552 
 5553 // Shorts/Chars vector add
 5554 instruct vaddS(vec dst, vec src) %{
 5555   predicate(UseAVX == 0);
 5556   match(Set dst (AddVS dst src));
 5557   format %{ "paddw   $dst,$src\t! add packedS" %}
 5558   ins_encode %{
 5559     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5560   %}
 5561   ins_pipe( pipe_slow );
 5562 %}
 5563 
 5564 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5565   predicate(UseAVX > 0);
 5566   match(Set dst (AddVS src1 src2));
 5567   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5568   ins_encode %{
 5569     int vlen_enc = vector_length_encoding(this);
 5570     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5571   %}
 5572   ins_pipe( pipe_slow );
 5573 %}
 5574 
 5575 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5576   predicate((UseAVX > 0) &&
 5577             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5578   match(Set dst (AddVS src (LoadVector mem)));
 5579   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5580   ins_encode %{
 5581     int vlen_enc = vector_length_encoding(this);
 5582     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5583   %}
 5584   ins_pipe( pipe_slow );
 5585 %}
 5586 
 5587 // Integers vector add
 5588 instruct vaddI(vec dst, vec src) %{
 5589   predicate(UseAVX == 0);
 5590   match(Set dst (AddVI dst src));
 5591   format %{ "paddd   $dst,$src\t! add packedI" %}
 5592   ins_encode %{
 5593     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5594   %}
 5595   ins_pipe( pipe_slow );
 5596 %}
 5597 
 5598 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5599   predicate(UseAVX > 0);
 5600   match(Set dst (AddVI src1 src2));
 5601   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5602   ins_encode %{
 5603     int vlen_enc = vector_length_encoding(this);
 5604     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5605   %}
 5606   ins_pipe( pipe_slow );
 5607 %}
 5608 
 5609 
 5610 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5611   predicate((UseAVX > 0) &&
 5612             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5613   match(Set dst (AddVI src (LoadVector mem)));
 5614   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5615   ins_encode %{
 5616     int vlen_enc = vector_length_encoding(this);
 5617     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5618   %}
 5619   ins_pipe( pipe_slow );
 5620 %}
 5621 
 5622 // Longs vector add
 5623 instruct vaddL(vec dst, vec src) %{
 5624   predicate(UseAVX == 0);
 5625   match(Set dst (AddVL dst src));
 5626   format %{ "paddq   $dst,$src\t! add packedL" %}
 5627   ins_encode %{
 5628     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5629   %}
 5630   ins_pipe( pipe_slow );
 5631 %}
 5632 
 5633 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5634   predicate(UseAVX > 0);
 5635   match(Set dst (AddVL src1 src2));
 5636   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5637   ins_encode %{
 5638     int vlen_enc = vector_length_encoding(this);
 5639     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5640   %}
 5641   ins_pipe( pipe_slow );
 5642 %}
 5643 
 5644 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5645   predicate((UseAVX > 0) &&
 5646             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5647   match(Set dst (AddVL src (LoadVector mem)));
 5648   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5649   ins_encode %{
 5650     int vlen_enc = vector_length_encoding(this);
 5651     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5652   %}
 5653   ins_pipe( pipe_slow );
 5654 %}
 5655 
 5656 // Floats vector add
 5657 instruct vaddF(vec dst, vec src) %{
 5658   predicate(UseAVX == 0);
 5659   match(Set dst (AddVF dst src));
 5660   format %{ "addps   $dst,$src\t! add packedF" %}
 5661   ins_encode %{
 5662     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5663   %}
 5664   ins_pipe( pipe_slow );
 5665 %}
 5666 
 5667 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5668   predicate(UseAVX > 0);
 5669   match(Set dst (AddVF src1 src2));
 5670   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5671   ins_encode %{
 5672     int vlen_enc = vector_length_encoding(this);
 5673     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5674   %}
 5675   ins_pipe( pipe_slow );
 5676 %}
 5677 
 5678 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5679   predicate((UseAVX > 0) &&
 5680             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5681   match(Set dst (AddVF src (LoadVector mem)));
 5682   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5683   ins_encode %{
 5684     int vlen_enc = vector_length_encoding(this);
 5685     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5686   %}
 5687   ins_pipe( pipe_slow );
 5688 %}
 5689 
 5690 // Doubles vector add
 5691 instruct vaddD(vec dst, vec src) %{
 5692   predicate(UseAVX == 0);
 5693   match(Set dst (AddVD dst src));
 5694   format %{ "addpd   $dst,$src\t! add packedD" %}
 5695   ins_encode %{
 5696     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5697   %}
 5698   ins_pipe( pipe_slow );
 5699 %}
 5700 
 5701 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5702   predicate(UseAVX > 0);
 5703   match(Set dst (AddVD src1 src2));
 5704   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5705   ins_encode %{
 5706     int vlen_enc = vector_length_encoding(this);
 5707     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5708   %}
 5709   ins_pipe( pipe_slow );
 5710 %}
 5711 
 5712 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5713   predicate((UseAVX > 0) &&
 5714             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5715   match(Set dst (AddVD src (LoadVector mem)));
 5716   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5717   ins_encode %{
 5718     int vlen_enc = vector_length_encoding(this);
 5719     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5720   %}
 5721   ins_pipe( pipe_slow );
 5722 %}
 5723 
 5724 // --------------------------------- SUB --------------------------------------
 5725 
 5726 // Bytes vector sub
 5727 instruct vsubB(vec dst, vec src) %{
 5728   predicate(UseAVX == 0);
 5729   match(Set dst (SubVB dst src));
 5730   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5731   ins_encode %{
 5732     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5733   %}
 5734   ins_pipe( pipe_slow );
 5735 %}
 5736 
 5737 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5738   predicate(UseAVX > 0);
 5739   match(Set dst (SubVB src1 src2));
 5740   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5741   ins_encode %{
 5742     int vlen_enc = vector_length_encoding(this);
 5743     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5744   %}
 5745   ins_pipe( pipe_slow );
 5746 %}
 5747 
 5748 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5749   predicate((UseAVX > 0) &&
 5750             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5751   match(Set dst (SubVB src (LoadVector mem)));
 5752   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5753   ins_encode %{
 5754     int vlen_enc = vector_length_encoding(this);
 5755     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5756   %}
 5757   ins_pipe( pipe_slow );
 5758 %}
 5759 
 5760 // Shorts/Chars vector sub
 5761 instruct vsubS(vec dst, vec src) %{
 5762   predicate(UseAVX == 0);
 5763   match(Set dst (SubVS dst src));
 5764   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5765   ins_encode %{
 5766     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5767   %}
 5768   ins_pipe( pipe_slow );
 5769 %}
 5770 
 5771 
 5772 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5773   predicate(UseAVX > 0);
 5774   match(Set dst (SubVS src1 src2));
 5775   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5776   ins_encode %{
 5777     int vlen_enc = vector_length_encoding(this);
 5778     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5779   %}
 5780   ins_pipe( pipe_slow );
 5781 %}
 5782 
 5783 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5784   predicate((UseAVX > 0) &&
 5785             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5786   match(Set dst (SubVS src (LoadVector mem)));
 5787   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5788   ins_encode %{
 5789     int vlen_enc = vector_length_encoding(this);
 5790     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5791   %}
 5792   ins_pipe( pipe_slow );
 5793 %}
 5794 
 5795 // Integers vector sub
 5796 instruct vsubI(vec dst, vec src) %{
 5797   predicate(UseAVX == 0);
 5798   match(Set dst (SubVI dst src));
 5799   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5800   ins_encode %{
 5801     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5802   %}
 5803   ins_pipe( pipe_slow );
 5804 %}
 5805 
 5806 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5807   predicate(UseAVX > 0);
 5808   match(Set dst (SubVI src1 src2));
 5809   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5810   ins_encode %{
 5811     int vlen_enc = vector_length_encoding(this);
 5812     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5813   %}
 5814   ins_pipe( pipe_slow );
 5815 %}
 5816 
 5817 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5818   predicate((UseAVX > 0) &&
 5819             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5820   match(Set dst (SubVI src (LoadVector mem)));
 5821   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5822   ins_encode %{
 5823     int vlen_enc = vector_length_encoding(this);
 5824     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5825   %}
 5826   ins_pipe( pipe_slow );
 5827 %}
 5828 
 5829 // Longs vector sub
 5830 instruct vsubL(vec dst, vec src) %{
 5831   predicate(UseAVX == 0);
 5832   match(Set dst (SubVL dst src));
 5833   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5834   ins_encode %{
 5835     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5836   %}
 5837   ins_pipe( pipe_slow );
 5838 %}
 5839 
 5840 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5841   predicate(UseAVX > 0);
 5842   match(Set dst (SubVL src1 src2));
 5843   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5844   ins_encode %{
 5845     int vlen_enc = vector_length_encoding(this);
 5846     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5847   %}
 5848   ins_pipe( pipe_slow );
 5849 %}
 5850 
 5851 
 5852 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5853   predicate((UseAVX > 0) &&
 5854             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5855   match(Set dst (SubVL src (LoadVector mem)));
 5856   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5857   ins_encode %{
 5858     int vlen_enc = vector_length_encoding(this);
 5859     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5860   %}
 5861   ins_pipe( pipe_slow );
 5862 %}
 5863 
 5864 // Floats vector sub
 5865 instruct vsubF(vec dst, vec src) %{
 5866   predicate(UseAVX == 0);
 5867   match(Set dst (SubVF dst src));
 5868   format %{ "subps   $dst,$src\t! sub packedF" %}
 5869   ins_encode %{
 5870     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5871   %}
 5872   ins_pipe( pipe_slow );
 5873 %}
 5874 
 5875 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5876   predicate(UseAVX > 0);
 5877   match(Set dst (SubVF src1 src2));
 5878   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5879   ins_encode %{
 5880     int vlen_enc = vector_length_encoding(this);
 5881     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5882   %}
 5883   ins_pipe( pipe_slow );
 5884 %}
 5885 
 5886 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5887   predicate((UseAVX > 0) &&
 5888             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5889   match(Set dst (SubVF src (LoadVector mem)));
 5890   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5891   ins_encode %{
 5892     int vlen_enc = vector_length_encoding(this);
 5893     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5894   %}
 5895   ins_pipe( pipe_slow );
 5896 %}
 5897 
 5898 // Doubles vector sub
 5899 instruct vsubD(vec dst, vec src) %{
 5900   predicate(UseAVX == 0);
 5901   match(Set dst (SubVD dst src));
 5902   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5903   ins_encode %{
 5904     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5905   %}
 5906   ins_pipe( pipe_slow );
 5907 %}
 5908 
 5909 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5910   predicate(UseAVX > 0);
 5911   match(Set dst (SubVD src1 src2));
 5912   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5913   ins_encode %{
 5914     int vlen_enc = vector_length_encoding(this);
 5915     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5916   %}
 5917   ins_pipe( pipe_slow );
 5918 %}
 5919 
 5920 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5921   predicate((UseAVX > 0) &&
 5922             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5923   match(Set dst (SubVD src (LoadVector mem)));
 5924   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5925   ins_encode %{
 5926     int vlen_enc = vector_length_encoding(this);
 5927     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5928   %}
 5929   ins_pipe( pipe_slow );
 5930 %}
 5931 
 5932 // --------------------------------- MUL --------------------------------------
 5933 
 5934 // Byte vector mul
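      // x86 has no packed byte multiply. The rules below therefore either widen the bytes
      // to words (pmovsxbw) or multiply the even/odd byte lanes of each word separately
      // with pmullw/vpmullw, truncate the products back to 8 bits and recombine them.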
 5935 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5936   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5937   match(Set dst (MulVB src1 src2));
 5938   effect(TEMP dst, TEMP xtmp);
 5939   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5940   ins_encode %{
 5941     assert(UseSSE > 3, "required");
 5942     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5943     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5944     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5945     __ psllw($dst$$XMMRegister, 8);
 5946     __ psrlw($dst$$XMMRegister, 8);
 5947     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5948   %}
 5949   ins_pipe( pipe_slow );
 5950 %}
 5951 
 5952 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5953   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5954   match(Set dst (MulVB src1 src2));
 5955   effect(TEMP dst, TEMP xtmp);
 5956   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5957   ins_encode %{
 5958     assert(UseSSE > 3, "required");
 5959     // Odd-index elements
 5960     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5961     __ psrlw($dst$$XMMRegister, 8);
 5962     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5963     __ psrlw($xtmp$$XMMRegister, 8);
 5964     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5965     __ psllw($dst$$XMMRegister, 8);
 5966     // Even-index elements
 5967     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5968     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5969     __ psllw($xtmp$$XMMRegister, 8);
 5970     __ psrlw($xtmp$$XMMRegister, 8);
 5971     // Combine
 5972     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5973   %}
 5974   ins_pipe( pipe_slow );
 5975 %}
 5976 
 5977 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5978   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5979   match(Set dst (MulVB src1 src2));
 5980   effect(TEMP xtmp1, TEMP xtmp2);
 5981   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5982   ins_encode %{
 5983     int vlen_enc = vector_length_encoding(this);
 5984     // Odd-index elements
 5985     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5986     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5987     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5988     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5989     // Even-index elements
 5990     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5991     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5992     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5993     // Combine
 5994     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5995   %}
 5996   ins_pipe( pipe_slow );
 5997 %}
 5998 
 5999 // Shorts/Chars vector mul
 6000 instruct vmulS(vec dst, vec src) %{
 6001   predicate(UseAVX == 0);
 6002   match(Set dst (MulVS dst src));
 6003   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 6004   ins_encode %{
 6005     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 6006   %}
 6007   ins_pipe( pipe_slow );
 6008 %}
 6009 
 6010 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 6011   predicate(UseAVX > 0);
 6012   match(Set dst (MulVS src1 src2));
 6013   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 6014   ins_encode %{
 6015     int vlen_enc = vector_length_encoding(this);
 6016     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6017   %}
 6018   ins_pipe( pipe_slow );
 6019 %}
 6020 
 6021 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 6022   predicate((UseAVX > 0) &&
 6023             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6024   match(Set dst (MulVS src (LoadVector mem)));
 6025   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 6026   ins_encode %{
 6027     int vlen_enc = vector_length_encoding(this);
 6028     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6029   %}
 6030   ins_pipe( pipe_slow );
 6031 %}
 6032 
 6033 // Integers vector mul
 6034 instruct vmulI(vec dst, vec src) %{
 6035   predicate(UseAVX == 0);
 6036   match(Set dst (MulVI dst src));
 6037   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6038   ins_encode %{
 6039     assert(UseSSE > 3, "required");
 6040     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6041   %}
 6042   ins_pipe( pipe_slow );
 6043 %}
 6044 
 6045 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6046   predicate(UseAVX > 0);
 6047   match(Set dst (MulVI src1 src2));
 6048   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6049   ins_encode %{
 6050     int vlen_enc = vector_length_encoding(this);
 6051     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6052   %}
 6053   ins_pipe( pipe_slow );
 6054 %}
 6055 
 6056 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6057   predicate((UseAVX > 0) &&
 6058             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6059   match(Set dst (MulVI src (LoadVector mem)));
 6060   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6061   ins_encode %{
 6062     int vlen_enc = vector_length_encoding(this);
 6063     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6064   %}
 6065   ins_pipe( pipe_slow );
 6066 %}
 6067 
 6068 // Longs vector mul
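      // Without a native packed 64-bit multiply (evpmullq needs AVX512DQ/VLDQ), the low
      // 64 bits of each product are assembled from 32-bit multiplies:
      //   (aHi*2^32 + aLo) * (bHi*2^32 + bLo)  mod 2^64
      //     == ((aHi*bLo + aLo*bHi) << 32) + aLo*bLo
      // which is what vmulL/vmulL_reg below compute with pmulld/vpmulld (cross terms)
      // and pmuludq/vpmuludq (low-by-low 64-bit product).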
 6069 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6070   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6071              VM_Version::supports_avx512dq()) ||
 6072             VM_Version::supports_avx512vldq());
 6073   match(Set dst (MulVL src1 src2));
 6074   ins_cost(500);
 6075   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6076   ins_encode %{
 6077     assert(UseAVX > 2, "required");
 6078     int vlen_enc = vector_length_encoding(this);
 6079     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6080   %}
 6081   ins_pipe( pipe_slow );
 6082 %}
 6083 
 6084 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6085   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6086              VM_Version::supports_avx512dq()) ||
 6087             (Matcher::vector_length_in_bytes(n) > 8 &&
 6088              VM_Version::supports_avx512vldq()));
 6089   match(Set dst (MulVL src (LoadVector mem)));
 6090   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6091   ins_cost(500);
 6092   ins_encode %{
 6093     assert(UseAVX > 2, "required");
 6094     int vlen_enc = vector_length_encoding(this);
 6095     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6096   %}
 6097   ins_pipe( pipe_slow );
 6098 %}
 6099 
 6100 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6101   predicate(UseAVX == 0);
 6102   match(Set dst (MulVL src1 src2));
 6103   ins_cost(500);
 6104   effect(TEMP dst, TEMP xtmp);
 6105   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6106   ins_encode %{
 6107     assert(VM_Version::supports_sse4_1(), "required");
 6108     // Get the cross products (a_lo*b_hi and a_hi*b_lo); only their low 32 bits are needed
 6109     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6110     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6111     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6112     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6113     __ psllq($dst$$XMMRegister, 32);
 6114     // Get the lo-lo products
 6115     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6116     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6117     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6118   %}
 6119   ins_pipe( pipe_slow );
 6120 %}
 6121 
 6122 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6123   predicate(UseAVX > 0 &&
 6124             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6125               !VM_Version::supports_avx512dq()) ||
 6126              (Matcher::vector_length_in_bytes(n) < 64 &&
 6127               !VM_Version::supports_avx512vldq())));
 6128   match(Set dst (MulVL src1 src2));
 6129   effect(TEMP xtmp1, TEMP xtmp2);
 6130   ins_cost(500);
 6131   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6132   ins_encode %{
 6133     int vlen_enc = vector_length_encoding(this);
 6134     // Get the cross products (a_lo*b_hi and a_hi*b_lo); only their low 32 bits are needed
 6135     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6136     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6137     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6138     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6139     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6140     // Get the lo-lo products
 6141     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6142     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6143   %}
 6144   ins_pipe( pipe_slow );
 6145 %}
 6146 
 6147 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6148   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6149   match(Set dst (MulVL src1 src2));
 6150   ins_cost(100);
 6151   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6152   ins_encode %{
 6153     int vlen_enc = vector_length_encoding(this);
 6154     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6155   %}
 6156   ins_pipe( pipe_slow );
 6157 %}
 6158 
 6159 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6160   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6161   match(Set dst (MulVL src1 src2));
 6162   ins_cost(100);
 6163   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6164   ins_encode %{
 6165     int vlen_enc = vector_length_encoding(this);
 6166     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6167   %}
 6168   ins_pipe( pipe_slow );
 6169 %}
 6170 
 6171 // Floats vector mul
 6172 instruct vmulF(vec dst, vec src) %{
 6173   predicate(UseAVX == 0);
 6174   match(Set dst (MulVF dst src));
 6175   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6176   ins_encode %{
 6177     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6178   %}
 6179   ins_pipe( pipe_slow );
 6180 %}
 6181 
 6182 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6183   predicate(UseAVX > 0);
 6184   match(Set dst (MulVF src1 src2));
 6185   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6186   ins_encode %{
 6187     int vlen_enc = vector_length_encoding(this);
 6188     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6189   %}
 6190   ins_pipe( pipe_slow );
 6191 %}
 6192 
 6193 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6194   predicate((UseAVX > 0) &&
 6195             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6196   match(Set dst (MulVF src (LoadVector mem)));
 6197   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6198   ins_encode %{
 6199     int vlen_enc = vector_length_encoding(this);
 6200     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6201   %}
 6202   ins_pipe( pipe_slow );
 6203 %}
 6204 
 6205 // Doubles vector mul
 6206 instruct vmulD(vec dst, vec src) %{
 6207   predicate(UseAVX == 0);
 6208   match(Set dst (MulVD dst src));
 6209   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6210   ins_encode %{
 6211     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6212   %}
 6213   ins_pipe( pipe_slow );
 6214 %}
 6215 
 6216 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6217   predicate(UseAVX > 0);
 6218   match(Set dst (MulVD src1 src2));
 6219   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6220   ins_encode %{
 6221     int vlen_enc = vector_length_encoding(this);
 6222     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6223   %}
 6224   ins_pipe( pipe_slow );
 6225 %}
 6226 
 6227 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6228   predicate((UseAVX > 0) &&
 6229             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6230   match(Set dst (MulVD src (LoadVector mem)));
 6231   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6232   ins_encode %{
 6233     int vlen_enc = vector_length_encoding(this);
 6234     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6235   %}
 6236   ins_pipe( pipe_slow );
 6237 %}
 6238 
 6239 // --------------------------------- DIV --------------------------------------
 6240 
 6241 // Floats vector div
 6242 instruct vdivF(vec dst, vec src) %{
 6243   predicate(UseAVX == 0);
 6244   match(Set dst (DivVF dst src));
 6245   format %{ "divps   $dst,$src\t! div packedF" %}
 6246   ins_encode %{
 6247     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6248   %}
 6249   ins_pipe( pipe_slow );
 6250 %}
 6251 
 6252 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6253   predicate(UseAVX > 0);
 6254   match(Set dst (DivVF src1 src2));
 6255   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6256   ins_encode %{
 6257     int vlen_enc = vector_length_encoding(this);
 6258     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6259   %}
 6260   ins_pipe( pipe_slow );
 6261 %}
 6262 
 6263 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6264   predicate((UseAVX > 0) &&
 6265             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6266   match(Set dst (DivVF src (LoadVector mem)));
 6267   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6268   ins_encode %{
 6269     int vlen_enc = vector_length_encoding(this);
 6270     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6271   %}
 6272   ins_pipe( pipe_slow );
 6273 %}
 6274 
 6275 // Doubles vector div
 6276 instruct vdivD(vec dst, vec src) %{
 6277   predicate(UseAVX == 0);
 6278   match(Set dst (DivVD dst src));
 6279   format %{ "divpd   $dst,$src\t! div packedD" %}
 6280   ins_encode %{
 6281     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6282   %}
 6283   ins_pipe( pipe_slow );
 6284 %}
 6285 
 6286 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6287   predicate(UseAVX > 0);
 6288   match(Set dst (DivVD src1 src2));
 6289   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6290   ins_encode %{
 6291     int vlen_enc = vector_length_encoding(this);
 6292     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6293   %}
 6294   ins_pipe( pipe_slow );
 6295 %}
 6296 
 6297 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6298   predicate((UseAVX > 0) &&
 6299             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6300   match(Set dst (DivVD src (LoadVector mem)));
 6301   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6302   ins_encode %{
 6303     int vlen_enc = vector_length_encoding(this);
 6304     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6305   %}
 6306   ins_pipe( pipe_slow );
 6307 %}
 6308 
 6309 // ------------------------------ MinMax ---------------------------------------
 6310 
 6311 // Byte, Short, Int vector Min/Max
 6312 instruct minmax_reg_sse(vec dst, vec src) %{
 6313   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6314             UseAVX == 0);
 6315   match(Set dst (MinV dst src));
 6316   match(Set dst (MaxV dst src));
 6317   format %{ "vector_minmax  $dst,$src\t!  " %}
 6318   ins_encode %{
 6319     assert(UseSSE >= 4, "required");
 6320 
 6321     int opcode = this->ideal_Opcode();
 6322     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6323     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6324   %}
 6325   ins_pipe( pipe_slow );
 6326 %}
 6327 
 6328 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6329   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6330             UseAVX > 0);
 6331   match(Set dst (MinV src1 src2));
 6332   match(Set dst (MaxV src1 src2));
 6333   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6334   ins_encode %{
 6335     int opcode = this->ideal_Opcode();
 6336     int vlen_enc = vector_length_encoding(this);
 6337     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6338 
 6339     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6340   %}
 6341   ins_pipe( pipe_slow );
 6342 %}
 6343 
 6344 // Long vector Min/Max
 6345 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6346   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6347             UseAVX == 0);
 6348   match(Set dst (MinV dst src));
 6349   match(Set dst (MaxV dst src));
 6350   effect(TEMP dst, TEMP tmp);
 6351   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6352   ins_encode %{
 6353     assert(UseSSE >= 4, "required");
 6354 
 6355     int opcode = this->ideal_Opcode();
 6356     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6357     assert(elem_bt == T_LONG, "sanity");
 6358 
 6359     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6360   %}
 6361   ins_pipe( pipe_slow );
 6362 %}
 6363 
 6364 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6365   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6366             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6367   match(Set dst (MinV src1 src2));
 6368   match(Set dst (MaxV src1 src2));
 6369   effect(TEMP dst);
 6370   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6371   ins_encode %{
 6372     int vlen_enc = vector_length_encoding(this);
 6373     int opcode = this->ideal_Opcode();
 6374     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6375     assert(elem_bt == T_LONG, "sanity");
 6376 
 6377     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6378   %}
 6379   ins_pipe( pipe_slow );
 6380 %}
 6381 
 6382 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6383   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6384             Matcher::vector_element_basic_type(n) == T_LONG);
 6385   match(Set dst (MinV src1 src2));
 6386   match(Set dst (MaxV src1 src2));
 6387   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6388   ins_encode %{
 6389     assert(UseAVX > 2, "required");
 6390 
 6391     int vlen_enc = vector_length_encoding(this);
 6392     int opcode = this->ideal_Opcode();
 6393     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6394     assert(elem_bt == T_LONG, "sanity");
 6395 
 6396     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6397   %}
 6398   ins_pipe( pipe_slow );
 6399 %}
 6400 
 6401 // Float/Double vector Min/Max
 6402 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6403   predicate(VM_Version::supports_avx10_2() &&
 6404             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6405   match(Set dst (MinV a b));
 6406   match(Set dst (MaxV a b));
 6407   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6408   ins_encode %{
 6409     int vlen_enc = vector_length_encoding(this);
 6410     int opcode = this->ideal_Opcode();
 6411     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6412     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6413   %}
 6414   ins_pipe( pipe_slow );
 6415 %}
 6416 
 6417 // Float/Double vector Min/Max
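      // Note: x86 min/max instructions (minps/maxps etc.) return the second operand when
      // either input is NaN and do not order -0.0 below +0.0, which differs from Java's
      // Math.min/max. The sequences below therefore use compare/blend fix-ups with extra
      // temporaries to produce the Java-specified result.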
 6418 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6419   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6420             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6421             UseAVX > 0);
 6422   match(Set dst (MinV a b));
 6423   match(Set dst (MaxV a b));
 6424   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6425   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6426   ins_encode %{
 6427     assert(UseAVX > 0, "required");
 6428 
 6429     int opcode = this->ideal_Opcode();
 6430     int vlen_enc = vector_length_encoding(this);
 6431     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6432 
 6433     __ vminmax_fp(opcode, elem_bt,
 6434                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6435                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6436   %}
 6437   ins_pipe( pipe_slow );
 6438 %}
 6439 
 6440 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6441   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6442             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6443   match(Set dst (MinV a b));
 6444   match(Set dst (MaxV a b));
 6445   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6446   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6447   ins_encode %{
 6448     assert(UseAVX > 2, "required");
 6449 
 6450     int opcode = this->ideal_Opcode();
 6451     int vlen_enc = vector_length_encoding(this);
 6452     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6453 
 6454     __ evminmax_fp(opcode, elem_bt,
 6455                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6456                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6457   %}
 6458   ins_pipe( pipe_slow );
 6459 %}
 6460 
 6461 // ------------------------------ Unsigned vector Min/Max ----------------------
 6462 
 6463 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6464   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6465   match(Set dst (UMinV a b));
 6466   match(Set dst (UMaxV a b));
 6467   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6468   ins_encode %{
 6469     int opcode = this->ideal_Opcode();
 6470     int vlen_enc = vector_length_encoding(this);
 6471     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6472     assert(is_integral_type(elem_bt), "");
 6473     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6474   %}
 6475   ins_pipe( pipe_slow );
 6476 %}
 6477 
 6478 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6479   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6480   match(Set dst (UMinV a (LoadVector b)));
 6481   match(Set dst (UMaxV a (LoadVector b)));
 6482   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6483   ins_encode %{
 6484     int opcode = this->ideal_Opcode();
 6485     int vlen_enc = vector_length_encoding(this);
 6486     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6487     assert(is_integral_type(elem_bt), "");
 6488     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6489   %}
 6490   ins_pipe( pipe_slow );
 6491 %}
 6492 
 6493 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6494   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6495   match(Set dst (UMinV a b));
 6496   match(Set dst (UMaxV a b));
 6497   effect(TEMP xtmp1, TEMP xtmp2);
 6498   format %{ "vector_uminmaxq $dst,$a,$b\t! using $xtmp1 and $xtmp2 as TEMP" %}
 6499   ins_encode %{
 6500     int opcode = this->ideal_Opcode();
 6501     int vlen_enc = vector_length_encoding(this);
 6502     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6503   %}
 6504   ins_pipe( pipe_slow );
 6505 %}
 6506 
 6507 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6508   match(Set dst (UMinV (Binary dst src2) mask));
 6509   match(Set dst (UMaxV (Binary dst src2) mask));
 6510   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6511   ins_encode %{
 6512     int vlen_enc = vector_length_encoding(this);
 6513     BasicType bt = Matcher::vector_element_basic_type(this);
 6514     int opc = this->ideal_Opcode();
 6515     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6516                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6517   %}
 6518   ins_pipe( pipe_slow );
 6519 %}
 6520 
 6521 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6522   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6523   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6524   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6525   ins_encode %{
 6526     int vlen_enc = vector_length_encoding(this);
 6527     BasicType bt = Matcher::vector_element_basic_type(this);
 6528     int opc = this->ideal_Opcode();
 6529     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6530                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6531   %}
 6532   ins_pipe( pipe_slow );
 6533 %}
 6534 
 6535 // --------------------------------- Signum/CopySign ---------------------------
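      // Signum follows Math.signum semantics: a zero argument returns that zero, NaN
      // returns NaN, and any other value returns +/-1.0 with the sign of the argument.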
 6536 
 6537 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6538   match(Set dst (SignumF dst (Binary zero one)));
 6539   effect(KILL cr);
 6540   format %{ "signumF $dst, $dst" %}
 6541   ins_encode %{
 6542     int opcode = this->ideal_Opcode();
 6543     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6544   %}
 6545   ins_pipe( pipe_slow );
 6546 %}
 6547 
 6548 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6549   match(Set dst (SignumD dst (Binary zero one)));
 6550   effect(KILL cr);
 6551   format %{ "signumD $dst, $dst" %}
 6552   ins_encode %{
 6553     int opcode = this->ideal_Opcode();
 6554     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6555   %}
 6556   ins_pipe( pipe_slow );
 6557 %}
 6558 
 6559 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6560   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6561   match(Set dst (SignumVF src (Binary zero one)));
 6562   match(Set dst (SignumVD src (Binary zero one)));
 6563   effect(TEMP dst, TEMP xtmp1);
 6564   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6565   ins_encode %{
 6566     int opcode = this->ideal_Opcode();
 6567     int vec_enc = vector_length_encoding(this);
 6568     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6569                          $xtmp1$$XMMRegister, vec_enc);
 6570   %}
 6571   ins_pipe( pipe_slow );
 6572 %}
 6573 
 6574 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6575   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6576   match(Set dst (SignumVF src (Binary zero one)));
 6577   match(Set dst (SignumVD src (Binary zero one)));
 6578   effect(TEMP dst, TEMP ktmp1);
 6579   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6580   ins_encode %{
 6581     int opcode = this->ideal_Opcode();
 6582     int vec_enc = vector_length_encoding(this);
 6583     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6584                           $ktmp1$$KRegister, vec_enc);
 6585   %}
 6586   ins_pipe( pipe_slow );
 6587 %}
 6588 
 6589 // ---------------------------------------
 6590 // For copySign use 0xE4 as writemask for vpternlog
 6591 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6592 // C (xmm2) is set to 0x7FFFFFFF
 6593 // Wherever xmm2 is 0, we want to pick from B (sign)
 6594 // Wherever xmm2 is 1, we want to pick from A (src)
 6595 //
 6596 // A B C Result
 6597 // 0 0 0 0
 6598 // 0 0 1 0
 6599 // 0 1 0 1
 6600 // 0 1 1 0
 6601 // 1 0 0 0
 6602 // 1 0 1 1
 6603 // 1 1 0 1
 6604 // 1 1 1 1
 6605 //
 6606 // Result going from high bit to low bit is binary 11100100 = 0xE4
 6607 // ---------------------------------------
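      // Worked lookup: vpternlog forms a 3-bit index (A<<2)|(B<<1)|C at every bit position
      // and returns bit[index] of the immediate. With imm8 = 0xE4 (binary 11100100) every
      // index with C = 1 selects a bit equal to A and every index with C = 0 selects a bit
      // equal to B, i.e. the all-ones-except-sign mask in C blends the magnitude bits from A
      // with the sign bit from B.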
 6608 
 6609 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6610   match(Set dst (CopySignF dst src));
 6611   effect(TEMP tmp1, TEMP tmp2);
 6612   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6613   ins_encode %{
 6614     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6615     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6616     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6617   %}
 6618   ins_pipe( pipe_slow );
 6619 %}
 6620 
 6621 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6622   match(Set dst (CopySignD dst (Binary src zero)));
 6623   ins_cost(100);
 6624   effect(TEMP tmp1, TEMP tmp2);
 6625   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6626   ins_encode %{
 6627     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6628     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6629     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6630   %}
 6631   ins_pipe( pipe_slow );
 6632 %}
 6633 
 6634 //----------------------------- CompressBits/ExpandBits ------------------------
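      // CompressBits maps to the BMI2 pext instruction (gather the src bits selected by
      // mask into the low-order result bits); ExpandBits maps to pdep (scatter the
      // low-order src bits to the positions selected by mask). For example,
      // pext(0b11010110, 0b00111100) = 0b0101 and pdep(0b0101, 0b00111100) = 0b00010100.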
 6635 
 6636 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6637   predicate(n->bottom_type()->isa_int());
 6638   match(Set dst (CompressBits src mask));
 6639   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6640   ins_encode %{
 6641     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6642   %}
 6643   ins_pipe( pipe_slow );
 6644 %}
 6645 
 6646 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6647   predicate(n->bottom_type()->isa_int());
 6648   match(Set dst (ExpandBits src mask));
 6649   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6650   ins_encode %{
 6651     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6652   %}
 6653   ins_pipe( pipe_slow );
 6654 %}
 6655 
 6656 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6657   predicate(n->bottom_type()->isa_int());
 6658   match(Set dst (CompressBits src (LoadI mask)));
 6659   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6660   ins_encode %{
 6661     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6662   %}
 6663   ins_pipe( pipe_slow );
 6664 %}
 6665 
 6666 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6667   predicate(n->bottom_type()->isa_int());
 6668   match(Set dst (ExpandBits src (LoadI mask)));
 6669   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6670   ins_encode %{
 6671     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6672   %}
 6673   ins_pipe( pipe_slow );
 6674 %}
 6675 
 6676 // --------------------------------- Sqrt --------------------------------------
 6677 
 6678 instruct vsqrtF_reg(vec dst, vec src) %{
 6679   match(Set dst (SqrtVF src));
 6680   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6681   ins_encode %{
 6682     assert(UseAVX > 0, "required");
 6683     int vlen_enc = vector_length_encoding(this);
 6684     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6685   %}
 6686   ins_pipe( pipe_slow );
 6687 %}
 6688 
 6689 instruct vsqrtF_mem(vec dst, memory mem) %{
 6690   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6691   match(Set dst (SqrtVF (LoadVector mem)));
 6692   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6693   ins_encode %{
 6694     assert(UseAVX > 0, "required");
 6695     int vlen_enc = vector_length_encoding(this);
 6696     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6697   %}
 6698   ins_pipe( pipe_slow );
 6699 %}
 6700 
 6701 // Floating point vector sqrt
 6702 instruct vsqrtD_reg(vec dst, vec src) %{
 6703   match(Set dst (SqrtVD src));
 6704   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6705   ins_encode %{
 6706     assert(UseAVX > 0, "required");
 6707     int vlen_enc = vector_length_encoding(this);
 6708     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6709   %}
 6710   ins_pipe( pipe_slow );
 6711 %}
 6712 
 6713 instruct vsqrtD_mem(vec dst, memory mem) %{
 6714   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6715   match(Set dst (SqrtVD (LoadVector mem)));
 6716   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6717   ins_encode %{
 6718     assert(UseAVX > 0, "required");
 6719     int vlen_enc = vector_length_encoding(this);
 6720     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6721   %}
 6722   ins_pipe( pipe_slow );
 6723 %}
 6724 
 6725 // ------------------------------ Shift ---------------------------------------
 6726 
 6727 // Left and right shift count vectors are the same on x86
 6728 // (only lowest bits of xmm reg are used for count).
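      // For the xmm-count forms (psllw/pslld/psllq and friends) the hardware takes the
      // count from the low 64 bits of the operand, so one movdl-loaded count serves every
      // element size and shift direction.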
 6729 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6730   match(Set dst (LShiftCntV cnt));
 6731   match(Set dst (RShiftCntV cnt));
 6732   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6733   ins_encode %{
 6734     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6735   %}
 6736   ins_pipe( pipe_slow );
 6737 %}
 6738 
 6739 // Byte vector shift
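      // SSE/AVX have no packed byte shift instructions, so byte vectors are widened to
      // words (sign- or zero-extended depending on the shift kind), shifted as words,
      // masked back into byte range and re-packed with packuswb/vpackuswb.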
 6740 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6741   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6742   match(Set dst ( LShiftVB src shift));
 6743   match(Set dst ( RShiftVB src shift));
 6744   match(Set dst (URShiftVB src shift));
 6745   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6746   format %{"vector_byte_shift $dst,$src,$shift" %}
 6747   ins_encode %{
 6748     assert(UseSSE > 3, "required");
 6749     int opcode = this->ideal_Opcode();
 6750     bool sign = (opcode != Op_URShiftVB);
 6751     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6752     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6753     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6754     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6755     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6756   %}
 6757   ins_pipe( pipe_slow );
 6758 %}
 6759 
 6760 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6761   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6762             UseAVX <= 1);
 6763   match(Set dst ( LShiftVB src shift));
 6764   match(Set dst ( RShiftVB src shift));
 6765   match(Set dst (URShiftVB src shift));
 6766   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6767   format %{"vector_byte_shift $dst,$src,$shift" %}
 6768   ins_encode %{
 6769     assert(UseSSE > 3, "required");
 6770     int opcode = this->ideal_Opcode();
 6771     bool sign = (opcode != Op_URShiftVB);
 6772     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6773     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6774     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6775     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6776     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6777     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6778     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6779     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6780     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6781   %}
 6782   ins_pipe( pipe_slow );
 6783 %}
 6784 
 6785 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6786   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6787             UseAVX > 1);
 6788   match(Set dst ( LShiftVB src shift));
 6789   match(Set dst ( RShiftVB src shift));
 6790   match(Set dst (URShiftVB src shift));
 6791   effect(TEMP dst, TEMP tmp);
 6792   format %{"vector_byte_shift $dst,$src,$shift" %}
 6793   ins_encode %{
 6794     int opcode = this->ideal_Opcode();
 6795     bool sign = (opcode != Op_URShiftVB);
 6796     int vlen_enc = Assembler::AVX_256bit;
 6797     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6798     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6799     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6800     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6801     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6802   %}
 6803   ins_pipe( pipe_slow );
 6804 %}
 6805 
 6806 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6807   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6808   match(Set dst ( LShiftVB src shift));
 6809   match(Set dst ( RShiftVB src shift));
 6810   match(Set dst (URShiftVB src shift));
 6811   effect(TEMP dst, TEMP tmp);
 6812   format %{"vector_byte_shift $dst,$src,$shift" %}
 6813   ins_encode %{
 6814     assert(UseAVX > 1, "required");
 6815     int opcode = this->ideal_Opcode();
 6816     bool sign = (opcode != Op_URShiftVB);
 6817     int vlen_enc = Assembler::AVX_256bit;
 6818     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6819     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6820     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6821     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6822     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6823     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6824     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6825     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6826     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6827   %}
 6828   ins_pipe( pipe_slow );
 6829 %}
 6830 
 6831 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6832   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6833   match(Set dst ( LShiftVB src shift));
 6834   match(Set dst ( RShiftVB src shift));
 6835   match(Set dst (URShiftVB src shift));
 6836   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6837   format %{"vector_byte_shift $dst,$src,$shift" %}
 6838   ins_encode %{
 6839     assert(UseAVX > 2, "required");
 6840     int opcode = this->ideal_Opcode();
 6841     bool sign = (opcode != Op_URShiftVB);
 6842     int vlen_enc = Assembler::AVX_512bit;
 6843     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6844     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6845     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6846     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6847     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6848     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6849     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6850     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6851     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6852     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6853     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6854     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6855   %}
 6856   ins_pipe( pipe_slow );
 6857 %}
 6858 
 6859 // Shorts vector logical right shift produces an incorrect Java result for
 6860 // negative data, because Java code converts short values to int with sign
 6861 // extension before shifting. Char vectors are fine, since chars are
 6862 // unsigned values.
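      // For example, with short s = -1 Java evaluates (s >>> 1) on the sign-extended int
      // 0xFFFFFFFF, giving 0x7FFFFFFF whose low 16 bits are 0xFFFF, whereas a 16-bit
      // logical shift would give 0x7FFF. A char 0xFFFF zero-extends, so (c >>> 1) = 0x7FFF
      // matches the 16-bit shift.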
 6863 // Shorts/Chars vector shift
 6864 instruct vshiftS(vec dst, vec src, vec shift) %{
 6865   predicate(!n->as_ShiftV()->is_var_shift());
 6866   match(Set dst ( LShiftVS src shift));
 6867   match(Set dst ( RShiftVS src shift));
 6868   match(Set dst (URShiftVS src shift));
 6869   effect(TEMP dst, USE src, USE shift);
 6870   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6871   ins_encode %{
 6872     int opcode = this->ideal_Opcode();
 6873     if (UseAVX > 0) {
 6874       int vlen_enc = vector_length_encoding(this);
 6875       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6876     } else {
 6877       int vlen = Matcher::vector_length(this);
 6878       if (vlen == 2) {
 6879         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6880         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6881       } else if (vlen == 4) {
 6882         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6883         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6884       } else {
 6885         assert (vlen == 8, "sanity");
 6886         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6887         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6888       }
 6889     }
 6890   %}
 6891   ins_pipe( pipe_slow );
 6892 %}
 6893 
 6894 // Integers vector shift
 6895 instruct vshiftI(vec dst, vec src, vec shift) %{
 6896   predicate(!n->as_ShiftV()->is_var_shift());
 6897   match(Set dst ( LShiftVI src shift));
 6898   match(Set dst ( RShiftVI src shift));
 6899   match(Set dst (URShiftVI src shift));
 6900   effect(TEMP dst, USE src, USE shift);
 6901   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6902   ins_encode %{
 6903     int opcode = this->ideal_Opcode();
 6904     if (UseAVX > 0) {
 6905       int vlen_enc = vector_length_encoding(this);
 6906       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6907     } else {
 6908       int vlen = Matcher::vector_length(this);
 6909       if (vlen == 2) {
 6910         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6911         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6912       } else {
 6913         assert(vlen == 4, "sanity");
 6914         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6915         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6916       }
 6917     }
 6918   %}
 6919   ins_pipe( pipe_slow );
 6920 %}
 6921 
 6922 // Integers vector constant shift
 6923 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6924   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6925   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6926   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6927   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6928   ins_encode %{
 6929     int opcode = this->ideal_Opcode();
 6930     if (UseAVX > 0) {
 6931       int vector_len = vector_length_encoding(this);
 6932       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6933     } else {
 6934       int vlen = Matcher::vector_length(this);
 6935       if (vlen == 2) {
 6936         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6937         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6938       } else {
 6939         assert(vlen == 4, "sanity");
 6940         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6941         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6942       }
 6943     }
 6944   %}
 6945   ins_pipe( pipe_slow );
 6946 %}
 6947 
 6948 // Longs vector shift
 6949 instruct vshiftL(vec dst, vec src, vec shift) %{
 6950   predicate(!n->as_ShiftV()->is_var_shift());
 6951   match(Set dst ( LShiftVL src shift));
 6952   match(Set dst (URShiftVL src shift));
 6953   effect(TEMP dst, USE src, USE shift);
 6954   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6955   ins_encode %{
 6956     int opcode = this->ideal_Opcode();
 6957     if (UseAVX > 0) {
 6958       int vlen_enc = vector_length_encoding(this);
 6959       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6960     } else {
 6961       assert(Matcher::vector_length(this) == 2, "");
 6962       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6963       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6964     }
 6965   %}
 6966   ins_pipe( pipe_slow );
 6967 %}
 6968 
 6969 // Longs vector constant shift
 6970 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6971   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6972   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6973   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6974   ins_encode %{
 6975     int opcode = this->ideal_Opcode();
 6976     if (UseAVX > 0) {
 6977       int vector_len = vector_length_encoding(this);
 6978       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6979     } else {
 6980       assert(Matcher::vector_length(this) == 2, "");
 6981       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6982       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6983     }
 6984   %}
 6985   ins_pipe( pipe_slow );
 6986 %}
 6987 
 6988 // -------------------ArithmeticRightShift -----------------------------------
 6989 // Long vector arithmetic right shift
 6990 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6991   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6992   match(Set dst (RShiftVL src shift));
 6993   effect(TEMP dst, TEMP tmp);
 6994   format %{ "vshiftq $dst,$src,$shift" %}
 6995   ins_encode %{
 6996     uint vlen = Matcher::vector_length(this);
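          // Pre-AVX-512 x86 has no 64-bit arithmetic right shift, so it is synthesized from
          // the logical shift via the usual sign-extension identity: with
          // m = (0x8000000000000000 >> n) (logical), sra(x, n) == (srl(x, n) ^ m) - m.
          // The mask m is produced by shifting vector_long_sign_mask() by the same count.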
 6997     if (vlen == 2) {
 6998       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6999       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 7000       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7001       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 7002       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 7003       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 7004     } else {
 7005       assert(vlen == 4, "sanity");
 7006       assert(UseAVX > 1, "required");
 7007       int vlen_enc = Assembler::AVX_256bit;
 7008       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7009       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 7010       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7011       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7012       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 7013     }
 7014   %}
 7015   ins_pipe( pipe_slow );
 7016 %}
 7017 
 7018 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 7019   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 7020   match(Set dst (RShiftVL src shift));
 7021   format %{ "vshiftq $dst,$src,$shift" %}
 7022   ins_encode %{
 7023     int vlen_enc = vector_length_encoding(this);
 7024     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7025   %}
 7026   ins_pipe( pipe_slow );
 7027 %}
 7028 
 7029 // ------------------- Variable Shift -----------------------------
 7030 // Byte variable shift
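      // No x86 extension provides per-element (variable-count) byte shifts, so byte lanes
      // are widened to words, shifted with variable word/dword shifts, and narrowed back.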
 7031 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7032   predicate(Matcher::vector_length(n) <= 8 &&
 7033             n->as_ShiftV()->is_var_shift() &&
 7034             !VM_Version::supports_avx512bw());
 7035   match(Set dst ( LShiftVB src shift));
 7036   match(Set dst ( RShiftVB src shift));
 7037   match(Set dst (URShiftVB src shift));
 7038   effect(TEMP dst, TEMP vtmp);
 7039   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7040   ins_encode %{
 7041     assert(UseAVX >= 2, "required");
 7042 
 7043     int opcode = this->ideal_Opcode();
 7044     int vlen_enc = Assembler::AVX_128bit;
 7045     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7046     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7047   %}
 7048   ins_pipe( pipe_slow );
 7049 %}
 7050 
 7051 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7052   predicate(Matcher::vector_length(n) == 16 &&
 7053             n->as_ShiftV()->is_var_shift() &&
 7054             !VM_Version::supports_avx512bw());
 7055   match(Set dst ( LShiftVB src shift));
 7056   match(Set dst ( RShiftVB src shift));
 7057   match(Set dst (URShiftVB src shift));
 7058   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7059   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7060   ins_encode %{
 7061     assert(UseAVX >= 2, "required");
 7062 
 7063     int opcode = this->ideal_Opcode();
 7064     int vlen_enc = Assembler::AVX_128bit;
 7065     // Shift lower half and get word result in dst
 7066     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7067 
 7068     // Shift upper half and get word result in vtmp1
 7069     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7070     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7071     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7072 
 7073     // Merge and down convert the two word results to byte in dst
 7074     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7075   %}
 7076   ins_pipe( pipe_slow );
 7077 %}
 7078 
 7079 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7080   predicate(Matcher::vector_length(n) == 32 &&
 7081             n->as_ShiftV()->is_var_shift() &&
 7082             !VM_Version::supports_avx512bw());
 7083   match(Set dst ( LShiftVB src shift));
 7084   match(Set dst ( RShiftVB src shift));
 7085   match(Set dst (URShiftVB src shift));
 7086   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7087   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7088   ins_encode %{
 7089     assert(UseAVX >= 2, "required");
 7090 
 7091     int opcode = this->ideal_Opcode();
 7092     int vlen_enc = Assembler::AVX_128bit;
 7093     // Process lower 128 bits and get result in dst
 7094     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7095     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7096     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7097     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7098     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7099 
 7100     // Process higher 128 bits and get result in vtmp3
 7101     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7102     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7103     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7104     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7105     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7106     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7107     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7108 
 7109     // Merge the two results in dst
 7110     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7111   %}
 7112   ins_pipe( pipe_slow );
 7113 %}
 7114 
 7115 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7116   predicate(Matcher::vector_length(n) <= 32 &&
 7117             n->as_ShiftV()->is_var_shift() &&
 7118             VM_Version::supports_avx512bw());
 7119   match(Set dst ( LShiftVB src shift));
 7120   match(Set dst ( RShiftVB src shift));
 7121   match(Set dst (URShiftVB src shift));
 7122   effect(TEMP dst, TEMP vtmp);
 7123   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7124   ins_encode %{
 7125     assert(UseAVX > 2, "required");
 7126 
 7127     int opcode = this->ideal_Opcode();
 7128     int vlen_enc = vector_length_encoding(this);
 7129     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7130   %}
 7131   ins_pipe( pipe_slow );
 7132 %}
 7133 
 7134 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7135   predicate(Matcher::vector_length(n) == 64 &&
 7136             n->as_ShiftV()->is_var_shift() &&
 7137             VM_Version::supports_avx512bw());
 7138   match(Set dst ( LShiftVB src shift));
 7139   match(Set dst ( RShiftVB src shift));
 7140   match(Set dst (URShiftVB src shift));
 7141   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7142   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7143   ins_encode %{
 7144     assert(UseAVX > 2, "required");
 7145 
 7146     int opcode = this->ideal_Opcode();
 7147     int vlen_enc = Assembler::AVX_256bit;
 7148     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7149     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7150     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7151     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7152     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7153   %}
 7154   ins_pipe( pipe_slow );
 7155 %}
 7156 
 7157 // Short variable shift
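      // Without AVX-512BW there are no variable word shifts (vpsllvw/vpsrlvw/vpsravw), so
      // shorts are widened to ints, shifted with vpsllvd/vpsrlvd/vpsravd, masked and packed
      // back down. With AVX-512BW the word-granular variable shifts are used directly.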
 7158 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7159   predicate(Matcher::vector_length(n) <= 8 &&
 7160             n->as_ShiftV()->is_var_shift() &&
 7161             !VM_Version::supports_avx512bw());
 7162   match(Set dst ( LShiftVS src shift));
 7163   match(Set dst ( RShiftVS src shift));
 7164   match(Set dst (URShiftVS src shift));
 7165   effect(TEMP dst, TEMP vtmp);
 7166   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7167   ins_encode %{
 7168     assert(UseAVX >= 2, "required");
 7169 
 7170     int opcode = this->ideal_Opcode();
 7171     bool sign = (opcode != Op_URShiftVS);
 7172     int vlen_enc = Assembler::AVX_256bit;
 7173     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7174     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7175     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7176     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7177     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7178     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7179   %}
 7180   ins_pipe( pipe_slow );
 7181 %}
 7182 
 7183 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7184   predicate(Matcher::vector_length(n) == 16 &&
 7185             n->as_ShiftV()->is_var_shift() &&
 7186             !VM_Version::supports_avx512bw());
 7187   match(Set dst ( LShiftVS src shift));
 7188   match(Set dst ( RShiftVS src shift));
 7189   match(Set dst (URShiftVS src shift));
 7190   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7191   format %{ "vector_varshift_short $dst, $src, $shift\n\t" %}
 7192   ins_encode %{
 7193     assert(UseAVX >= 2, "required");
 7194 
 7195     int opcode = this->ideal_Opcode();
 7196     bool sign = (opcode != Op_URShiftVS);
 7197     int vlen_enc = Assembler::AVX_256bit;
 7198     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7199     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7200     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7201     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7202     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7203 
 7204     // Shift upper half, with result in dst using vtmp1 as TEMP
 7205     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7206     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7207     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7208     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7209     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7210     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7211 
 7212     // Merge lower and upper half result into dst
 7213     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7214     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7215   %}
 7216   ins_pipe( pipe_slow );
 7217 %}
 7218 
 7219 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7220   predicate(n->as_ShiftV()->is_var_shift() &&
 7221             VM_Version::supports_avx512bw());
 7222   match(Set dst ( LShiftVS src shift));
 7223   match(Set dst ( RShiftVS src shift));
 7224   match(Set dst (URShiftVS src shift));
 7225   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7226   ins_encode %{
 7227     assert(UseAVX > 2, "required");
 7228 
 7229     int opcode = this->ideal_Opcode();
 7230     int vlen_enc = vector_length_encoding(this);
 7231     if (!VM_Version::supports_avx512vl()) {
 7232       vlen_enc = Assembler::AVX_512bit;
 7233     }
 7234     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7235   %}
 7236   ins_pipe( pipe_slow );
 7237 %}
 7238 
 7239 // Integer variable shift
 7240 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7241   predicate(n->as_ShiftV()->is_var_shift());
 7242   match(Set dst ( LShiftVI src shift));
 7243   match(Set dst ( RShiftVI src shift));
 7244   match(Set dst (URShiftVI src shift));
 7245   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7246   ins_encode %{
 7247     assert(UseAVX >= 2, "required");
 7248 
 7249     int opcode = this->ideal_Opcode();
 7250     int vlen_enc = vector_length_encoding(this);
 7251     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7252   %}
 7253   ins_pipe( pipe_slow );
 7254 %}
 7255 
 7256 // Long variable shift
 7257 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7258   predicate(n->as_ShiftV()->is_var_shift());
 7259   match(Set dst ( LShiftVL src shift));
 7260   match(Set dst (URShiftVL src shift));
 7261   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7262   ins_encode %{
 7263     assert(UseAVX >= 2, "required");
 7264 
 7265     int opcode = this->ideal_Opcode();
 7266     int vlen_enc = vector_length_encoding(this);
 7267     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7268   %}
 7269   ins_pipe( pipe_slow );
 7270 %}
 7271 
 7272 // Long variable arithmetic right shift
 7273 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7274   predicate(Matcher::vector_length(n) <= 4 &&
 7275             n->as_ShiftV()->is_var_shift() &&
 7276             UseAVX == 2);
 7277   match(Set dst (RShiftVL src shift));
 7278   effect(TEMP dst, TEMP vtmp);
 7279   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7280   ins_encode %{
 7281     int opcode = this->ideal_Opcode();
 7282     int vlen_enc = vector_length_encoding(this);
 7283     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7284                  $vtmp$$XMMRegister);
 7285   %}
 7286   ins_pipe( pipe_slow );
 7287 %}
 7288 
 7289 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7290   predicate(n->as_ShiftV()->is_var_shift() &&
 7291             UseAVX > 2);
 7292   match(Set dst (RShiftVL src shift));
 7293   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7294   ins_encode %{
 7295     int opcode = this->ideal_Opcode();
 7296     int vlen_enc = vector_length_encoding(this);
 7297     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7298   %}
 7299   ins_pipe( pipe_slow );
 7300 %}
 7301 
 7302 // --------------------------------- AND --------------------------------------
 7303 
 7304 instruct vand(vec dst, vec src) %{
 7305   predicate(UseAVX == 0);
 7306   match(Set dst (AndV dst src));
 7307   format %{ "pand    $dst,$src\t! and vectors" %}
 7308   ins_encode %{
 7309     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7310   %}
 7311   ins_pipe( pipe_slow );
 7312 %}
 7313 
 7314 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7315   predicate(UseAVX > 0);
 7316   match(Set dst (AndV src1 src2));
 7317   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7318   ins_encode %{
 7319     int vlen_enc = vector_length_encoding(this);
 7320     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7321   %}
 7322   ins_pipe( pipe_slow );
 7323 %}
 7324 
 7325 instruct vand_mem(vec dst, vec src, memory mem) %{
 7326   predicate((UseAVX > 0) &&
 7327             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7328   match(Set dst (AndV src (LoadVector mem)));
 7329   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7330   ins_encode %{
 7331     int vlen_enc = vector_length_encoding(this);
 7332     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7333   %}
 7334   ins_pipe( pipe_slow );
 7335 %}
 7336 
 7337 // --------------------------------- OR ---------------------------------------
 7338 
 7339 instruct vor(vec dst, vec src) %{
 7340   predicate(UseAVX == 0);
 7341   match(Set dst (OrV dst src));
 7342   format %{ "por     $dst,$src\t! or vectors" %}
 7343   ins_encode %{
 7344     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7345   %}
 7346   ins_pipe( pipe_slow );
 7347 %}
 7348 
 7349 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7350   predicate(UseAVX > 0);
 7351   match(Set dst (OrV src1 src2));
 7352   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7353   ins_encode %{
 7354     int vlen_enc = vector_length_encoding(this);
 7355     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7356   %}
 7357   ins_pipe( pipe_slow );
 7358 %}
 7359 
 7360 instruct vor_mem(vec dst, vec src, memory mem) %{
 7361   predicate((UseAVX > 0) &&
 7362             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7363   match(Set dst (OrV src (LoadVector mem)));
 7364   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7365   ins_encode %{
 7366     int vlen_enc = vector_length_encoding(this);
 7367     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7368   %}
 7369   ins_pipe( pipe_slow );
 7370 %}
 7371 
 7372 // --------------------------------- XOR --------------------------------------
 7373 
 7374 instruct vxor(vec dst, vec src) %{
 7375   predicate(UseAVX == 0);
 7376   match(Set dst (XorV dst src));
 7377   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7378   ins_encode %{
 7379     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7380   %}
 7381   ins_pipe( pipe_slow );
 7382 %}
 7383 
 7384 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7385   predicate(UseAVX > 0);
 7386   match(Set dst (XorV src1 src2));
 7387   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7388   ins_encode %{
 7389     int vlen_enc = vector_length_encoding(this);
 7390     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7391   %}
 7392   ins_pipe( pipe_slow );
 7393 %}
 7394 
 7395 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7396   predicate((UseAVX > 0) &&
 7397             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7398   match(Set dst (XorV src (LoadVector mem)));
 7399   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7400   ins_encode %{
 7401     int vlen_enc = vector_length_encoding(this);
 7402     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7403   %}
 7404   ins_pipe( pipe_slow );
 7405 %}
 7406 
 7407 // --------------------------------- VectorCast --------------------------------------
 7408 
 7409 instruct vcastBtoX(vec dst, vec src) %{
 7410   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7411   match(Set dst (VectorCastB2X src));
 7412   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7413   ins_encode %{
 7414     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7415     int vlen_enc = vector_length_encoding(this);
 7416     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7417   %}
 7418   ins_pipe( pipe_slow );
 7419 %}
 7420 
 7421 instruct vcastBtoD(legVec dst, legVec src) %{
 7422   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7423   match(Set dst (VectorCastB2X src));
 7424   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7425   ins_encode %{
 7426     int vlen_enc = vector_length_encoding(this);
 7427     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7428   %}
 7429   ins_pipe( pipe_slow );
 7430 %}
 7431 
 7432 instruct castStoX(vec dst, vec src) %{
 7433   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7434             Matcher::vector_length(n->in(1)) <= 8 && // src
 7435             Matcher::vector_element_basic_type(n) == T_BYTE);
 7436   match(Set dst (VectorCastS2X src));
 7437   format %{ "vector_cast_s2x $dst,$src" %}
 7438   ins_encode %{
 7439     assert(UseAVX > 0, "required");
 7440 
 7441     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7442     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7443   %}
 7444   ins_pipe( pipe_slow );
 7445 %}
 7446 
 7447 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7448   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7449             Matcher::vector_length(n->in(1)) == 16 && // src
 7450             Matcher::vector_element_basic_type(n) == T_BYTE);
 7451   effect(TEMP dst, TEMP vtmp);
 7452   match(Set dst (VectorCastS2X src));
 7453   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7454   ins_encode %{
 7455     assert(UseAVX > 0, "required");
 7456 
 7457     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7458     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7459     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7460     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7461   %}
 7462   ins_pipe( pipe_slow );
 7463 %}
 7464 
 7465 instruct vcastStoX_evex(vec dst, vec src) %{
 7466   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7467             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7468   match(Set dst (VectorCastS2X src));
 7469   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7470   ins_encode %{
 7471     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7472     int src_vlen_enc = vector_length_encoding(this, $src);
 7473     int vlen_enc = vector_length_encoding(this);
 7474     switch (to_elem_bt) {
 7475       case T_BYTE:
 7476         if (!VM_Version::supports_avx512vl()) {
 7477           vlen_enc = Assembler::AVX_512bit;
 7478         }
 7479         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7480         break;
 7481       case T_INT:
 7482         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7483         break;
 7484       case T_FLOAT:
 7485         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7486         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7487         break;
 7488       case T_LONG:
 7489         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7490         break;
 7491       case T_DOUBLE: {
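              // short -> double in two steps: sign-extend short -> int at half the destination
              // width, then convert int -> double at the full width.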
 7492         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7493         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7494         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7495         break;
 7496       }
 7497       default:
 7498         ShouldNotReachHere();
 7499     }
 7500   %}
 7501   ins_pipe( pipe_slow );
 7502 %}
 7503 
 7504 instruct castItoX(vec dst, vec src) %{
 7505   predicate(UseAVX <= 2 &&
 7506             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7507             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7508   match(Set dst (VectorCastI2X src));
 7509   format %{ "vector_cast_i2x $dst,$src" %}
 7510   ins_encode %{
 7511     assert(UseAVX > 0, "required");
 7512 
 7513     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7514     int vlen_enc = vector_length_encoding(this, $src);
 7515 
 7516     if (to_elem_bt == T_BYTE) {
 7517       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7518       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7519       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7520     } else {
 7521       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7522       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7523       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7524     }
 7525   %}
 7526   ins_pipe( pipe_slow );
 7527 %}
 7528 
 7529 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7530   predicate(UseAVX <= 2 &&
 7531             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7532             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7533   match(Set dst (VectorCastI2X src));
 7534   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7535   effect(TEMP dst, TEMP vtmp);
 7536   ins_encode %{
 7537     assert(UseAVX > 0, "required");
 7538 
 7539     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7540     int vlen_enc = vector_length_encoding(this, $src);
 7541 
 7542     if (to_elem_bt == T_BYTE) {
 7543       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7544       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7545       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7546       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7547     } else {
 7548       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7549       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7550       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7551       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7552     }
 7553   %}
 7554   ins_pipe( pipe_slow );
 7555 %}
 7556 
 7557 instruct vcastItoX_evex(vec dst, vec src) %{
 7558   predicate(UseAVX > 2 ||
 7559             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7560   match(Set dst (VectorCastI2X src));
 7561   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7562   ins_encode %{
 7563     assert(UseAVX > 0, "required");
 7564 
 7565     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7566     int src_vlen_enc = vector_length_encoding(this, $src);
 7567     int dst_vlen_enc = vector_length_encoding(this);
 7568     switch (dst_elem_bt) {
 7569       case T_BYTE:
 7570         if (!VM_Version::supports_avx512vl()) {
 7571           src_vlen_enc = Assembler::AVX_512bit;
 7572         }
 7573         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7574         break;
 7575       case T_SHORT:
 7576         if (!VM_Version::supports_avx512vl()) {
 7577           src_vlen_enc = Assembler::AVX_512bit;
 7578         }
 7579         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7580         break;
 7581       case T_FLOAT:
 7582         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7583         break;
 7584       case T_LONG:
 7585         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7586         break;
 7587       case T_DOUBLE:
 7588         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7589         break;
 7590       default:
 7591         ShouldNotReachHere();
 7592     }
 7593   %}
 7594   ins_pipe( pipe_slow );
 7595 %}
 7596 
 7597 instruct vcastLtoBS(vec dst, vec src) %{
 7598   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7599             UseAVX <= 2);
 7600   match(Set dst (VectorCastL2X src));
 7601   format %{ "vector_cast_l2x  $dst,$src" %}
 7602   ins_encode %{
 7603     assert(UseAVX > 0, "required");
 7604 
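          // Narrow long -> int first by gathering the low dword of each long lane (shuffle/permute),
          // then narrow the ints to short or byte with the usual mask-and-pack sequence.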
 7605     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7606     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7607     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7608                                                       : ExternalAddress(vector_int_to_short_mask());
 7609     if (vlen <= 16) {
 7610       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7611       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7612       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7613     } else {
 7614       assert(vlen <= 32, "required");
 7615       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7616       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7617       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7618       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7619     }
 7620     if (to_elem_bt == T_BYTE) {
 7621       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7622     }
 7623   %}
 7624   ins_pipe( pipe_slow );
 7625 %}
 7626 
 7627 instruct vcastLtoX_evex(vec dst, vec src) %{
 7628   predicate(UseAVX > 2 ||
 7629             (Matcher::vector_element_basic_type(n) == T_INT ||
 7630              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7631              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7632   match(Set dst (VectorCastL2X src));
 7633   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7634   ins_encode %{
 7635     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7636     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7637     int vlen_enc = vector_length_encoding(this, $src);
 7638     switch (to_elem_bt) {
 7639       case T_BYTE:
 7640         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7641           vlen_enc = Assembler::AVX_512bit;
 7642         }
 7643         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7644         break;
 7645       case T_SHORT:
 7646         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7647           vlen_enc = Assembler::AVX_512bit;
 7648         }
 7649         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7650         break;
 7651       case T_INT:
 7652         if (vlen == 8) {
 7653           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7654             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7655           }
 7656         } else if (vlen == 16) {
 7657           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7658         } else if (vlen == 32) {
 7659           if (UseAVX > 2) {
 7660             if (!VM_Version::supports_avx512vl()) {
 7661               vlen_enc = Assembler::AVX_512bit;
 7662             }
 7663             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7664           } else {
 7665             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7666             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7667           }
 7668         } else { // vlen == 64
 7669           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7670         }
 7671         break;
 7672       case T_FLOAT:
 7673         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7674         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7675         break;
 7676       case T_DOUBLE:
 7677         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7678         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7679         break;
 7680 
 7681       default: assert(false, "%s", type2name(to_elem_bt));
 7682     }
 7683   %}
 7684   ins_pipe( pipe_slow );
 7685 %}
 7686 
 7687 instruct vcastFtoD_reg(vec dst, vec src) %{
 7688   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7689   match(Set dst (VectorCastF2X src));
 7690   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7691   ins_encode %{
 7692     int vlen_enc = vector_length_encoding(this);
 7693     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7694   %}
 7695   ins_pipe( pipe_slow );
 7696 %}
 7697 
 7698 
 7699 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7700   predicate(!VM_Version::supports_avx10_2() &&
 7701             !VM_Version::supports_avx512vl() &&
 7702             Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7703             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4 &&
 7704             is_integral_type(Matcher::vector_element_basic_type(n)));
 7705   match(Set dst (VectorCastF2X src));
 7706   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7707   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7708   ins_encode %{
 7709     int vlen_enc = vector_length_encoding(this, $src);
 7710     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7711     // JDK-8292878 removed the need for an explicit scratch register when loading addresses
 7712     // wider than 32 bits in register-indirect addressing mode, since stub constants are part
 7713     // of the code cache and ReservedCodeCacheSize is currently capped at 2G.
 7714     // Targets are free to raise that limit, but a code cache larger than 2G is unreasonable
 7715     // in practical scenarios. On the flip side, with the given cap we save a temporary
 7716     // register allocation, which in the limiting case can prevent spilling in blocks with
 7717     // high register pressure.
 7718     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7719                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7720                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7721   %}
 7722   ins_pipe( pipe_slow );
 7723 %}
 7724 
 7725 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7726   predicate(!VM_Version::supports_avx10_2() &&
 7727             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7728             is_integral_type(Matcher::vector_element_basic_type(n)));
 7729   match(Set dst (VectorCastF2X src));
 7730   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7731   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7732   ins_encode %{
 7733     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7734     if (to_elem_bt == T_LONG) {
 7735       int vlen_enc = vector_length_encoding(this);
 7736       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7737                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7738                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7739     } else {
 7740       int vlen_enc = vector_length_encoding(this, $src);
 7741       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7742                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7743                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7744     }
 7745   %}
 7746   ins_pipe( pipe_slow );
 7747 %}
 7748 
 7749 instruct castFtoX_reg_avx10(vec dst, vec src) %{
 7750   predicate(VM_Version::supports_avx10_2() &&
 7751             is_integral_type(Matcher::vector_element_basic_type(n)));
 7752   match(Set dst (VectorCastF2X src));
 7753   format %{ "vector_cast_f2x_avx10 $dst, $src\t!" %}
 7754   ins_encode %{
 7755     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7756     int vlen_enc = (to_elem_bt == T_LONG) ? vector_length_encoding(this) : vector_length_encoding(this, $src);
 7757     __ vector_castF2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7758   %}
 7759   ins_pipe( pipe_slow );
 7760 %}
 7761 
 7762 instruct castFtoX_mem_avx10(vec dst, memory src) %{
 7763   predicate(VM_Version::supports_avx10_2() &&
 7764             is_integral_type(Matcher::vector_element_basic_type(n)));
 7765   match(Set dst (VectorCastF2X (LoadVector src)));
 7766   format %{ "vector_cast_f2x_avx10 $dst, $src\t!" %}
 7767   ins_encode %{
 7768     int vlen = Matcher::vector_length(this);
 7769     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7770     int vlen_enc = (to_elem_bt == T_LONG) ? vector_length_encoding(this) : vector_length_encoding(vlen * sizeof(jfloat));
 7771     __ vector_castF2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$Address, vlen_enc);
 7772   %}
 7773   ins_pipe( pipe_slow );
 7774 %}
 7775 
 7776 instruct vcastDtoF_reg(vec dst, vec src) %{
 7777   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7778   match(Set dst (VectorCastD2X src));
 7779   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7780   ins_encode %{
 7781     int vlen_enc = vector_length_encoding(this, $src);
 7782     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7783   %}
 7784   ins_pipe( pipe_slow );
 7785 %}
 7786 
 7787 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7788   predicate(!VM_Version::supports_avx10_2() &&
 7789             !VM_Version::supports_avx512vl() &&
 7790             Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7791             is_integral_type(Matcher::vector_element_basic_type(n)));
 7792   match(Set dst (VectorCastD2X src));
 7793   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7794   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7795   ins_encode %{
 7796     int vlen_enc = vector_length_encoding(this, $src);
 7797     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7798     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7799                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7800                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7801   %}
 7802   ins_pipe( pipe_slow );
 7803 %}
 7804 
 7805 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7806   predicate(!VM_Version::supports_avx10_2() &&
 7807             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7808             is_integral_type(Matcher::vector_element_basic_type(n)));
 7809   match(Set dst (VectorCastD2X src));
 7810   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7811   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7812   ins_encode %{
 7813     int vlen_enc = vector_length_encoding(this, $src);
 7814     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7815     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7816                               ExternalAddress(vector_float_signflip());
 7817     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7818                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7819   %}
 7820   ins_pipe( pipe_slow );
 7821 %}
 7822 
 7823 instruct castDtoX_reg_avx10(vec dst, vec src) %{
 7824   predicate(VM_Version::supports_avx10_2() &&
 7825             is_integral_type(Matcher::vector_element_basic_type(n)));
 7826   match(Set dst (VectorCastD2X src));
 7827   format %{ "vector_cast_d2x_avx10 $dst, $src\t!" %}
 7828   ins_encode %{
 7829     int vlen_enc = vector_length_encoding(this, $src);
 7830     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7831     __ vector_castD2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7832   %}
 7833   ins_pipe( pipe_slow );
 7834 %}
 7835 
 7836 instruct castDtoX_mem_avx10(vec dst, memory src) %{
 7837   predicate(VM_Version::supports_avx10_2() &&
 7838             is_integral_type(Matcher::vector_element_basic_type(n)));
 7839   match(Set dst (VectorCastD2X (LoadVector src)));
 7840   format %{ "vector_cast_d2x_avx10 $dst, $src\t!" %}
 7841   ins_encode %{
 7842     int vlen = Matcher::vector_length(this);
 7843     int vlen_enc = vector_length_encoding(vlen * sizeof(jdouble));
 7844     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7845     __ vector_castD2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$Address, vlen_enc);
 7846   %}
 7847   ins_pipe( pipe_slow );
 7848 %}
 7849 
 7850 instruct vucast(vec dst, vec src) %{
 7851   match(Set dst (VectorUCastB2X src));
 7852   match(Set dst (VectorUCastS2X src));
 7853   match(Set dst (VectorUCastI2X src));
 7854   format %{ "vector_ucast $dst,$src\t!" %}
 7855   ins_encode %{
 7856     assert(UseAVX > 0, "required");
 7857 
 7858     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7859     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7860     int vlen_enc = vector_length_encoding(this);
 7861     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7862   %}
 7863   ins_pipe( pipe_slow );
 7864 %}
 7865 
 7866 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7867   predicate(!VM_Version::supports_avx512vl() &&
 7868             Matcher::vector_length_in_bytes(n) < 64 &&
 7869             Matcher::vector_element_basic_type(n) == T_INT);
 7870   match(Set dst (RoundVF src));
 7871   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7872   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7873   ins_encode %{
 7874     int vlen_enc = vector_length_encoding(this);
 7875     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7876     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7877                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7878                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7879   %}
 7880   ins_pipe( pipe_slow );
 7881 %}
 7882 
 7883 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7884   predicate((VM_Version::supports_avx512vl() ||
 7885              Matcher::vector_length_in_bytes(n) == 64) &&
 7886              Matcher::vector_element_basic_type(n) == T_INT);
 7887   match(Set dst (RoundVF src));
 7888   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7889   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7890   ins_encode %{
 7891     int vlen_enc = vector_length_encoding(this);
 7892     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7893     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7894                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7895                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7896   %}
 7897   ins_pipe( pipe_slow );
 7898 %}
 7899 
 7900 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7901   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7902   match(Set dst (RoundVD src));
 7903   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7904   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7905   ins_encode %{
 7906     int vlen_enc = vector_length_encoding(this);
 7907     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7908     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7909                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7910                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7911   %}
 7912   ins_pipe( pipe_slow );
 7913 %}
 7914 
 7915 // --------------------------------- VectorMaskCmp --------------------------------------
 7916 
 7917 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7918   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7919             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7920             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7921             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7922   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7923   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7924   ins_encode %{
 7925     int vlen_enc = vector_length_encoding(this, $src1);
 7926     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7927     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7928       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7929     } else {
 7930       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7931     }
 7932   %}
 7933   ins_pipe( pipe_slow );
 7934 %}
 7935 
 7936 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7937   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7938             n->bottom_type()->isa_vectmask() == nullptr &&
 7939             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7940   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7941   effect(TEMP ktmp);
 7942   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7943   ins_encode %{
 7944     int vlen_enc = Assembler::AVX_512bit;
 7945     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7946     KRegister mask = k0; // The comparison itself is not being masked.
 7947     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7948       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7949       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7950     } else {
 7951       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7952       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7953     }
 7954   %}
 7955   ins_pipe( pipe_slow );
 7956 %}
 7957 
 7958 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7959   predicate(n->bottom_type()->isa_vectmask() &&
 7960             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7961   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7962   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7963   ins_encode %{
 7964     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7965     int vlen_enc = vector_length_encoding(this, $src1);
 7966     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7967     KRegister mask = k0; // The comparison itself is not being masked.
 7968     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7969       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7970     } else {
 7971       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7972     }
 7973   %}
 7974   ins_pipe( pipe_slow );
 7975 %}
 7976 
 7977 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7978   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7979             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7980             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7981             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7982             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7983             (n->in(2)->get_int() == BoolTest::eq ||
 7984              n->in(2)->get_int() == BoolTest::lt ||
 7985              n->in(2)->get_int() == BoolTest::gt)); // cond
 7986   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7987   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7988   ins_encode %{
 7989     int vlen_enc = vector_length_encoding(this, $src1);
 7990     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7991     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7992     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7993   %}
 7994   ins_pipe( pipe_slow );
 7995 %}
 7996 
 7997 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7998   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7999             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8000             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8001             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8002             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 8003             (n->in(2)->get_int() == BoolTest::ne ||
 8004              n->in(2)->get_int() == BoolTest::le ||
 8005              n->in(2)->get_int() == BoolTest::ge)); // cond
 8006   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8007   effect(TEMP dst, TEMP xtmp);
 8008   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8009   ins_encode %{
 8010     int vlen_enc = vector_length_encoding(this, $src1);
 8011     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8012     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8013     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8014   %}
 8015   ins_pipe( pipe_slow );
 8016 %}
 8017 
 8018 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 8019   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 8020             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 8021             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 8022             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 8023             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8024   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8025   effect(TEMP dst, TEMP xtmp);
 8026   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 8027   ins_encode %{
 8028     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 8029     int vlen_enc = vector_length_encoding(this, $src1);
 8030     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8031     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 8032 
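          // AVX/AVX2 have no unsigned vector compares: flip the sign bit of both operands (XOR with
          // a broadcast sign-bit pattern) so that the signed compare below yields the unsigned ordering.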
 8033     if (vlen_enc == Assembler::AVX_128bit) {
 8034       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8035     } else {
 8036       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8037     }
 8038     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8039     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8040     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8041   %}
 8042   ins_pipe( pipe_slow );
 8043 %}
 8044 
 8045 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8046   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8047              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8048              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8049   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8050   effect(TEMP ktmp);
 8051   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8052   ins_encode %{
 8053     assert(UseAVX > 2, "required");
 8054 
 8055     int vlen_enc = vector_length_encoding(this, $src1);
 8056     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8057     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8058     KRegister mask = k0; // The comparison itself is not being masked.
 8059     bool merge = false;
 8060     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8061 
 8062     switch (src1_elem_bt) {
 8063       case T_INT: {
 8064         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8065         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8066         break;
 8067       }
 8068       case T_LONG: {
 8069         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8070         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8071         break;
 8072       }
 8073       default: assert(false, "%s", type2name(src1_elem_bt));
 8074     }
 8075   %}
 8076   ins_pipe( pipe_slow );
 8077 %}
 8078 
 8079 
 8080 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8081   predicate(n->bottom_type()->isa_vectmask() &&
 8082             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8083   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8084   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8085   ins_encode %{
 8086     assert(UseAVX > 2, "required");
 8087     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8088 
 8089     int vlen_enc = vector_length_encoding(this, $src1);
 8090     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8091     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8092     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8093 
 8094     // Compare element-wise; the result is written directly into the destination mask register (no write mask, k0).
 8095     switch (src1_elem_bt) {
 8096       case T_BYTE: {
 8097         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8098         break;
 8099       }
 8100       case T_SHORT: {
 8101         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8102         break;
 8103       }
 8104       case T_INT: {
 8105         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8106         break;
 8107       }
 8108       case T_LONG: {
 8109         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8110         break;
 8111       }
 8112       default: assert(false, "%s", type2name(src1_elem_bt));
 8113     }
 8114   %}
 8115   ins_pipe( pipe_slow );
 8116 %}
 8117 
 8118 // Extract
 8119 
 8120 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8121   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8122   match(Set dst (ExtractI src idx));
 8123   match(Set dst (ExtractS src idx));
 8124   match(Set dst (ExtractB src idx));
 8125   format %{ "extractI $dst,$src,$idx\t!" %}
 8126   ins_encode %{
 8127     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8128 
 8129     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8130     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8131   %}
 8132   ins_pipe( pipe_slow );
 8133 %}
 8134 
 8135 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8136   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8137             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8138   match(Set dst (ExtractI src idx));
 8139   match(Set dst (ExtractS src idx));
 8140   match(Set dst (ExtractB src idx));
 8141   effect(TEMP vtmp);
 8142   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8143   ins_encode %{
 8144     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8145 
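          // For 256/512-bit sources, first isolate the 128-bit lane holding the element into $vtmp,
          // then extract the element from that lane.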
 8146     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8147     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8148     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8149   %}
 8150   ins_pipe( pipe_slow );
 8151 %}
 8152 
 8153 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8154   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8155   match(Set dst (ExtractL src idx));
 8156   format %{ "extractL $dst,$src,$idx\t!" %}
 8157   ins_encode %{
 8158     assert(UseSSE >= 4, "required");
 8159     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8160 
 8161     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8162   %}
 8163   ins_pipe( pipe_slow );
 8164 %}
 8165 
 8166 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8167   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8168             Matcher::vector_length(n->in(1)) == 8);  // src
 8169   match(Set dst (ExtractL src idx));
 8170   effect(TEMP vtmp);
 8171   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8172   ins_encode %{
 8173     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8174 
 8175     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8176     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8177   %}
 8178   ins_pipe( pipe_slow );
 8179 %}
 8180 
 8181 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8182   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8183   match(Set dst (ExtractF src idx));
 8184   effect(TEMP dst, TEMP vtmp);
 8185   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8186   ins_encode %{
 8187     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8188 
 8189     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8190   %}
 8191   ins_pipe( pipe_slow );
 8192 %}
 8193 
 8194 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8195   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8196             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8197   match(Set dst (ExtractF src idx));
 8198   effect(TEMP vtmp);
 8199   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8200   ins_encode %{
 8201     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8202 
 8203     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8204     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8205   %}
 8206   ins_pipe( pipe_slow );
 8207 %}
 8208 
 8209 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8210   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8211   match(Set dst (ExtractD src idx));
 8212   format %{ "extractD $dst,$src,$idx\t!" %}
 8213   ins_encode %{
 8214     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8215 
 8216     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8217   %}
 8218   ins_pipe( pipe_slow );
 8219 %}
 8220 
 8221 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8222   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8223             Matcher::vector_length(n->in(1)) == 8);  // src
 8224   match(Set dst (ExtractD src idx));
 8225   effect(TEMP vtmp);
 8226   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8227   ins_encode %{
 8228     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8229 
 8230     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8231     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8232   %}
 8233   ins_pipe( pipe_slow );
 8234 %}
 8235 
 8236 // --------------------------------- Vector Blend --------------------------------------
 8237 
 8238 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8239   predicate(UseAVX == 0);
 8240   match(Set dst (VectorBlend (Binary dst src) mask));
 8241   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8242   effect(TEMP tmp);
 8243   ins_encode %{
 8244     assert(UseSSE >= 4, "required");
 8245 
 8246     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8247       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8248     }
 8249     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8250   %}
 8251   ins_pipe( pipe_slow );
 8252 %}
 8253 
 8254 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8255   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8256             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8257             Matcher::vector_length_in_bytes(n) <= 32 &&
 8258             is_integral_type(Matcher::vector_element_basic_type(n)));
 8259   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8260   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8261   ins_encode %{
 8262     int vlen_enc = vector_length_encoding(this);
 8263     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8264   %}
 8265   ins_pipe( pipe_slow );
 8266 %}
 8267 
 8268 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8269   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8270             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8271             Matcher::vector_length_in_bytes(n) <= 32 &&
 8272             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8273   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8274   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8275   ins_encode %{
 8276     int vlen_enc = vector_length_encoding(this);
 8277     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8278   %}
 8279   ins_pipe( pipe_slow );
 8280 %}
 8281 
 8282 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8283   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8284             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8285             Matcher::vector_length_in_bytes(n) <= 32);
 8286   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8287   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8288   effect(TEMP vtmp, TEMP dst);
 8289   ins_encode %{
 8290     int vlen_enc = vector_length_encoding(this);
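          // With EnableX86ECoreOpts, avoid vpblendvb and synthesize the blend as
          // dst = ($src2 & $mask) | ($src1 & ~$mask).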
 8291     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8292     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8293     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8294   %}
 8295   ins_pipe( pipe_slow );
 8296 %}
 8297 
 8298 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8299   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8300             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8301   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8302   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8303   effect(TEMP ktmp);
 8304   ins_encode %{
 8305     int vlen_enc = Assembler::AVX_512bit;
 8306     BasicType elem_bt = Matcher::vector_element_basic_type(this);
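          // Convert the vector mask to a k-register by comparing each element against all-ones,
          // then perform the blend under that k-mask.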
 8307     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8308     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8309   %}
 8310   ins_pipe( pipe_slow );
 8311 %}
 8312 
 8313 
 8314 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8315   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8316             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8317              VM_Version::supports_avx512bw()));
 8318   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8319   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8320   ins_encode %{
 8321     int vlen_enc = vector_length_encoding(this);
 8322     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8323     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8324   %}
 8325   ins_pipe( pipe_slow );
 8326 %}
 8327 
 8328 // --------------------------------- ABS --------------------------------------
 8329 // a = |a|
 8330 instruct vabsB_reg(vec dst, vec src) %{
 8331   match(Set dst (AbsVB  src));
 8332   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8333   ins_encode %{
 8334     uint vlen = Matcher::vector_length(this);
 8335     if (vlen <= 16) {
 8336       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8337     } else {
 8338       int vlen_enc = vector_length_encoding(this);
 8339       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8340     }
 8341   %}
 8342   ins_pipe( pipe_slow );
 8343 %}
 8344 
 8345 instruct vabsS_reg(vec dst, vec src) %{
 8346   match(Set dst (AbsVS  src));
 8347   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8348   ins_encode %{
 8349     uint vlen = Matcher::vector_length(this);
 8350     if (vlen <= 8) {
 8351       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8352     } else {
 8353       int vlen_enc = vector_length_encoding(this);
 8354       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8355     }
 8356   %}
 8357   ins_pipe( pipe_slow );
 8358 %}
 8359 
 8360 instruct vabsI_reg(vec dst, vec src) %{
 8361   match(Set dst (AbsVI  src));
 8362   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8363   ins_encode %{
 8364     uint vlen = Matcher::vector_length(this);
 8365     if (vlen <= 4) {
 8366       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8367     } else {
 8368       int vlen_enc = vector_length_encoding(this);
 8369       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8370     }
 8371   %}
 8372   ins_pipe( pipe_slow );
 8373 %}
 8374 
 8375 instruct vabsL_reg(vec dst, vec src) %{
 8376   match(Set dst (AbsVL  src));
 8377   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8378   ins_encode %{
 8379     assert(UseAVX > 2, "required");
 8380     int vlen_enc = vector_length_encoding(this);
 8381     if (!VM_Version::supports_avx512vl()) {
 8382       vlen_enc = Assembler::AVX_512bit;
 8383     }
 8384     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8385   %}
 8386   ins_pipe( pipe_slow );
 8387 %}
 8388 
 8389 // --------------------------------- ABSNEG --------------------------------------
 8390 
 8391 instruct vabsnegF(vec dst, vec src) %{
 8392   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8393   match(Set dst (AbsVF src));
 8394   match(Set dst (NegVF src));
 8395   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8396   ins_cost(150);
 8397   ins_encode %{
 8398     int opcode = this->ideal_Opcode();
 8399     int vlen = Matcher::vector_length(this);
 8400     if (vlen == 2) {
 8401       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8402     } else {
 8403       assert(vlen == 8 || vlen == 16, "required");
 8404       int vlen_enc = vector_length_encoding(this);
 8405       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8406     }
 8407   %}
 8408   ins_pipe( pipe_slow );
 8409 %}
 8410 
 8411 instruct vabsneg4F(vec dst) %{
 8412   predicate(Matcher::vector_length(n) == 4);
 8413   match(Set dst (AbsVF dst));
 8414   match(Set dst (NegVF dst));
 8415   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8416   ins_cost(150);
 8417   ins_encode %{
 8418     int opcode = this->ideal_Opcode();
 8419     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8420   %}
 8421   ins_pipe( pipe_slow );
 8422 %}
 8423 
 8424 instruct vabsnegD(vec dst, vec src) %{
 8425   match(Set dst (AbsVD  src));
 8426   match(Set dst (NegVD  src));
 8427   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8428   ins_encode %{
 8429     int opcode = this->ideal_Opcode();
 8430     uint vlen = Matcher::vector_length(this);
 8431     if (vlen == 2) {
 8432       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8433     } else {
 8434       int vlen_enc = vector_length_encoding(this);
 8435       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8436     }
 8437   %}
 8438   ins_pipe( pipe_slow );
 8439 %}
 8440 
 8441 //------------------------------------- VectorTest --------------------------------------------
 8442 
 8443 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8444   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8445   match(Set cr (VectorTest src1 src2));
 8446   effect(TEMP vtmp);
 8447   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8448   ins_encode %{
 8449     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8450     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8451     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8452   %}
 8453   ins_pipe( pipe_slow );
 8454 %}
 8455 
 8456 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8457   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8458   match(Set cr (VectorTest src1 src2));
 8459   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8460   ins_encode %{
 8461     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8462     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8463     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8464   %}
 8465   ins_pipe( pipe_slow );
 8466 %}
 8467 
 8468 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8469   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8470              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8471             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8472   match(Set cr (VectorTest src1 src2));
 8473   effect(TEMP tmp);
 8474   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8475   ins_encode %{
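          // All-true check for masks of at most 8 lanes: keep only the masklen live bits and
          // compare them against the all-ones pattern.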
 8476     uint masklen = Matcher::vector_length(this, $src1);
 8477     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8478     __ andl($tmp$$Register, (1 << masklen) - 1);
 8479     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8480   %}
 8481   ins_pipe( pipe_slow );
 8482 %}
 8483 
 8484 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8485   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8486              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8487             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8488   match(Set cr (VectorTest src1 src2));
 8489   effect(TEMP tmp);
 8490   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8491   ins_encode %{
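          // Any-true check: the andl with the live-lane pattern clears ZF iff at least one mask bit is set.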
 8492     uint masklen = Matcher::vector_length(this, $src1);
 8493     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8494     __ andl($tmp$$Register, (1 << masklen) - 1);
 8495   %}
 8496   ins_pipe( pipe_slow );
 8497 %}
 8498 
 8499 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8500   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8501             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8502   match(Set cr (VectorTest src1 src2));
 8503   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8504   ins_encode %{
 8505     uint masklen = Matcher::vector_length(this, $src1);
 8506     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8507   %}
 8508   ins_pipe( pipe_slow );
 8509 %}
 8510 
 8511 //------------------------------------- LoadMask --------------------------------------------
 8512 
 8513 instruct loadMask(legVec dst, legVec src) %{
 8514   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8515   match(Set dst (VectorLoadMask src));
 8516   effect(TEMP dst);
 8517   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8518   ins_encode %{
 8519     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8520     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8521     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8522   %}
 8523   ins_pipe( pipe_slow );
 8524 %}
 8525 
 8526 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8527   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8528   match(Set dst (VectorLoadMask src));
 8529   effect(TEMP xtmp);
 8530   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8531   ins_encode %{
 8532     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8533                         true, Assembler::AVX_512bit);
 8534   %}
 8535   ins_pipe( pipe_slow );
 8536 %}
 8537 
 8538 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8539   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8540   match(Set dst (VectorLoadMask src));
 8541   effect(TEMP xtmp);
 8542   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8543   ins_encode %{
 8544     int vlen_enc = vector_length_encoding(in(1));
 8545     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8546                         false, vlen_enc);
 8547   %}
 8548   ins_pipe( pipe_slow );
 8549 %}
 8550 
 8551 //------------------------------------- StoreMask --------------------------------------------
 8552 
 8553 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8554   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8555   match(Set dst (VectorStoreMask src size));
 8556   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8557   ins_encode %{
 8558     int vlen = Matcher::vector_length(this);
 8559     if (vlen <= 16 && UseAVX <= 2) {
 8560       assert(UseSSE >= 3, "required");
 8561       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8562     } else {
 8563       assert(UseAVX > 0, "required");
 8564       int src_vlen_enc = vector_length_encoding(this, $src);
 8565       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8566     }
 8567   %}
 8568   ins_pipe( pipe_slow );
 8569 %}
 8570 
 8571 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8572   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8573   match(Set dst (VectorStoreMask src size));
 8574   effect(TEMP_DEF dst, TEMP xtmp);
 8575   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8576   ins_encode %{
 8577     int vlen_enc = Assembler::AVX_128bit;
 8578     int vlen = Matcher::vector_length(this);
 8579     if (vlen <= 8) {
 8580       assert(UseSSE >= 3, "required");
 8581       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8582       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8583       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8584     } else {
 8585       assert(UseAVX > 0, "required");
 8586       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8587       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8588       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8589     }
 8590   %}
 8591   ins_pipe( pipe_slow );
 8592 %}
 8593 
 8594 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8595   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8596   match(Set dst (VectorStoreMask src size));
 8597   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8598   effect(TEMP_DEF dst, TEMP xtmp);
 8599   ins_encode %{
 8600     int vlen_enc = Assembler::AVX_128bit;
 8601     int vlen = Matcher::vector_length(this);
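          // Convert each int mask lane (-1/0) into a single 1/0 byte via absolute value plus
          // dword -> word -> byte packing.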
 8602     if (vlen <= 4) {
 8603       assert(UseSSE >= 3, "required");
 8604       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8605       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8606       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8607       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8608     } else {
 8609       assert(UseAVX > 0, "required");
 8610       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8611       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8612       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8613       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8614       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8615     }
 8616   %}
 8617   ins_pipe( pipe_slow );
 8618 %}
 8619 
 8620 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8621   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8622   match(Set dst (VectorStoreMask src size));
 8623   effect(TEMP_DEF dst, TEMP xtmp);
 8624   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8625   ins_encode %{
 8626     assert(UseSSE >= 3, "required");
 8627     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8628     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8629     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8630     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8631     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8632   %}
 8633   ins_pipe( pipe_slow );
 8634 %}
 8635 
 8636 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8637   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8638   match(Set dst (VectorStoreMask src size));
 8639   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8640   effect(TEMP_DEF dst, TEMP vtmp);
 8641   ins_encode %{
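    // Gather the low dword of each 64-bit lane into the lower 128 bits, then narrow
    // to bytes and take the absolute value to produce the 0/1 mask bytes.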
 8642     int vlen_enc = Assembler::AVX_128bit;
 8643     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8644     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8645     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8646     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8647     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8648     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8649     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8650   %}
 8651   ins_pipe( pipe_slow );
 8652 %}
 8653 
 8654 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8655   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8656   match(Set dst (VectorStoreMask src size));
 8657   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8658   ins_encode %{
 8659     int src_vlen_enc = vector_length_encoding(this, $src);
 8660     int dst_vlen_enc = vector_length_encoding(this);
 8661     if (!VM_Version::supports_avx512vl()) {
 8662       src_vlen_enc = Assembler::AVX_512bit;
 8663     }
 8664     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8665     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8666   %}
 8667   ins_pipe( pipe_slow );
 8668 %}
 8669 
 8670 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8671   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8672   match(Set dst (VectorStoreMask src size));
 8673   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8674   ins_encode %{
 8675     int src_vlen_enc = vector_length_encoding(this, $src);
 8676     int dst_vlen_enc = vector_length_encoding(this);
 8677     if (!VM_Version::supports_avx512vl()) {
 8678       src_vlen_enc = Assembler::AVX_512bit;
 8679     }
 8680     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8681     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8682   %}
 8683   ins_pipe( pipe_slow );
 8684 %}
 8685 
 8686 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8687   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8688   match(Set dst (VectorStoreMask mask size));
 8689   effect(TEMP_DEF dst);
 8690   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8691   ins_encode %{
 8692     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
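    // Expand the k-mask into an int vector (set lanes take the in-memory constant,
    // cleared lanes become zero), then narrow each int lane to a mask byte.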
 8693     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8694                  false, Assembler::AVX_512bit, noreg);
 8695     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8696   %}
 8697   ins_pipe( pipe_slow );
 8698 %}
 8699 
 8700 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8701   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8702   match(Set dst (VectorStoreMask mask size));
 8703   effect(TEMP_DEF dst);
 8704   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8705   ins_encode %{
 8706     int dst_vlen_enc = vector_length_encoding(this);
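    // evpmovm2b materializes 0/-1 bytes from the k-mask; the absolute value maps -1 to 1.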
 8707     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8708     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8709   %}
 8710   ins_pipe( pipe_slow );
 8711 %}
 8712 
 8713 instruct vmaskcast_evex(kReg dst) %{
 8714   match(Set dst (VectorMaskCast dst));
 8715   ins_cost(0);
 8716   format %{ "vector_mask_cast $dst" %}
 8717   ins_encode %{
 8718     // empty
 8719   %}
 8720   ins_pipe(empty);
 8721 %}
 8722 
 8723 instruct vmaskcast(vec dst) %{
 8724   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8725   match(Set dst (VectorMaskCast dst));
 8726   ins_cost(0);
 8727   format %{ "vector_mask_cast $dst" %}
 8728   ins_encode %{
 8729     // empty
 8730   %}
 8731   ins_pipe(empty);
 8732 %}
 8733 
 8734 instruct vmaskcast_avx(vec dst, vec src) %{
 8735   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8736   match(Set dst (VectorMaskCast src));
 8737   format %{ "vector_mask_cast $dst, $src" %}
 8738   ins_encode %{
 8739     int vlen = Matcher::vector_length(this);
 8740     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8741     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8742     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8743   %}
 8744   ins_pipe(pipe_slow);
 8745 %}
 8746 
 8747 //-------------------------------- Load Iota Indices ----------------------------------
 8748 
 8749 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8750   match(Set dst (VectorLoadConst src));
 8751   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8752   ins_encode %{
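     // Load the per-element index constant [0, 1, 2, ...] for the vector's element type.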
 8753      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8754      BasicType bt = Matcher::vector_element_basic_type(this);
 8755      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8756   %}
 8757   ins_pipe( pipe_slow );
 8758 %}
 8759 
 8760 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8761   match(Set dst (PopulateIndex src1 src2));
 8762   effect(TEMP dst, TEMP vtmp);
 8763   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8764   ins_encode %{
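     // PopulateIndex with a unit stride: broadcast the start value, load the iota
     // constant [0, 1, 2, ...] and add the two element-wise.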
 8765      assert($src2$$constant == 1, "required");
 8766      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8767      int vlen_enc = vector_length_encoding(this);
 8768      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8769      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8770      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8771      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8772   %}
 8773   ins_pipe( pipe_slow );
 8774 %}
 8775 
 8776 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8777   match(Set dst (PopulateIndex src1 src2));
 8778   effect(TEMP dst, TEMP vtmp);
 8779   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8780   ins_encode %{
 8781      assert($src2$$constant == 1, "required");
 8782      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8783      int vlen_enc = vector_length_encoding(this);
 8784      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8785      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8786      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8787      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8788   %}
 8789   ins_pipe( pipe_slow );
 8790 %}
 8791 
 8792 //-------------------------------- Rearrange ----------------------------------
 8793 
 8794 // LoadShuffle/Rearrange for Byte
 8795 instruct rearrangeB(vec dst, vec shuffle) %{
 8796   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8797             Matcher::vector_length(n) < 32);
 8798   match(Set dst (VectorRearrange dst shuffle));
 8799   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8800   ins_encode %{
 8801     assert(UseSSE >= 4, "required");
 8802     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8803   %}
 8804   ins_pipe( pipe_slow );
 8805 %}
 8806 
 8807 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8808   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8809             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8810   match(Set dst (VectorRearrange src shuffle));
 8811   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8812   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8813   ins_encode %{
 8814     assert(UseAVX >= 2, "required");
 8815     // Swap src into vtmp1
 8816     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 8818     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 8820     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of shuffle entries that come from the other lane
 8822     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8823     // Perform the blend
 8824     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8825   %}
 8826   ins_pipe( pipe_slow );
 8827 %}
 8828 
 8829 
 8830 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8831   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8832             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8833   match(Set dst (VectorRearrange src shuffle));
 8834   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8835   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8836   ins_encode %{
 8837     int vlen_enc = vector_length_encoding(this);
 8838     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8839                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8840                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8841   %}
 8842   ins_pipe( pipe_slow );
 8843 %}
 8844 
 8845 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8846   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8847             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8848   match(Set dst (VectorRearrange src shuffle));
 8849   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8850   ins_encode %{
 8851     int vlen_enc = vector_length_encoding(this);
 8852     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8853   %}
 8854   ins_pipe( pipe_slow );
 8855 %}
 8856 
 8857 // LoadShuffle/Rearrange for Short
 8858 
 8859 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8860   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8861             !VM_Version::supports_avx512bw());
 8862   match(Set dst (VectorLoadShuffle src));
 8863   effect(TEMP dst, TEMP vtmp);
 8864   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8865   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms
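    // e.g. a short shuffle index of 3 becomes the byte index pair (6, 7)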
 8868     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8869     if (UseAVX == 0) {
 8870       assert(vlen_in_bytes <= 16, "required");
 8871       // Multiply each shuffle by two to get byte index
 8872       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8873       __ psllw($vtmp$$XMMRegister, 1);
 8874 
 8875       // Duplicate to create 2 copies of byte index
 8876       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8877       __ psllw($dst$$XMMRegister, 8);
 8878       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8879 
 8880       // Add one to get alternate byte index
 8881       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8882       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8883     } else {
 8884       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8885       int vlen_enc = vector_length_encoding(this);
 8886       // Multiply each shuffle by two to get byte index
 8887       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8888 
 8889       // Duplicate to create 2 copies of byte index
 8890       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8891       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8892 
 8893       // Add one to get alternate byte index
 8894       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8895     }
 8896   %}
 8897   ins_pipe( pipe_slow );
 8898 %}
 8899 
 8900 instruct rearrangeS(vec dst, vec shuffle) %{
 8901   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8902             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8903   match(Set dst (VectorRearrange dst shuffle));
 8904   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8905   ins_encode %{
 8906     assert(UseSSE >= 4, "required");
 8907     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8908   %}
 8909   ins_pipe( pipe_slow );
 8910 %}
 8911 
 8912 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8913   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8914             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8915   match(Set dst (VectorRearrange src shuffle));
 8916   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8917   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8918   ins_encode %{
 8919     assert(UseAVX >= 2, "required");
 8920     // Swap src into vtmp1
 8921     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
 8923     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
 8925     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit of shuffle entries that come from the other lane
 8927     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8928     // Perform the blend
 8929     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8930   %}
 8931   ins_pipe( pipe_slow );
 8932 %}
 8933 
 8934 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8935   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8936             VM_Version::supports_avx512bw());
 8937   match(Set dst (VectorRearrange src shuffle));
 8938   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8939   ins_encode %{
 8940     int vlen_enc = vector_length_encoding(this);
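    // Without AVX512VL, vpermw can only be encoded at 512-bit vector length.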
 8941     if (!VM_Version::supports_avx512vl()) {
 8942       vlen_enc = Assembler::AVX_512bit;
 8943     }
 8944     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8945   %}
 8946   ins_pipe( pipe_slow );
 8947 %}
 8948 
 8949 // LoadShuffle/Rearrange for Integer and Float
 8950 
 8951 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8952   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8953             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8954   match(Set dst (VectorLoadShuffle src));
 8955   effect(TEMP dst, TEMP vtmp);
 8956   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8957   ins_encode %{
 8958     assert(UseSSE >= 4, "required");
 8959 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms
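    // e.g. an int shuffle index of 2 becomes the byte indices (8, 9, 10, 11)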
 8962 
 8963     // Duplicate and multiply each shuffle by 4
 8964     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8965     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8966     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8967     __ psllw($vtmp$$XMMRegister, 2);
 8968 
 8969     // Duplicate again to create 4 copies of byte index
 8970     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8971     __ psllw($dst$$XMMRegister, 8);
 8972     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8973 
 8974     // Add 3,2,1,0 to get alternate byte index
 8975     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8976     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8977   %}
 8978   ins_pipe( pipe_slow );
 8979 %}
 8980 
 8981 instruct rearrangeI(vec dst, vec shuffle) %{
 8982   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8983             UseAVX == 0);
 8984   match(Set dst (VectorRearrange dst shuffle));
 8985   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8986   ins_encode %{
 8987     assert(UseSSE >= 4, "required");
 8988     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8989   %}
 8990   ins_pipe( pipe_slow );
 8991 %}
 8992 
 8993 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8994   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8995             UseAVX > 0);
 8996   match(Set dst (VectorRearrange src shuffle));
 8997   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8998   ins_encode %{
 8999     int vlen_enc = vector_length_encoding(this);
 9000     BasicType bt = Matcher::vector_element_basic_type(this);
 9001     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9002   %}
 9003   ins_pipe( pipe_slow );
 9004 %}
 9005 
 9006 // LoadShuffle/Rearrange for Long and Double
 9007 
 9008 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 9009   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9010             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9011   match(Set dst (VectorLoadShuffle src));
 9012   effect(TEMP dst, TEMP vtmp);
 9013   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 9014   ins_encode %{
 9015     assert(UseAVX >= 2, "required");
 9016 
 9017     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms
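    // e.g. a long shuffle index of 1 becomes the double word index pair (2, 3)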
 9020 
 9021     // Multiply each shuffle by two to get double word index
 9022     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 9023 
 9024     // Duplicate each double word shuffle
 9025     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 9026     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 9027 
 9028     // Add one to get alternate double word index
 9029     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 9030   %}
 9031   ins_pipe( pipe_slow );
 9032 %}
 9033 
 9034 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9035   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9036             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9037   match(Set dst (VectorRearrange src shuffle));
 9038   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9039   ins_encode %{
 9040     assert(UseAVX >= 2, "required");
 9041 
 9042     int vlen_enc = vector_length_encoding(this);
 9043     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9044   %}
 9045   ins_pipe( pipe_slow );
 9046 %}
 9047 
 9048 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9049   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9050             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9051   match(Set dst (VectorRearrange src shuffle));
 9052   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9053   ins_encode %{
 9054     assert(UseAVX > 2, "required");
 9055 
 9056     int vlen_enc = vector_length_encoding(this);
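    // The variable-index vpermq has no 128-bit form; use the 256-bit encoding for
    // 2-element vectors.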
 9057     if (vlen_enc == Assembler::AVX_128bit) {
 9058       vlen_enc = Assembler::AVX_256bit;
 9059     }
 9060     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9061   %}
 9062   ins_pipe( pipe_slow );
 9063 %}
 9064 
 9065 // --------------------------------- FMA --------------------------------------
 9066 // a * b + c
 9067 
 9068 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9069   match(Set c (FmaVF  c (Binary a b)));
 9070   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9071   ins_cost(150);
 9072   ins_encode %{
 9073     assert(UseFMA, "not enabled");
 9074     int vlen_enc = vector_length_encoding(this);
 9075     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9076   %}
 9077   ins_pipe( pipe_slow );
 9078 %}
 9079 
 9080 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9081   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9082   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9083   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9084   ins_cost(150);
 9085   ins_encode %{
 9086     assert(UseFMA, "not enabled");
 9087     int vlen_enc = vector_length_encoding(this);
 9088     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9089   %}
 9090   ins_pipe( pipe_slow );
 9091 %}
 9092 
 9093 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9094   match(Set c (FmaVD  c (Binary a b)));
 9095   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9096   ins_cost(150);
 9097   ins_encode %{
 9098     assert(UseFMA, "not enabled");
 9099     int vlen_enc = vector_length_encoding(this);
 9100     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9101   %}
 9102   ins_pipe( pipe_slow );
 9103 %}
 9104 
 9105 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9106   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9107   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9108   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9109   ins_cost(150);
 9110   ins_encode %{
 9111     assert(UseFMA, "not enabled");
 9112     int vlen_enc = vector_length_encoding(this);
 9113     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9114   %}
 9115   ins_pipe( pipe_slow );
 9116 %}
 9117 
 9118 // --------------------------------- Vector Multiply Add --------------------------------------
 9119 
 9120 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9121   predicate(UseAVX == 0);
 9122   match(Set dst (MulAddVS2VI dst src1));
 9123   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9124   ins_encode %{
 9125     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9126   %}
 9127   ins_pipe( pipe_slow );
 9128 %}
 9129 
 9130 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9131   predicate(UseAVX > 0);
 9132   match(Set dst (MulAddVS2VI src1 src2));
 9133   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9134   ins_encode %{
 9135     int vlen_enc = vector_length_encoding(this);
 9136     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9137   %}
 9138   ins_pipe( pipe_slow );
 9139 %}
 9140 
 9141 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9142 
 9143 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9144   predicate(VM_Version::supports_avx512_vnni());
 9145   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9146   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9147   ins_encode %{
 9148     assert(UseAVX > 2, "required");
 9149     int vlen_enc = vector_length_encoding(this);
 9150     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9151   %}
 9152   ins_pipe( pipe_slow );
 9153   ins_cost(10);
 9154 %}
 9155 
 9156 // --------------------------------- PopCount --------------------------------------
 9157 
 9158 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9159   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9160   match(Set dst (PopCountVI src));
 9161   match(Set dst (PopCountVL src));
 9162   format %{ "vector_popcount_integral $dst, $src" %}
 9163   ins_encode %{
 9164     int opcode = this->ideal_Opcode();
 9165     int vlen_enc = vector_length_encoding(this, $src);
 9166     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9167     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9168   %}
 9169   ins_pipe( pipe_slow );
 9170 %}
 9171 
 9172 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9173   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9174   match(Set dst (PopCountVI src mask));
 9175   match(Set dst (PopCountVL src mask));
 9176   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9177   ins_encode %{
 9178     int vlen_enc = vector_length_encoding(this, $src);
 9179     BasicType bt = Matcher::vector_element_basic_type(this, $src);
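    // Pre-load dst with src so that lanes cleared in $mask keep the source value
    // (merge masking).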
 9180     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9181     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9182   %}
 9183   ins_pipe( pipe_slow );
 9184 %}
 9185 
 9186 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9187   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9188   match(Set dst (PopCountVI src));
 9189   match(Set dst (PopCountVL src));
 9190   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9191   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9192   ins_encode %{
 9193     int opcode = this->ideal_Opcode();
 9194     int vlen_enc = vector_length_encoding(this, $src);
 9195     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9196     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9197                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9198   %}
 9199   ins_pipe( pipe_slow );
 9200 %}
 9201 
 9202 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9203 
 9204 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9205   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9206                                               Matcher::vector_length_in_bytes(n->in(1))));
 9207   match(Set dst (CountTrailingZerosV src));
 9208   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9209   ins_cost(400);
 9210   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 9211   ins_encode %{
 9212     int vlen_enc = vector_length_encoding(this, $src);
 9213     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9214     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9215                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9216   %}
 9217   ins_pipe( pipe_slow );
 9218 %}
 9219 
 9220 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9221   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9222             VM_Version::supports_avx512cd() &&
 9223             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9224   match(Set dst (CountTrailingZerosV src));
 9225   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9226   ins_cost(400);
 9227   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9228   ins_encode %{
 9229     int vlen_enc = vector_length_encoding(this, $src);
 9230     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9231     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9232                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9233   %}
 9234   ins_pipe( pipe_slow );
 9235 %}
 9236 
 9237 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9238   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9239   match(Set dst (CountTrailingZerosV src));
 9240   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9241   ins_cost(400);
 9242   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9243   ins_encode %{
 9244     int vlen_enc = vector_length_encoding(this, $src);
 9245     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9246     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9247                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9248                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9249   %}
 9250   ins_pipe( pipe_slow );
 9251 %}
 9252 
 9253 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9254   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9255   match(Set dst (CountTrailingZerosV src));
 9256   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9257   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9258   ins_encode %{
 9259     int vlen_enc = vector_length_encoding(this, $src);
 9260     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9261     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9262                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9263   %}
 9264   ins_pipe( pipe_slow );
 9265 %}
 9266 
 9267 
 9268 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9269 
 9270 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9271   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9272   effect(TEMP dst);
 9273   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9274   ins_encode %{
 9275     int vector_len = vector_length_encoding(this);
 9276     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9277   %}
 9278   ins_pipe( pipe_slow );
 9279 %}
 9280 
 9281 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9282   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9283   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9284   effect(TEMP dst);
 9285   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9286   ins_encode %{
 9287     int vector_len = vector_length_encoding(this);
 9288     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9289   %}
 9290   ins_pipe( pipe_slow );
 9291 %}
 9292 
 9293 // --------------------------------- Rotation Operations ----------------------------------
 9294 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9295   match(Set dst (RotateLeftV src shift));
 9296   match(Set dst (RotateRightV src shift));
 9297   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9298   ins_encode %{
 9299     int opcode      = this->ideal_Opcode();
 9300     int vector_len  = vector_length_encoding(this);
 9301     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9302     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9303   %}
 9304   ins_pipe( pipe_slow );
 9305 %}
 9306 
 9307 instruct vprorate(vec dst, vec src, vec shift) %{
 9308   match(Set dst (RotateLeftV src shift));
 9309   match(Set dst (RotateRightV src shift));
 9310   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9311   ins_encode %{
 9312     int opcode      = this->ideal_Opcode();
 9313     int vector_len  = vector_length_encoding(this);
 9314     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9315     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9316   %}
 9317   ins_pipe( pipe_slow );
 9318 %}
 9319 
 9320 // ---------------------------------- Masked Operations ------------------------------------
 9321 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9322   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9323   match(Set dst (LoadVectorMasked mem mask));
 9324   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9325   ins_encode %{
 9326     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9327     int vlen_enc = vector_length_encoding(this);
 9328     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9329   %}
 9330   ins_pipe( pipe_slow );
 9331 %}
 9332 
 9333 
 9334 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9335   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9336   match(Set dst (LoadVectorMasked mem mask));
 9337   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9338   ins_encode %{
 9339     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9340     int vector_len = vector_length_encoding(this);
 9341     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9342   %}
 9343   ins_pipe( pipe_slow );
 9344 %}
 9345 
 9346 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9347   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9348   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9349   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9350   ins_encode %{
 9351     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9352     int vlen_enc = vector_length_encoding(src_node);
 9353     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9354     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9355   %}
 9356   ins_pipe( pipe_slow );
 9357 %}
 9358 
 9359 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9360   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9361   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9362   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9363   ins_encode %{
 9364     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9365     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9366     int vlen_enc = vector_length_encoding(src_node);
 9367     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9368   %}
 9369   ins_pipe( pipe_slow );
 9370 %}
 9371 
 9372 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9373   match(Set addr (VerifyVectorAlignment addr mask));
 9374   effect(KILL cr);
 9375   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9376   ins_encode %{
 9377     Label Lskip;
 9378     // check if masked bits of addr are zero
 9379     __ testq($addr$$Register, $mask$$constant);
 9380     __ jccb(Assembler::equal, Lskip);
 9381     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9382     __ bind(Lskip);
 9383   %}
 9384   ins_pipe(pipe_slow);
 9385 %}
 9386 
 9387 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9388   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9389   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9390   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9391   ins_encode %{
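    // Compare src1 and src2 under $mask: return -1 if every active lane matches,
    // otherwise the index of the first lane that fails the masked equality test
    // (the first mismatch when $mask is a prefix mask).
    //   ktmp2 = ~mask, ktmp1 = (src1 == src2) under mask;
    //   if (ktmp1 | ktmp2) is all ones, all active lanes matched (carry set),
    //   otherwise tzcnt(~ktmp1) gives the first failing lane.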
 9392     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9393     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9394 
 9395     Label DONE;
 9396     int vlen_enc = vector_length_encoding(this, $src1);
 9397     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9398 
 9399     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9400     __ mov64($dst$$Register, -1L);
 9401     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9402     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9403     __ jccb(Assembler::carrySet, DONE);
 9404     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9405     __ notq($dst$$Register);
 9406     __ tzcntq($dst$$Register, $dst$$Register);
 9407     __ bind(DONE);
 9408   %}
 9409   ins_pipe( pipe_slow );
 9410 %}
 9411 
 9412 
 9413 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9414   match(Set dst (VectorMaskGen len));
 9415   effect(TEMP temp, KILL cr);
 9416   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9417   ins_encode %{
 9418     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9419   %}
 9420   ins_pipe( pipe_slow );
 9421 %}
 9422 
 9423 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9424   match(Set dst (VectorMaskGen len));
 9425   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9426   effect(TEMP temp);
 9427   ins_encode %{
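    // Materialize a mask with the low $len bits set by shifting a word of all ones
    // right by (64 - len), then move it into the k-register.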
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9429     __ kmovql($dst$$KRegister, $temp$$Register);
 9430   %}
 9431   ins_pipe( pipe_slow );
 9432 %}
 9433 
 9434 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9435   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9436   match(Set dst (VectorMaskToLong mask));
 9437   effect(TEMP dst, KILL cr);
 9438   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9439   ins_encode %{
 9440     int opcode = this->ideal_Opcode();
 9441     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9442     int mask_len = Matcher::vector_length(this, $mask);
 9443     int mask_size = mask_len * type2aelembytes(mbt);
 9444     int vlen_enc = vector_length_encoding(this, $mask);
 9445     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9446                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9447   %}
 9448   ins_pipe( pipe_slow );
 9449 %}
 9450 
 9451 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9452   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9453   match(Set dst (VectorMaskToLong mask));
 9454   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9455   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9456   ins_encode %{
 9457     int opcode = this->ideal_Opcode();
 9458     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9459     int mask_len = Matcher::vector_length(this, $mask);
 9460     int vlen_enc = vector_length_encoding(this, $mask);
 9461     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9462                              $dst$$Register, mask_len, mbt, vlen_enc);
 9463   %}
 9464   ins_pipe( pipe_slow );
 9465 %}
 9466 
 9467 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9468   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9469   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9470   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9471   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9472   ins_encode %{
 9473     int opcode = this->ideal_Opcode();
 9474     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9475     int mask_len = Matcher::vector_length(this, $mask);
 9476     int vlen_enc = vector_length_encoding(this, $mask);
 9477     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9478                              $dst$$Register, mask_len, mbt, vlen_enc);
 9479   %}
 9480   ins_pipe( pipe_slow );
 9481 %}
 9482 
 9483 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9484   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9485   match(Set dst (VectorMaskTrueCount mask));
 9486   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9487   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9488   ins_encode %{
 9489     int opcode = this->ideal_Opcode();
 9490     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9491     int mask_len = Matcher::vector_length(this, $mask);
 9492     int mask_size = mask_len * type2aelembytes(mbt);
 9493     int vlen_enc = vector_length_encoding(this, $mask);
 9494     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9495                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9496   %}
 9497   ins_pipe( pipe_slow );
 9498 %}
 9499 
 9500 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9501   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9502   match(Set dst (VectorMaskTrueCount mask));
 9503   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9504   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9505   ins_encode %{
 9506     int opcode = this->ideal_Opcode();
 9507     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9508     int mask_len = Matcher::vector_length(this, $mask);
 9509     int vlen_enc = vector_length_encoding(this, $mask);
 9510     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9511                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9512   %}
 9513   ins_pipe( pipe_slow );
 9514 %}
 9515 
 9516 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9517   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9518   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9519   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9520   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9521   ins_encode %{
 9522     int opcode = this->ideal_Opcode();
 9523     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9524     int mask_len = Matcher::vector_length(this, $mask);
 9525     int vlen_enc = vector_length_encoding(this, $mask);
 9526     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9527                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9528   %}
 9529   ins_pipe( pipe_slow );
 9530 %}
 9531 
 9532 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9533   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9534   match(Set dst (VectorMaskFirstTrue mask));
 9535   match(Set dst (VectorMaskLastTrue mask));
 9536   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9537   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9538   ins_encode %{
 9539     int opcode = this->ideal_Opcode();
 9540     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9541     int mask_len = Matcher::vector_length(this, $mask);
 9542     int mask_size = mask_len * type2aelembytes(mbt);
 9543     int vlen_enc = vector_length_encoding(this, $mask);
 9544     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9545                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9546   %}
 9547   ins_pipe( pipe_slow );
 9548 %}
 9549 
 9550 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9551   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9552   match(Set dst (VectorMaskFirstTrue mask));
 9553   match(Set dst (VectorMaskLastTrue mask));
 9554   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9555   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9556   ins_encode %{
 9557     int opcode = this->ideal_Opcode();
 9558     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9559     int mask_len = Matcher::vector_length(this, $mask);
 9560     int vlen_enc = vector_length_encoding(this, $mask);
 9561     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9562                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9563   %}
 9564   ins_pipe( pipe_slow );
 9565 %}
 9566 
 9567 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9568   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9569   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9570   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9571   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9572   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9573   ins_encode %{
 9574     int opcode = this->ideal_Opcode();
 9575     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9576     int mask_len = Matcher::vector_length(this, $mask);
 9577     int vlen_enc = vector_length_encoding(this, $mask);
 9578     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9579                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9580   %}
 9581   ins_pipe( pipe_slow );
 9582 %}
 9583 
 9584 // --------------------------------- Compress/Expand Operations ---------------------------
 9585 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9586   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9587   match(Set dst (CompressV src mask));
 9588   match(Set dst (ExpandV src mask));
 9589   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9590   format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9591   ins_encode %{
 9592     int opcode = this->ideal_Opcode();
 9593     int vlen_enc = vector_length_encoding(this);
 9594     BasicType bt  = Matcher::vector_element_basic_type(this);
 9595     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9596                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9597   %}
 9598   ins_pipe( pipe_slow );
 9599 %}
 9600 
 9601 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9602   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9603   match(Set dst (CompressV src mask));
 9604   match(Set dst (ExpandV src mask));
 9605   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9606   ins_encode %{
 9607     int opcode = this->ideal_Opcode();
 9608     int vector_len = vector_length_encoding(this);
 9609     BasicType bt  = Matcher::vector_element_basic_type(this);
 9610     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9611   %}
 9612   ins_pipe( pipe_slow );
 9613 %}
 9614 
 9615 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9616   match(Set dst (CompressM mask));
 9617   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9618   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9619   ins_encode %{
 9620     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9621     int mask_len = Matcher::vector_length(this);
 9622     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9623   %}
 9624   ins_pipe( pipe_slow );
 9625 %}
 9626 
 9627 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9628 
 9629 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9630   predicate(!VM_Version::supports_gfni());
 9631   match(Set dst (ReverseV src));
 9632   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9633   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9634   ins_encode %{
 9635     int vec_enc = vector_length_encoding(this);
 9636     BasicType bt = Matcher::vector_element_basic_type(this);
 9637     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9638                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9639   %}
 9640   ins_pipe( pipe_slow );
 9641 %}
 9642 
 9643 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9644   predicate(VM_Version::supports_gfni());
 9645   match(Set dst (ReverseV src));
 9646   effect(TEMP dst, TEMP xtmp);
 9647   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9648   ins_encode %{
 9649     int vec_enc = vector_length_encoding(this);
 9650     BasicType bt  = Matcher::vector_element_basic_type(this);
 9651     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9652     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9653                                $xtmp$$XMMRegister);
 9654   %}
 9655   ins_pipe( pipe_slow );
 9656 %}
 9657 
 9658 instruct vreverse_byte_reg(vec dst, vec src) %{
 9659   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9660   match(Set dst (ReverseBytesV src));
 9661   effect(TEMP dst);
 9662   format %{ "vector_reverse_byte $dst, $src" %}
 9663   ins_encode %{
 9664     int vec_enc = vector_length_encoding(this);
 9665     BasicType bt = Matcher::vector_element_basic_type(this);
 9666     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9667   %}
 9668   ins_pipe( pipe_slow );
 9669 %}
 9670 
 9671 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9672   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9673   match(Set dst (ReverseBytesV src));
 9674   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9675   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9676   ins_encode %{
 9677     int vec_enc = vector_length_encoding(this);
 9678     BasicType bt = Matcher::vector_element_basic_type(this);
 9679     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9680                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9681   %}
 9682   ins_pipe( pipe_slow );
 9683 %}
 9684 
 9685 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9686 
 9687 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9688   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9689                                               Matcher::vector_length_in_bytes(n->in(1))));
 9690   match(Set dst (CountLeadingZerosV src));
 9691   format %{ "vector_count_leading_zeros $dst, $src" %}
 9692   ins_encode %{
 9693      int vlen_enc = vector_length_encoding(this, $src);
 9694      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9695      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9696                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9697   %}
 9698   ins_pipe( pipe_slow );
 9699 %}
 9700 
 9701 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9702   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9703                                               Matcher::vector_length_in_bytes(n->in(1))));
 9704   match(Set dst (CountLeadingZerosV src mask));
 9705   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9706   ins_encode %{
 9707     int vlen_enc = vector_length_encoding(this, $src);
 9708     BasicType bt = Matcher::vector_element_basic_type(this, $src);
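    // As with the masked popcount above, pre-loading dst gives merge semantics for
    // lanes cleared in $mask.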
 9709     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9710     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9711                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9712   %}
 9713   ins_pipe( pipe_slow );
 9714 %}
 9715 
 9716 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9717   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9718             VM_Version::supports_avx512cd() &&
 9719             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9720   match(Set dst (CountLeadingZerosV src));
 9721   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9722   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9723   ins_encode %{
 9724     int vlen_enc = vector_length_encoding(this, $src);
 9725     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9726     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9727                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9728   %}
 9729   ins_pipe( pipe_slow );
 9730 %}
 9731 
 9732 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9733   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9734   match(Set dst (CountLeadingZerosV src));
 9735   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9736   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9737   ins_encode %{
 9738     int vlen_enc = vector_length_encoding(this, $src);
 9739     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9740     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9741                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9742                                        $rtmp$$Register, true, vlen_enc);
 9743   %}
 9744   ins_pipe( pipe_slow );
 9745 %}
 9746 
 9747 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9748   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9749             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9750   match(Set dst (CountLeadingZerosV src));
 9751   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9752   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9753   ins_encode %{
 9754     int vlen_enc = vector_length_encoding(this, $src);
 9755     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9756     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9757                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9758   %}
 9759   ins_pipe( pipe_slow );
 9760 %}
 9761 
 9762 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9763   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9764             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9765   match(Set dst (CountLeadingZerosV src));
 9766   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9767   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9768   ins_encode %{
 9769     int vlen_enc = vector_length_encoding(this, $src);
 9770     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9771     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9772                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9773   %}
 9774   ins_pipe( pipe_slow );
 9775 %}
 9776 
 9777 // ---------------------------------- Vector Masked Operations ------------------------------------
 9778 
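      // Note: the masked arithmetic and logic rules below share one shape: they match
      // (OpV (Binary dst src2) mask), are destructive on $dst, and pass the ideal opcode to the
      // evmasked_op() macro-assembler helper. The trailing 'true' requests merge masking, i.e.
      // lanes whose mask bit is clear keep the previous contents of $dst. The *_mem variants only
      // differ in folding a LoadVector of the second operand into the instruction's memory form.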
 9779 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9780   match(Set dst (AddVB (Binary dst src2) mask));
 9781   match(Set dst (AddVS (Binary dst src2) mask));
 9782   match(Set dst (AddVI (Binary dst src2) mask));
 9783   match(Set dst (AddVL (Binary dst src2) mask));
 9784   match(Set dst (AddVF (Binary dst src2) mask));
 9785   match(Set dst (AddVD (Binary dst src2) mask));
 9786   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9787   ins_encode %{
 9788     int vlen_enc = vector_length_encoding(this);
 9789     BasicType bt = Matcher::vector_element_basic_type(this);
 9790     int opc = this->ideal_Opcode();
 9791     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9792                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9793   %}
 9794   ins_pipe( pipe_slow );
 9795 %}
 9796 
 9797 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9798   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9799   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9800   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9801   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9802   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9803   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9804   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9805   ins_encode %{
 9806     int vlen_enc = vector_length_encoding(this);
 9807     BasicType bt = Matcher::vector_element_basic_type(this);
 9808     int opc = this->ideal_Opcode();
 9809     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9810                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9811   %}
 9812   ins_pipe( pipe_slow );
 9813 %}
 9814 
 9815 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9816   match(Set dst (XorV (Binary dst src2) mask));
 9817   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9818   ins_encode %{
 9819     int vlen_enc = vector_length_encoding(this);
 9820     BasicType bt = Matcher::vector_element_basic_type(this);
 9821     int opc = this->ideal_Opcode();
 9822     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9823                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9824   %}
 9825   ins_pipe( pipe_slow );
 9826 %}
 9827 
 9828 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9829   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9830   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9831   ins_encode %{
 9832     int vlen_enc = vector_length_encoding(this);
 9833     BasicType bt = Matcher::vector_element_basic_type(this);
 9834     int opc = this->ideal_Opcode();
 9835     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9836                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9837   %}
 9838   ins_pipe( pipe_slow );
 9839 %}
 9840 
 9841 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9842   match(Set dst (OrV (Binary dst src2) mask));
 9843   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9844   ins_encode %{
 9845     int vlen_enc = vector_length_encoding(this);
 9846     BasicType bt = Matcher::vector_element_basic_type(this);
 9847     int opc = this->ideal_Opcode();
 9848     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9849                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9850   %}
 9851   ins_pipe( pipe_slow );
 9852 %}
 9853 
 9854 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9855   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9856   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9857   ins_encode %{
 9858     int vlen_enc = vector_length_encoding(this);
 9859     BasicType bt = Matcher::vector_element_basic_type(this);
 9860     int opc = this->ideal_Opcode();
 9861     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9862                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9863   %}
 9864   ins_pipe( pipe_slow );
 9865 %}
 9866 
 9867 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9868   match(Set dst (AndV (Binary dst src2) mask));
 9869   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9870   ins_encode %{
 9871     int vlen_enc = vector_length_encoding(this);
 9872     BasicType bt = Matcher::vector_element_basic_type(this);
 9873     int opc = this->ideal_Opcode();
 9874     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9875                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9876   %}
 9877   ins_pipe( pipe_slow );
 9878 %}
 9879 
 9880 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9881   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9882   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9883   ins_encode %{
 9884     int vlen_enc = vector_length_encoding(this);
 9885     BasicType bt = Matcher::vector_element_basic_type(this);
 9886     int opc = this->ideal_Opcode();
 9887     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9888                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9889   %}
 9890   ins_pipe( pipe_slow );
 9891 %}
 9892 
 9893 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9894   match(Set dst (SubVB (Binary dst src2) mask));
 9895   match(Set dst (SubVS (Binary dst src2) mask));
 9896   match(Set dst (SubVI (Binary dst src2) mask));
 9897   match(Set dst (SubVL (Binary dst src2) mask));
 9898   match(Set dst (SubVF (Binary dst src2) mask));
 9899   match(Set dst (SubVD (Binary dst src2) mask));
 9900   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9901   ins_encode %{
 9902     int vlen_enc = vector_length_encoding(this);
 9903     BasicType bt = Matcher::vector_element_basic_type(this);
 9904     int opc = this->ideal_Opcode();
 9905     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9906                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9907   %}
 9908   ins_pipe( pipe_slow );
 9909 %}
 9910 
 9911 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9912   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9913   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9914   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9915   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9916   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9917   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9918   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9919   ins_encode %{
 9920     int vlen_enc = vector_length_encoding(this);
 9921     BasicType bt = Matcher::vector_element_basic_type(this);
 9922     int opc = this->ideal_Opcode();
 9923     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9924                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9925   %}
 9926   ins_pipe( pipe_slow );
 9927 %}
 9928 
 9929 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9930   match(Set dst (MulVS (Binary dst src2) mask));
 9931   match(Set dst (MulVI (Binary dst src2) mask));
 9932   match(Set dst (MulVL (Binary dst src2) mask));
 9933   match(Set dst (MulVF (Binary dst src2) mask));
 9934   match(Set dst (MulVD (Binary dst src2) mask));
 9935   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9936   ins_encode %{
 9937     int vlen_enc = vector_length_encoding(this);
 9938     BasicType bt = Matcher::vector_element_basic_type(this);
 9939     int opc = this->ideal_Opcode();
 9940     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9941                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9942   %}
 9943   ins_pipe( pipe_slow );
 9944 %}
 9945 
 9946 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9947   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9948   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9949   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9950   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9951   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9952   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9953   ins_encode %{
 9954     int vlen_enc = vector_length_encoding(this);
 9955     BasicType bt = Matcher::vector_element_basic_type(this);
 9956     int opc = this->ideal_Opcode();
 9957     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9958                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9959   %}
 9960   ins_pipe( pipe_slow );
 9961 %}
 9962 
 9963 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9964   match(Set dst (SqrtVF dst mask));
 9965   match(Set dst (SqrtVD dst mask));
 9966   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9967   ins_encode %{
 9968     int vlen_enc = vector_length_encoding(this);
 9969     BasicType bt = Matcher::vector_element_basic_type(this);
 9970     int opc = this->ideal_Opcode();
 9971     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9972                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9973   %}
 9974   ins_pipe( pipe_slow );
 9975 %}
 9976 
 9977 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9978   match(Set dst (DivVF (Binary dst src2) mask));
 9979   match(Set dst (DivVD (Binary dst src2) mask));
 9980   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9981   ins_encode %{
 9982     int vlen_enc = vector_length_encoding(this);
 9983     BasicType bt = Matcher::vector_element_basic_type(this);
 9984     int opc = this->ideal_Opcode();
 9985     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9986                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9987   %}
 9988   ins_pipe( pipe_slow );
 9989 %}
 9990 
 9991 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9992   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9993   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9994   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9995   ins_encode %{
 9996     int vlen_enc = vector_length_encoding(this);
 9997     BasicType bt = Matcher::vector_element_basic_type(this);
 9998     int opc = this->ideal_Opcode();
 9999     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10000                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10001   %}
10002   ins_pipe( pipe_slow );
10003 %}
10004 
10005 
10006 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
10007   match(Set dst (RotateLeftV (Binary dst shift) mask));
10008   match(Set dst (RotateRightV (Binary dst shift) mask));
10009   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
10010   ins_encode %{
10011     int vlen_enc = vector_length_encoding(this);
10012     BasicType bt = Matcher::vector_element_basic_type(this);
10013     int opc = this->ideal_Opcode();
10014     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10015                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10016   %}
10017   ins_pipe( pipe_slow );
10018 %}
10019 
10020 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
10021   match(Set dst (RotateLeftV (Binary dst src2) mask));
10022   match(Set dst (RotateRightV (Binary dst src2) mask));
10023   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
10024   ins_encode %{
10025     int vlen_enc = vector_length_encoding(this);
10026     BasicType bt = Matcher::vector_element_basic_type(this);
10027     int opc = this->ideal_Opcode();
10028     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10029                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10030   %}
10031   ins_pipe( pipe_slow );
10032 %}
10033 
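      // Note: for the masked shift rules below, is_var_shift() separates shifts by a single
      // broadcast count from per-element (variable) shifts; the extra final boolean handed to
      // evmasked_op() forwards that distinction to the macro assembler. The *_imm_masked forms
      // pass the shift amount as an immediate constant instead.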
10034 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10035   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10036   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10037   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10038   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10039   ins_encode %{
10040     int vlen_enc = vector_length_encoding(this);
10041     BasicType bt = Matcher::vector_element_basic_type(this);
10042     int opc = this->ideal_Opcode();
10043     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10044                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10045   %}
10046   ins_pipe( pipe_slow );
10047 %}
10048 
10049 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10050   predicate(!n->as_ShiftV()->is_var_shift());
10051   match(Set dst (LShiftVS (Binary dst src2) mask));
10052   match(Set dst (LShiftVI (Binary dst src2) mask));
10053   match(Set dst (LShiftVL (Binary dst src2) mask));
10054   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10055   ins_encode %{
10056     int vlen_enc = vector_length_encoding(this);
10057     BasicType bt = Matcher::vector_element_basic_type(this);
10058     int opc = this->ideal_Opcode();
10059     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10060                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10061   %}
10062   ins_pipe( pipe_slow );
10063 %}
10064 
10065 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10066   predicate(n->as_ShiftV()->is_var_shift());
10067   match(Set dst (LShiftVS (Binary dst src2) mask));
10068   match(Set dst (LShiftVI (Binary dst src2) mask));
10069   match(Set dst (LShiftVL (Binary dst src2) mask));
10070   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10071   ins_encode %{
10072     int vlen_enc = vector_length_encoding(this);
10073     BasicType bt = Matcher::vector_element_basic_type(this);
10074     int opc = this->ideal_Opcode();
10075     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10076                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10077   %}
10078   ins_pipe( pipe_slow );
10079 %}
10080 
10081 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10082   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10083   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10084   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10085   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10086   ins_encode %{
10087     int vlen_enc = vector_length_encoding(this);
10088     BasicType bt = Matcher::vector_element_basic_type(this);
10089     int opc = this->ideal_Opcode();
10090     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10091                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10092   %}
10093   ins_pipe( pipe_slow );
10094 %}
10095 
10096 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10097   predicate(!n->as_ShiftV()->is_var_shift());
10098   match(Set dst (RShiftVS (Binary dst src2) mask));
10099   match(Set dst (RShiftVI (Binary dst src2) mask));
10100   match(Set dst (RShiftVL (Binary dst src2) mask));
10101   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10102   ins_encode %{
10103     int vlen_enc = vector_length_encoding(this);
10104     BasicType bt = Matcher::vector_element_basic_type(this);
10105     int opc = this->ideal_Opcode();
10106     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10107                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10108   %}
10109   ins_pipe( pipe_slow );
10110 %}
10111 
10112 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10113   predicate(n->as_ShiftV()->is_var_shift());
10114   match(Set dst (RShiftVS (Binary dst src2) mask));
10115   match(Set dst (RShiftVI (Binary dst src2) mask));
10116   match(Set dst (RShiftVL (Binary dst src2) mask));
10117   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10118   ins_encode %{
10119     int vlen_enc = vector_length_encoding(this);
10120     BasicType bt = Matcher::vector_element_basic_type(this);
10121     int opc = this->ideal_Opcode();
10122     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10123                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
10128 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10129   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10130   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10131   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10132   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10133   ins_encode %{
10134     int vlen_enc = vector_length_encoding(this);
10135     BasicType bt = Matcher::vector_element_basic_type(this);
10136     int opc = this->ideal_Opcode();
10137     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10138                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10139   %}
10140   ins_pipe( pipe_slow );
10141 %}
10142 
10143 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10144   predicate(!n->as_ShiftV()->is_var_shift());
10145   match(Set dst (URShiftVS (Binary dst src2) mask));
10146   match(Set dst (URShiftVI (Binary dst src2) mask));
10147   match(Set dst (URShiftVL (Binary dst src2) mask));
10148   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10149   ins_encode %{
10150     int vlen_enc = vector_length_encoding(this);
10151     BasicType bt = Matcher::vector_element_basic_type(this);
10152     int opc = this->ideal_Opcode();
10153     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10154                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10155   %}
10156   ins_pipe( pipe_slow );
10157 %}
10158 
10159 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10160   predicate(n->as_ShiftV()->is_var_shift());
10161   match(Set dst (URShiftVS (Binary dst src2) mask));
10162   match(Set dst (URShiftVI (Binary dst src2) mask));
10163   match(Set dst (URShiftVL (Binary dst src2) mask));
10164   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10165   ins_encode %{
10166     int vlen_enc = vector_length_encoding(this);
10167     BasicType bt = Matcher::vector_element_basic_type(this);
10168     int opc = this->ideal_Opcode();
10169     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10170                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10171   %}
10172   ins_pipe( pipe_slow );
10173 %}
10174 
10175 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10176   match(Set dst (MaxV (Binary dst src2) mask));
10177   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10178   ins_encode %{
10179     int vlen_enc = vector_length_encoding(this);
10180     BasicType bt = Matcher::vector_element_basic_type(this);
10181     int opc = this->ideal_Opcode();
10182     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10183                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10184   %}
10185   ins_pipe( pipe_slow );
10186 %}
10187 
10188 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10189   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10190   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10191   ins_encode %{
10192     int vlen_enc = vector_length_encoding(this);
10193     BasicType bt = Matcher::vector_element_basic_type(this);
10194     int opc = this->ideal_Opcode();
10195     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10196                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10197   %}
10198   ins_pipe( pipe_slow );
10199 %}
10200 
10201 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10202   match(Set dst (MinV (Binary dst src2) mask));
10203   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10204   ins_encode %{
10205     int vlen_enc = vector_length_encoding(this);
10206     BasicType bt = Matcher::vector_element_basic_type(this);
10207     int opc = this->ideal_Opcode();
10208     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10209                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10210   %}
10211   ins_pipe( pipe_slow );
10212 %}
10213 
10214 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10215   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10216   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10217   ins_encode %{
10218     int vlen_enc = vector_length_encoding(this);
10219     BasicType bt = Matcher::vector_element_basic_type(this);
10220     int opc = this->ideal_Opcode();
10221     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10222                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10223   %}
10224   ins_pipe( pipe_slow );
10225 %}
10226 
10227 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10228   match(Set dst (VectorRearrange (Binary dst src2) mask));
10229   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10230   ins_encode %{
10231     int vlen_enc = vector_length_encoding(this);
10232     BasicType bt = Matcher::vector_element_basic_type(this);
10233     int opc = this->ideal_Opcode();
10234     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10235                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10236   %}
10237   ins_pipe( pipe_slow );
10238 %}
10239 
10240 instruct vabs_masked(vec dst, kReg mask) %{
10241   match(Set dst (AbsVB dst mask));
10242   match(Set dst (AbsVS dst mask));
10243   match(Set dst (AbsVI dst mask));
10244   match(Set dst (AbsVL dst mask));
10245   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10246   ins_encode %{
10247     int vlen_enc = vector_length_encoding(this);
10248     BasicType bt = Matcher::vector_element_basic_type(this);
10249     int opc = this->ideal_Opcode();
10250     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10251                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10252   %}
10253   ins_pipe( pipe_slow );
10254 %}
10255 
10256 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10257   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10258   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10259   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10260   ins_encode %{
10261     assert(UseFMA, "FMA instruction support is required.");
10262     int vlen_enc = vector_length_encoding(this);
10263     BasicType bt = Matcher::vector_element_basic_type(this);
10264     int opc = this->ideal_Opcode();
10265     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10266                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10267   %}
10268   ins_pipe( pipe_slow );
10269 %}
10270 
10271 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10272   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10273   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10274   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10275   ins_encode %{
10276     assert(UseFMA, "FMA instruction support is required.");
10277     int vlen_enc = vector_length_encoding(this);
10278     BasicType bt = Matcher::vector_element_basic_type(this);
10279     int opc = this->ideal_Opcode();
10280     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10281                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10282   %}
10283   ins_pipe( pipe_slow );
10284 %}
10285 
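      // Note: in the masked compare rule below the incoming mask predicates the comparison
      // itself: only lanes whose mask bit is set are compared and the remaining bits of the
      // destination k-register are cleared. Integer comparisons choose a signed or unsigned
      // predicate via is_unsigned_booltest_pred(), while T_FLOAT/T_DOUBLE use the separate FP
      // comparison-predicate encoding.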
10286 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10287   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10288   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10289   ins_encode %{
10290     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10291     int vlen_enc = vector_length_encoding(this, $src1);
10292     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10293 
10294     // Dispatch on the element type of src1 to select the matching masked compare instruction.
10295     switch (src1_elem_bt) {
10296       case T_BYTE: {
10297         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10298         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10299         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10300         break;
10301       }
10302       case T_SHORT: {
10303         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10304         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10305         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10306         break;
10307       }
10308       case T_INT: {
10309         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10310         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10311         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10312         break;
10313       }
10314       case T_LONG: {
10315         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10316         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10317         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10318         break;
10319       }
10320       case T_FLOAT: {
10321         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10322         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10323         break;
10324       }
10325       case T_DOUBLE: {
10326         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10327         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10328         break;
10329       }
10330       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10331     }
10332   %}
10333   ins_pipe( pipe_slow );
10334 %}
10335 
10336 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10337   predicate(Matcher::vector_length(n) <= 32);
10338   match(Set dst (MaskAll src));
10339   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10340   ins_encode %{
10341     int mask_len = Matcher::vector_length(this);
10342     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10343   %}
10344   ins_pipe( pipe_slow );
10345 %}
10346 
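      // Note: the two rules below implement mask negation. (MaskAll cnt) with cnt == -1 yields an
      // all-ones mask, so XorVMask with it flips every bit of $src. knot() selects the KNOT form
      // matching the mask length; the sub-8-lane variant needs the extra k-register and GPR
      // temporaries, presumably to keep the unused upper mask bits cleared.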
10347 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10348   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10349   match(Set dst (XorVMask src (MaskAll cnt)));
10350   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10351   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10352   ins_encode %{
10353     uint masklen = Matcher::vector_length(this);
10354     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10355   %}
10356   ins_pipe( pipe_slow );
10357 %}
10358 
10359 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10360   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10361             (Matcher::vector_length(n) == 16) ||
10362             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10363   match(Set dst (XorVMask src (MaskAll cnt)));
10364   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10365   ins_encode %{
10366     uint masklen = Matcher::vector_length(this);
10367     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10368   %}
10369   ins_pipe( pipe_slow );
10370 %}
10371 
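      // Note: VectorLongToMask materializes a mask from the bits of a long. When the mask is a
      // genuine predicate type (bottom_type()->isa_vectmask()), a single kmov from the GPR
      // suffices; otherwise the bits are expanded into a boolean vector through the
      // vector_long_to_maskvec() helper, using GPR (and, above 8 lanes, XMM) temporaries.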
10372 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10373   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10374   match(Set dst (VectorLongToMask src));
10375   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10376   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10377   ins_encode %{
10378     int mask_len = Matcher::vector_length(this);
10379     int vec_enc  = vector_length_encoding(mask_len);
10380     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10381                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10382   %}
10383   ins_pipe( pipe_slow );
10384 %}
10385 
10386 
10387 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10388   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10389   match(Set dst (VectorLongToMask src));
10390   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10391   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10392   ins_encode %{
10393     int mask_len = Matcher::vector_length(this);
10394     assert(mask_len <= 32, "invalid mask length");
10395     int vec_enc  = vector_length_encoding(mask_len);
10396     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10397                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10398   %}
10399   ins_pipe( pipe_slow );
10400 %}
10401 
10402 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10403   predicate(n->bottom_type()->isa_vectmask());
10404   match(Set dst (VectorLongToMask src));
10405   format %{ "long_to_mask_evex $dst, $src\t!" %}
10406   ins_encode %{
10407     __ kmov($dst$$KRegister, $src$$Register);
10408   %}
10409   ins_pipe( pipe_slow );
10410 %}
10411 
10412 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10413   match(Set dst (AndVMask src1 src2));
10414   match(Set dst (OrVMask src1 src2));
10415   match(Set dst (XorVMask src1 src2));
10416   effect(TEMP kscratch);
10417   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10418   ins_encode %{
10419     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10420     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10421     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10422     uint masklen = Matcher::vector_length(this);
10423     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10424     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10425   %}
10426   ins_pipe( pipe_slow );
10427 %}
10428 
10429 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10430   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10431   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10432   ins_encode %{
10433     int vlen_enc = vector_length_encoding(this);
10434     BasicType bt = Matcher::vector_element_basic_type(this);
10435     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10436                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10437   %}
10438   ins_pipe( pipe_slow );
10439 %}
10440 
10441 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10442   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10443   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10444   ins_encode %{
10445     int vlen_enc = vector_length_encoding(this);
10446     BasicType bt = Matcher::vector_element_basic_type(this);
10447     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10448                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10449   %}
10450   ins_pipe( pipe_slow );
10451 %}
10452 
10453 instruct castMM(kReg dst)
10454 %{
10455   match(Set dst (CastVV dst));
10456 
10457   size(0);
10458   format %{ "# castVV of $dst" %}
10459   ins_encode(/* empty encoding */);
10460   ins_cost(0);
10461   ins_pipe(empty);
10462 %}
10463 
10464 instruct castVV(vec dst)
10465 %{
10466   match(Set dst (CastVV dst));
10467 
10468   size(0);
10469   format %{ "# castVV of $dst" %}
10470   ins_encode(/* empty encoding */);
10471   ins_cost(0);
10472   ins_pipe(empty);
10473 %}
10474 
10475 instruct castVVLeg(legVec dst)
10476 %{
10477   match(Set dst (CastVV dst));
10478 
10479   size(0);
10480   format %{ "# castVV of $dst" %}
10481   ins_encode(/* empty encoding */);
10482   ins_cost(0);
10483   ins_pipe(empty);
10484 %}
10485 
10486 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10487 %{
10488   match(Set dst (IsInfiniteF src));
10489   effect(TEMP ktmp, KILL cr);
10490   format %{ "float_class_check $dst, $src" %}
10491   ins_encode %{
10492     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10493     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10494   %}
10495   ins_pipe(pipe_slow);
10496 %}
10497 
10498 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10499 %{
10500   match(Set dst (IsInfiniteD src));
10501   effect(TEMP ktmp, KILL cr);
10502   format %{ "double_class_check $dst, $src" %}
10503   ins_encode %{
10504     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10505     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10506   %}
10507   ins_pipe(pipe_slow);
10508 %}
10509 
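      // Note: the saturating add/subtract rules below are split by element width. Subword (byte
      // and short) element types map onto the native saturating SIMD instructions through the
      // vector_saturating_op() helper, with a flag selecting the signed or unsigned flavour. Int
      // and long have no saturating instructions on x86, so the EVEX/AVX rules emulate the
      // overflow/underflow detection with XMM temporaries (and k-register temporaries on the
      // EVEX path).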
10510 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10511 %{
10512   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10513             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10514   match(Set dst (SaturatingAddV src1 src2));
10515   match(Set dst (SaturatingSubV src1 src2));
10516   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10517   ins_encode %{
10518     int vlen_enc = vector_length_encoding(this);
10519     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10520     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10521                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10522   %}
10523   ins_pipe(pipe_slow);
10524 %}
10525 
10526 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10527 %{
10528   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10529             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10530   match(Set dst (SaturatingAddV src1 src2));
10531   match(Set dst (SaturatingSubV src1 src2));
10532   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10533   ins_encode %{
10534     int vlen_enc = vector_length_encoding(this);
10535     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10536     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10537                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10538   %}
10539   ins_pipe(pipe_slow);
10540 %}
10541 
10542 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10543 %{
10544   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10545             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10546             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10547   match(Set dst (SaturatingAddV src1 src2));
10548   match(Set dst (SaturatingSubV src1 src2));
10549   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10550   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10551   ins_encode %{
10552     int vlen_enc = vector_length_encoding(this);
10553     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10554     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10555                                         $src1$$XMMRegister, $src2$$XMMRegister,
10556                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10557                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10558   %}
10559   ins_pipe(pipe_slow);
10560 %}
10561 
10562 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10563 %{
10564   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10565             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10566             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10567   match(Set dst (SaturatingAddV src1 src2));
10568   match(Set dst (SaturatingSubV src1 src2));
10569   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10570   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10571   ins_encode %{
10572     int vlen_enc = vector_length_encoding(this);
10573     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10574     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10575                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10576                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10577   %}
10578   ins_pipe(pipe_slow);
10579 %}
10580 
10581 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10582 %{
10583   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10584             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10585             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10586   match(Set dst (SaturatingAddV src1 src2));
10587   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10588   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10589   ins_encode %{
10590     int vlen_enc = vector_length_encoding(this);
10591     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10592     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10593                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10594   %}
10595   ins_pipe(pipe_slow);
10596 %}
10597 
10598 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10599 %{
10600   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10601             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10602             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10603   match(Set dst (SaturatingAddV src1 src2));
10604   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10605   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10606   ins_encode %{
10607     int vlen_enc = vector_length_encoding(this);
10608     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10609     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10610                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10611   %}
10612   ins_pipe(pipe_slow);
10613 %}
10614 
10615 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10616 %{
10617   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10618             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10619             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10620   match(Set dst (SaturatingSubV src1 src2));
10621   effect(TEMP ktmp);
10622   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10623   ins_encode %{
10624     int vlen_enc = vector_length_encoding(this);
10625     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10626     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10627                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10628   %}
10629   ins_pipe(pipe_slow);
10630 %}
10631 
10632 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10633 %{
10634   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10635             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10636             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10637   match(Set dst (SaturatingSubV src1 src2));
10638   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10639   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10640   ins_encode %{
10641     int vlen_enc = vector_length_encoding(this);
10642     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10643     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10644                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10645   %}
10646   ins_pipe(pipe_slow);
10647 %}
10648 
10649 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10650 %{
10651   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10652             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10653   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10654   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10655   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10656   ins_encode %{
10657     int vlen_enc = vector_length_encoding(this);
10658     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10659     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10660                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10661   %}
10662   ins_pipe(pipe_slow);
10663 %}
10664 
10665 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10666 %{
10667   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10668             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10669   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10670   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10671   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10672   ins_encode %{
10673     int vlen_enc = vector_length_encoding(this);
10674     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10675     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10676                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10677   %}
10678   ins_pipe(pipe_slow);
10679 %}
10680 
10681 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10682   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10683             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10684   match(Set dst (SaturatingAddV (Binary dst src) mask));
10685   match(Set dst (SaturatingSubV (Binary dst src) mask));
10686   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10687   ins_encode %{
10688     int vlen_enc = vector_length_encoding(this);
10689     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10690     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10691                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10692   %}
10693   ins_pipe( pipe_slow );
10694 %}
10695 
10696 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10697   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10698             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10699   match(Set dst (SaturatingAddV (Binary dst src) mask));
10700   match(Set dst (SaturatingSubV (Binary dst src) mask));
10701   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10702   ins_encode %{
10703     int vlen_enc = vector_length_encoding(this);
10704     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10705     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10706                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10707   %}
10708   ins_pipe( pipe_slow );
10709 %}
10710 
10711 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10712   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10713             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10714   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10715   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10716   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10717   ins_encode %{
10718     int vlen_enc = vector_length_encoding(this);
10719     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10720     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10721                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10722   %}
10723   ins_pipe( pipe_slow );
10724 %}
10725 
10726 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10727   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10728             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10729   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10730   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10731   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10732   ins_encode %{
10733     int vlen_enc = vector_length_encoding(this);
10734     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10735     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10736                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10737   %}
10738   ins_pipe( pipe_slow );
10739 %}
10740 
10741 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10742 %{
10743   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10744   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10745   ins_encode %{
10746     int vlen_enc = vector_length_encoding(this);
10747     BasicType bt = Matcher::vector_element_basic_type(this);
10748     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10749   %}
10750   ins_pipe(pipe_slow);
10751 %}
10752 
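      // ------------------------------ Float16 Scalar and Vector Operations -----------------------------

      // Note: the rules below back the Float16 scalar and half-float vector operations.
      // ReinterpretS2HF/ReinterpretHF2S move the 16-bit payload between a general register and an
      // XMM register with VMOVW, and the fused (ReinterpretS2HF (ConvF2HF ...)) and
      // (ConvHF2F (ReinterpretHF2S ...)) rules convert between FP32 and FP16 entirely inside the
      // XMM file, avoiding a round trip through a GPR. Min/Max use the AVX10.2 VMINMAX forms when
      // available and otherwise fall back to a multi-instruction sequence built on mask and XMM
      // temporaries.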
10753 instruct reinterpretS2HF(regF dst, rRegI src)
10754 %{
10755   match(Set dst (ReinterpretS2HF src));
10756   format %{ "vmovw $dst, $src" %}
10757   ins_encode %{
10758     __ vmovw($dst$$XMMRegister, $src$$Register);
10759   %}
10760   ins_pipe(pipe_slow);
10761 %}
10762 
10763 instruct reinterpretHF2S(rRegI dst, regF src)
10764 %{
10765   match(Set dst (ReinterpretHF2S src));
10766   format %{ "vmovw $dst, $src" %}
10767   ins_encode %{
10768     __ vmovw($dst$$Register, $src$$XMMRegister);
10769   %}
10770   ins_pipe(pipe_slow);
10771 %}
10772 
10773 instruct convF2HFAndS2HF(regF dst, regF src)
10774 %{
10775   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10776   format %{ "convF2HFAndS2HF $dst, $src" %}
10777   ins_encode %{
10778     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10779   %}
10780   ins_pipe(pipe_slow);
10781 %}
10782 
10783 instruct convHF2SAndHF2F(regF dst, regF src)
10784 %{
10785   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10786   format %{ "convHF2SAndHF2F $dst, $src" %}
10787   ins_encode %{
10788     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10789   %}
10790   ins_pipe(pipe_slow);
10791 %}
10792 
10793 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10794 %{
10795   match(Set dst (SqrtHF src));
10796   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10797   ins_encode %{
10798     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10799   %}
10800   ins_pipe(pipe_slow);
10801 %}
10802 
10803 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10804 %{
10805   match(Set dst (AddHF src1 src2));
10806   match(Set dst (DivHF src1 src2));
10807   match(Set dst (MulHF src1 src2));
10808   match(Set dst (SubHF src1 src2));
10809   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10810   ins_encode %{
10811     int opcode = this->ideal_Opcode();
10812     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10813   %}
10814   ins_pipe(pipe_slow);
10815 %}
10816 
10817 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10818 %{
10819   predicate(VM_Version::supports_avx10_2());
10820   match(Set dst (MaxHF src1 src2));
10821   match(Set dst (MinHF src1 src2));
10822   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10823   ins_encode %{
10824     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10825     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10826   %}
10827   ins_pipe( pipe_slow );
10828 %}
10829 
10830 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10831 %{
10832   predicate(!VM_Version::supports_avx10_2());
10833   match(Set dst (MaxHF src1 src2));
10834   match(Set dst (MinHF src1 src2));
10835   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10836   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10837   ins_encode %{
10838     int opcode = this->ideal_Opcode();
10839     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10840                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10841   %}
10842   ins_pipe( pipe_slow );
10843 %}
10844 
10845 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10846 %{
10847   match(Set dst (FmaHF  src2 (Binary dst src1)));
10848   effect(DEF dst);
10849   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2, scalar fp16 fma" %}
10850   ins_encode %{
10851     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10852   %}
10853   ins_pipe( pipe_slow );
10854 %}
10855 
10856 
10857 instruct vector_sqrt_HF_reg(vec dst, vec src)
10858 %{
10859   match(Set dst (SqrtVHF src));
10860   format %{ "vector_sqrt_fp16 $dst, $src" %}
10861   ins_encode %{
10862     int vlen_enc = vector_length_encoding(this);
10863     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10864   %}
10865   ins_pipe(pipe_slow);
10866 %}
10867 
10868 instruct vector_sqrt_HF_mem(vec dst, memory src)
10869 %{
10870   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10871   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10872   ins_encode %{
10873     int vlen_enc = vector_length_encoding(this);
10874     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10875   %}
10876   ins_pipe(pipe_slow);
10877 %}
10878 
10879 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10880 %{
10881   match(Set dst (AddVHF src1 src2));
10882   match(Set dst (DivVHF src1 src2));
10883   match(Set dst (MulVHF src1 src2));
10884   match(Set dst (SubVHF src1 src2));
10885   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10886   ins_encode %{
10887     int vlen_enc = vector_length_encoding(this);
10888     int opcode = this->ideal_Opcode();
10889     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10890   %}
10891   ins_pipe(pipe_slow);
10892 %}
10893 
10894 
10895 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10896 %{
10897   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10898   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10899   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10900   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10901   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10902   ins_encode %{
10903     int vlen_enc = vector_length_encoding(this);
10904     int opcode = this->ideal_Opcode();
10905     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10906   %}
10907   ins_pipe(pipe_slow);
10908 %}
10909 
10910 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10911 %{
10912   match(Set dst (FmaVHF src2 (Binary dst src1)));
10913   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10914   ins_encode %{
10915     int vlen_enc = vector_length_encoding(this);
10916     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10917   %}
10918   ins_pipe( pipe_slow );
10919 %}
10920 
10921 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10922 %{
10923   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10924   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10925   ins_encode %{
10926     int vlen_enc = vector_length_encoding(this);
10927     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10928   %}
10929   ins_pipe( pipe_slow );
10930 %}
10931 
10932 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10933 %{
10934   predicate(VM_Version::supports_avx10_2());
10935   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10936   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10937   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10938   ins_encode %{
10939     int vlen_enc = vector_length_encoding(this);
10940     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10941     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10942   %}
10943   ins_pipe( pipe_slow );
10944 %}
10945 
10946 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
10947 %{
10948   predicate(VM_Version::supports_avx10_2());
10949   match(Set dst (MinVHF src1 src2));
10950   match(Set dst (MaxVHF src1 src2));
10951   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
10952   ins_encode %{
10953     int vlen_enc = vector_length_encoding(this);
10954     int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10955     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
10956   %}
10957   ins_pipe( pipe_slow );
10958 %}
10959 
10960 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10961 %{
10962   predicate(!VM_Version::supports_avx10_2());
10963   match(Set dst (MinVHF src1 src2));
10964   match(Set dst (MaxVHF src1 src2));
10965   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10966   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10967   ins_encode %{
10968     int vlen_enc = vector_length_encoding(this);
10969     int opcode = this->ideal_Opcode();
10970     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10971                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10972   %}
10973   ins_pipe( pipe_slow );
10974 %}