//
// Copyright (c) 2011, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding, VM register );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
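//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the low 32-bit word of xmm0 as a save-on-call float slot with
// encoding 0, backed by the VMReg returned by xmm0->as_VMReg().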

// XMM registers.  512-bit registers, 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics, array copy stubs
// and superword operations (see UseSSE42Intrinsics, UseXMMForArrayCopy
// and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              XMM0-XMM3 might hold parameters

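// Each 32-bit word of an XMM register gets its own reg_def: the unsuffixed
// name is word (a) at xmmN->as_VMReg(), and the suffixes (b)-(p) map to the
// consecutive slots xmmN->as_VMReg()->next(1) .. ->next(15).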
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
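// Each mask register is described as two 32-bit VMReg slots: K# for the low
// half and K#_H (k#->as_VMReg()->next()) for the upper half.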


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
                   XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                   XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

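// One reg_class per mask register, for operands that must be bound to a
// specific k register.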
reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

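// The XMM register classes below come in pairs: a legacy class limited to
// XMM0-XMM15 and an evex class that also includes XMM16-XMM31.  Each
// reg_class_dynamic entry selects between the two using the given
// VM_Version predicate, so the upper registers are only made available to
// the allocator when EVEX encoding is actually supported.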
// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15);

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7,
                    XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31);

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b);

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b,
                     XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b);

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15);

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7,
                      XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31);

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b);

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b,
                      XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b);

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  929 
  930 // Class for all 128bit vector registers
  931 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  932                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  933                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  934                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  935                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  936                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  937                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  938                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  939                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  940                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  941                       XMM10, XMM10b, XMM10c, XMM10d,
  942                       XMM11, XMM11b, XMM11c, XMM11d,
  943                       XMM12, XMM12b, XMM12c, XMM12d,
  944                       XMM13, XMM13b, XMM13c, XMM13d,
  945                       XMM14, XMM14b, XMM14c, XMM14d,
  946                       XMM15, XMM15b, XMM15c, XMM15d);
  947 
// Class for evex 128bit vector registers
  949 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  950                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  951                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  952                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  953                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  954                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  955                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  956                       XMM7,  XMM7b,  XMM7c,  XMM7d,
  957                       XMM8,  XMM8b,  XMM8c,  XMM8d,
  958                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  959                       XMM10, XMM10b, XMM10c, XMM10d,
  960                       XMM11, XMM11b, XMM11c, XMM11d,
  961                       XMM12, XMM12b, XMM12c, XMM12d,
  962                       XMM13, XMM13b, XMM13c, XMM13d,
  963                       XMM14, XMM14b, XMM14c, XMM14d,
  964                       XMM15, XMM15b, XMM15c, XMM15d,
  965                       XMM16, XMM16b, XMM16c, XMM16d,
  966                       XMM17, XMM17b, XMM17c, XMM17d,
  967                       XMM18, XMM18b, XMM18c, XMM18d,
  968                       XMM19, XMM19b, XMM19c, XMM19d,
  969                       XMM20, XMM20b, XMM20c, XMM20d,
  970                       XMM21, XMM21b, XMM21c, XMM21d,
  971                       XMM22, XMM22b, XMM22c, XMM22d,
  972                       XMM23, XMM23b, XMM23c, XMM23d,
  973                       XMM24, XMM24b, XMM24c, XMM24d,
  974                       XMM25, XMM25b, XMM25c, XMM25d,
  975                       XMM26, XMM26b, XMM26c, XMM26d,
  976                       XMM27, XMM27b, XMM27c, XMM27d,
  977                       XMM28, XMM28b, XMM28c, XMM28d,
  978                       XMM29, XMM29b, XMM29c, XMM29d,
  979                       XMM30, XMM30b, XMM30c, XMM30d,
  980                       XMM31, XMM31b, XMM31c, XMM31d);
  981 
  982 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
  983 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  984 
  985 // Class for all 256bit vector registers
  986 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
  987                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
  988                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
  989                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
  990                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
  991                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
  992                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
  993                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
  994                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
  995                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
  996                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
  997                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
  998                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
  999                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1000                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1001                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
 1002 
// Class for evex 256bit vector registers
 1004 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1005                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1006                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1007                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1008                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1009                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1010                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1011                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,
 1012                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1013                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1014                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1015                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1016                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1017                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1018                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1019                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1020                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1021                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1022                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1023                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1024                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1025                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1026                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1027                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1028                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1029                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1030                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1031                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1032                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1033                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1034                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1035                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
 1036 
 1037 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1038 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1039 
 1040 // Class for all 512bit vector registers
 1041 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1042                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1043                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1044                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1045                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1046                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1047                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1048                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1049                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
 1057                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
 1073 
 1074 // Class for restricted 512bit vector registers
 1075 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1076                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1077                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1078                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1079                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1080                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1081                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1082                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p,
 1083                       XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1084                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1085                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1086                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1087                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1088                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1089                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1090                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
 1091 
 1092 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1093 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1094 
 1095 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1096 %}
 1097 
 1098 
 1099 //----------SOURCE BLOCK-------------------------------------------------------
 1100 // This is a block of C++ code which provides values, functions, and
 1101 // definitions necessary in the rest of the architecture description
 1102 
 1103 source_hpp %{
 1104 // Header information of the source block.
 1105 // Method declarations/definitions which are used outside
 1106 // the ad-scope can conveniently be defined here.
 1107 //
 1108 // To keep related declarations/definitions/uses close together,
 1109 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1110 
 1111 #include "runtime/vm_version.hpp"
 1112 
 1113 class NativeJump;
 1114 
 1115 class CallStubImpl {
 1116 
 1117   //--------------------------------------------------------------
 1118   //---<  Used for optimization in Compile::shorten_branches  >---
 1119   //--------------------------------------------------------------
 1120 
 1121  public:
 1122   // Size of call trampoline stub.
 1123   static uint size_call_trampoline() {
 1124     return 0; // no call trampolines on this platform
 1125   }
 1126 
 1127   // number of relocations needed by a call trampoline stub
 1128   static uint reloc_call_trampoline() {
 1129     return 0; // no call trampolines on this platform
 1130   }
 1131 };
 1132 
 1133 class HandlerImpl {
 1134 
 1135  public:
 1136 
 1137   static int emit_exception_handler(C2_MacroAssembler *masm);
 1138   static int emit_deopt_handler(C2_MacroAssembler* masm);
 1139 
 1140   static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1144     // Note that this value is also credited (in output.cpp) to
 1145     // the size of the code section.
 1146     return NativeJump::instruction_size;
 1147   }
 1148 
 1149   static uint size_deopt_handler() {
    // Three 5-byte instructions plus one move for the unreachable address.
 1151     return 15+3;
 1152   }
 1153 };
 1154 
 1155 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1156   switch(bytes) {
 1157     case  4: // fall-through
 1158     case  8: // fall-through
 1159     case 16: return Assembler::AVX_128bit;
 1160     case 32: return Assembler::AVX_256bit;
 1161     case 64: return Assembler::AVX_512bit;
 1162 
 1163     default: {
 1164       ShouldNotReachHere();
 1165       return Assembler::AVX_NoVec;
 1166     }
 1167   }
 1168 }
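// Note that 4- and 8-byte vectors intentionally map to AVX_128bit above: 128 bits is
// the narrowest vector length the AVX/EVEX encodings can express, so sub-128-bit
// vectors are emitted as 128-bit operations.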
 1169 
 1170 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1171   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1172 }
 1173 
 1174 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1175   uint def_idx = use->operand_index(opnd);
 1176   Node* def = use->in(def_idx);
 1177   return vector_length_encoding(def);
 1178 }
 1179 
 1180 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1181   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1182          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1183 }
 1184 
 1185 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1186   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1187            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1188 }
 1189 
 1190 class Node::PD {
 1191 public:
 1192   enum NodeFlags {
 1193     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1194     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1195     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1196     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1197     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1198     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1199     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1200     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1201     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1202     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1203     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1204     _last_flag                = Flag_clears_sign_flag
 1205   };
 1206 };
 1207 
 1208 %} // end source_hpp
 1209 
 1210 source %{
 1211 
 1212 #include "opto/addnode.hpp"
 1213 #include "c2_intelJccErratum_x86.hpp"
 1214 
 1215 void PhaseOutput::pd_perform_mach_node_analysis() {
 1216   if (VM_Version::has_intel_jcc_erratum()) {
 1217     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1218     _buf_sizes._code += extra_padding;
 1219   }
 1220 }
 1221 
 1222 int MachNode::pd_alignment_required() const {
 1223   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1224     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1225     return IntelJccErratum::largest_jcc_size() + 1;
 1226   } else {
 1227     return 1;
 1228   }
 1229 }
 1230 
 1231 int MachNode::compute_padding(int current_offset) const {
 1232   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1233     Compile* C = Compile::current();
 1234     PhaseOutput* output = C->output();
 1235     Block* block = output->block();
 1236     int index = output->index();
 1237     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1238   } else {
 1239     return 0;
 1240   }
 1241 }
 1242 
 1243 // Emit exception handler code.
 1244 // Stuff framesize into a register and call a VM stub routine.
 1245 int HandlerImpl::emit_exception_handler(C2_MacroAssembler* masm) {
 1246 
 1247   // Note that the code buffer's insts_mark is always relative to insts.
 1248   // That's why we must use the macroassembler to generate a handler.
 1249   address base = __ start_a_stub(size_exception_handler());
 1250   if (base == nullptr) {
 1251     ciEnv::current()->record_failure("CodeCache is full");
 1252     return 0;  // CodeBuffer::expand failed
 1253   }
 1254   int offset = __ offset();
 1255   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1256   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1257   __ end_a_stub();
 1258   return offset;
 1259 }
 1260 
 1261 // Emit deopt handler code.
 1262 int HandlerImpl::emit_deopt_handler(C2_MacroAssembler* masm) {
 1263 
 1264   // Note that the code buffer's insts_mark is always relative to insts.
 1265   // That's why we must use the macroassembler to generate a handler.
 1266   address base = __ start_a_stub(size_deopt_handler());
 1267   if (base == nullptr) {
 1268     ciEnv::current()->record_failure("CodeCache is full");
 1269     return 0;  // CodeBuffer::expand failed
 1270   }
 1271   int offset = __ offset();
 1272 
 1273   address the_pc = (address) __ pc();
 1274   Label next;
  // Push the value of "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1277 
 1278   // push address of "next"
 1279   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1280   __ bind(next);
 1281   // adjust it so it matches "the_pc"
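  // (The call pushed the address of "next", i.e. "the_pc" plus the size of the
  // call instruction; subtracting the bytes emitted since "the_pc" was recorded
  // rewinds the saved value back to "the_pc".)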
 1282   __ subptr(Address(rsp, 0), __ offset() - offset);
 1283 
 1284   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1285   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1286   __ end_a_stub();
 1287   return offset;
 1288 }
 1289 
 1290 static Assembler::Width widthForType(BasicType bt) {
 1291   if (bt == T_BYTE) {
 1292     return Assembler::B;
 1293   } else if (bt == T_SHORT) {
 1294     return Assembler::W;
 1295   } else if (bt == T_INT) {
 1296     return Assembler::D;
 1297   } else {
 1298     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1299     return Assembler::Q;
 1300   }
 1301 }
 1302 
 1303 //=============================================================================
 1304 
 1305   // Float masks come from different places depending on platform.
 1306   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1307   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1308   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1309   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1310   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1311   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1312   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1313   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1314   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1315   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1316   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1317   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1318   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1319   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1320   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1321   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1322   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1323   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1324   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1325 
 1326 //=============================================================================
 1327 bool Matcher::match_rule_supported(int opcode) {
 1328   if (!has_match_rule(opcode)) {
 1329     return false; // no match rule present
 1330   }
 1331   switch (opcode) {
 1332     case Op_AbsVL:
 1333     case Op_StoreVectorScatter:
 1334       if (UseAVX < 3) {
 1335         return false;
 1336       }
 1337       break;
 1338     case Op_PopCountI:
 1339     case Op_PopCountL:
 1340       if (!UsePopCountInstruction) {
 1341         return false;
 1342       }
 1343       break;
 1344     case Op_PopCountVI:
 1345       if (UseAVX < 2) {
 1346         return false;
 1347       }
 1348       break;
 1349     case Op_CompressV:
 1350     case Op_ExpandV:
 1351     case Op_PopCountVL:
 1352       if (UseAVX < 2) {
 1353         return false;
 1354       }
 1355       break;
 1356     case Op_MulVI:
 1357       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1358         return false;
 1359       }
 1360       break;
 1361     case Op_MulVL:
 1362       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1363         return false;
 1364       }
 1365       break;
 1366     case Op_MulReductionVL:
 1367       if (VM_Version::supports_avx512dq() == false) {
 1368         return false;
 1369       }
 1370       break;
 1371     case Op_AbsVB:
 1372     case Op_AbsVS:
 1373     case Op_AbsVI:
 1374     case Op_AddReductionVI:
 1375     case Op_AndReductionV:
 1376     case Op_OrReductionV:
 1377     case Op_XorReductionV:
 1378       if (UseSSE < 3) { // requires at least SSSE3
 1379         return false;
 1380       }
 1381       break;
 1382     case Op_MaxHF:
 1383     case Op_MinHF:
 1384       if (!VM_Version::supports_avx512vlbw()) {
 1385         return false;
 1386       }  // fallthrough
 1387     case Op_AddHF:
 1388     case Op_DivHF:
 1389     case Op_FmaHF:
 1390     case Op_MulHF:
 1391     case Op_ReinterpretS2HF:
 1392     case Op_ReinterpretHF2S:
 1393     case Op_SubHF:
 1394     case Op_SqrtHF:
 1395       if (!VM_Version::supports_avx512_fp16()) {
 1396         return false;
 1397       }
 1398       break;
 1399     case Op_VectorLoadShuffle:
 1400     case Op_VectorRearrange:
 1401     case Op_MulReductionVI:
 1402       if (UseSSE < 4) { // requires at least SSE4
 1403         return false;
 1404       }
 1405       break;
 1406     case Op_IsInfiniteF:
 1407     case Op_IsInfiniteD:
 1408       if (!VM_Version::supports_avx512dq()) {
 1409         return false;
 1410       }
 1411       break;
 1412     case Op_SqrtVD:
 1413     case Op_SqrtVF:
 1414     case Op_VectorMaskCmp:
 1415     case Op_VectorCastB2X:
 1416     case Op_VectorCastS2X:
 1417     case Op_VectorCastI2X:
 1418     case Op_VectorCastL2X:
 1419     case Op_VectorCastF2X:
 1420     case Op_VectorCastD2X:
 1421     case Op_VectorUCastB2X:
 1422     case Op_VectorUCastS2X:
 1423     case Op_VectorUCastI2X:
 1424     case Op_VectorMaskCast:
 1425       if (UseAVX < 1) { // enabled for AVX only
 1426         return false;
 1427       }
 1428       break;
 1429     case Op_PopulateIndex:
 1430       if (UseAVX < 2) {
 1431         return false;
 1432       }
 1433       break;
 1434     case Op_RoundVF:
 1435       if (UseAVX < 2) { // enabled for AVX2 only
 1436         return false;
 1437       }
 1438       break;
 1439     case Op_RoundVD:
 1440       if (UseAVX < 3) {
 1441         return false;  // enabled for AVX3 only
 1442       }
 1443       break;
 1444     case Op_CompareAndSwapL:
 1445     case Op_CompareAndSwapP:
 1446       break;
 1447     case Op_StrIndexOf:
 1448       if (!UseSSE42Intrinsics) {
 1449         return false;
 1450       }
 1451       break;
 1452     case Op_StrIndexOfChar:
 1453       if (!UseSSE42Intrinsics) {
 1454         return false;
 1455       }
 1456       break;
 1457     case Op_OnSpinWait:
 1458       if (VM_Version::supports_on_spin_wait() == false) {
 1459         return false;
 1460       }
 1461       break;
 1462     case Op_MulVB:
 1463     case Op_LShiftVB:
 1464     case Op_RShiftVB:
 1465     case Op_URShiftVB:
 1466     case Op_VectorInsert:
 1467     case Op_VectorLoadMask:
 1468     case Op_VectorStoreMask:
 1469     case Op_VectorBlend:
 1470       if (UseSSE < 4) {
 1471         return false;
 1472       }
 1473       break;
 1474     case Op_MaxD:
 1475     case Op_MaxF:
 1476     case Op_MinD:
 1477     case Op_MinF:
 1478       if (UseAVX < 1) { // enabled for AVX only
 1479         return false;
 1480       }
 1481       break;
 1482     case Op_CacheWB:
 1483     case Op_CacheWBPreSync:
 1484     case Op_CacheWBPostSync:
 1485       if (!VM_Version::supports_data_cache_line_flush()) {
 1486         return false;
 1487       }
 1488       break;
 1489     case Op_ExtractB:
 1490     case Op_ExtractL:
 1491     case Op_ExtractI:
 1492     case Op_RoundDoubleMode:
 1493       if (UseSSE < 4) {
 1494         return false;
 1495       }
 1496       break;
 1497     case Op_RoundDoubleModeV:
 1498       if (VM_Version::supports_avx() == false) {
 1499         return false; // 128bit vroundpd is not available
 1500       }
 1501       break;
 1502     case Op_LoadVectorGather:
 1503     case Op_LoadVectorGatherMasked:
 1504       if (UseAVX < 2) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_FmaF:
 1509     case Op_FmaD:
 1510     case Op_FmaVD:
 1511     case Op_FmaVF:
 1512       if (!UseFMA) {
 1513         return false;
 1514       }
 1515       break;
 1516     case Op_MacroLogicV:
 1517       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1518         return false;
 1519       }
 1520       break;
 1521 
 1522     case Op_VectorCmpMasked:
 1523     case Op_VectorMaskGen:
 1524       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1525         return false;
 1526       }
 1527       break;
 1528     case Op_VectorMaskFirstTrue:
 1529     case Op_VectorMaskLastTrue:
 1530     case Op_VectorMaskTrueCount:
 1531     case Op_VectorMaskToLong:
 1532       if (UseAVX < 1) {
 1533          return false;
 1534       }
 1535       break;
 1536     case Op_RoundF:
 1537     case Op_RoundD:
 1538       break;
 1539     case Op_CopySignD:
 1540     case Op_CopySignF:
 1541       if (UseAVX < 3)  {
 1542         return false;
 1543       }
 1544       if (!VM_Version::supports_avx512vl()) {
 1545         return false;
 1546       }
 1547       break;
 1548     case Op_CompressBits:
 1549     case Op_ExpandBits:
 1550       if (!VM_Version::supports_bmi2()) {
 1551         return false;
 1552       }
 1553       break;
 1554     case Op_CompressM:
 1555       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1556         return false;
 1557       }
 1558       break;
 1559     case Op_ConvF2HF:
 1560     case Op_ConvHF2F:
 1561       if (!VM_Version::supports_float16()) {
 1562         return false;
 1563       }
 1564       break;
 1565     case Op_VectorCastF2HF:
 1566     case Op_VectorCastHF2F:
 1567       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1568         return false;
 1569       }
 1570       break;
 1571   }
 1572   return true;  // Match rules are supported by default.
 1573 }
 1574 
 1575 //------------------------------------------------------------------------
 1576 
 1577 static inline bool is_pop_count_instr_target(BasicType bt) {
 1578   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1579          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1580 }
 1581 
 1582 bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
 1583   return match_rule_supported_vector(opcode, vlen, bt);
 1584 }
 1585 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics guarded by vector length (vlen) and element type (bt).
 1588 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1589   if (!match_rule_supported(opcode)) {
 1590     return false;
 1591   }
 1592   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1593   //   * SSE2 supports 128bit vectors for all types;
 1594   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1595   //   * AVX2 supports 256bit vectors for all types;
 1596   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1597   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1598   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1599   // And MaxVectorSize is taken into account as well.
 1600   if (!vector_size_supported(bt, vlen)) {
 1601     return false;
 1602   }
 1603   // Special cases which require vector length follow:
 1604   //   * implementation limitations
 1605   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1606   //   * 128bit vroundpd instruction is present only in AVX1
 1607   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
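  // For example, vlen == 8 with bt == T_INT gives size_in_bits = 8 * 4 * 8 = 256.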
 1608   switch (opcode) {
 1609     case Op_MaxVHF:
 1610     case Op_MinVHF:
 1611       if (!VM_Version::supports_avx512bw()) {
 1612         return false;
      }  // fallthrough
 1614     case Op_AddVHF:
 1615     case Op_DivVHF:
 1616     case Op_FmaVHF:
 1617     case Op_MulVHF:
 1618     case Op_SubVHF:
 1619     case Op_SqrtVHF:
 1620       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1621         return false;
 1622       }
 1623       if (!VM_Version::supports_avx512_fp16()) {
 1624         return false;
 1625       }
 1626       break;
 1627     case Op_AbsVF:
 1628     case Op_NegVF:
 1629       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1630         return false; // 512bit vandps and vxorps are not available
 1631       }
 1632       break;
 1633     case Op_AbsVD:
 1634     case Op_NegVD:
 1635       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1636         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1637       }
 1638       break;
 1639     case Op_RotateRightV:
 1640     case Op_RotateLeftV:
 1641       if (bt != T_INT && bt != T_LONG) {
 1642         return false;
 1643       } // fallthrough
 1644     case Op_MacroLogicV:
 1645       if (!VM_Version::supports_evex() ||
 1646           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_ClearArray:
 1651     case Op_VectorMaskGen:
 1652     case Op_VectorCmpMasked:
 1653       if (!VM_Version::supports_avx512bw()) {
 1654         return false;
 1655       }
 1656       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_LoadVectorMasked:
 1661     case Op_StoreVectorMasked:
 1662       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_UMinV:
 1667     case Op_UMaxV:
 1668       if (UseAVX == 0) {
 1669         return false;
 1670       }
 1671       break;
 1672     case Op_MaxV:
 1673     case Op_MinV:
 1674       if (UseSSE < 4 && is_integral_type(bt)) {
 1675         return false;
 1676       }
 1677       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1678           // Float/Double intrinsics are enabled for AVX family currently.
 1679           if (UseAVX == 0) {
 1680             return false;
 1681           }
 1682           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1683             return false;
 1684           }
 1685       }
 1686       break;
 1687     case Op_CallLeafVector:
 1688       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1689         return false;
 1690       }
 1691       break;
 1692     case Op_AddReductionVI:
 1693       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1694         return false;
 1695       }
 1696       // fallthrough
 1697     case Op_AndReductionV:
 1698     case Op_OrReductionV:
 1699     case Op_XorReductionV:
 1700       if (is_subword_type(bt) && (UseSSE < 4)) {
 1701         return false;
 1702       }
 1703       break;
 1704     case Op_MinReductionV:
 1705     case Op_MaxReductionV:
 1706       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1707         return false;
 1708       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1709         return false;
 1710       }
 1711       // Float/Double intrinsics enabled for AVX family.
 1712       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1713         return false;
 1714       }
 1715       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1716         return false;
 1717       }
 1718       break;
 1719     case Op_VectorTest:
 1720       if (UseSSE < 4) {
 1721         return false; // Implementation limitation
 1722       } else if (size_in_bits < 32) {
 1723         return false; // Implementation limitation
 1724       }
 1725       break;
 1726     case Op_VectorLoadShuffle:
 1727     case Op_VectorRearrange:
      if (vlen == 2) {
 1729         return false; // Implementation limitation due to how shuffle is loaded
 1730       } else if (size_in_bits == 256 && UseAVX < 2) {
 1731         return false; // Implementation limitation
 1732       }
 1733       break;
 1734     case Op_VectorLoadMask:
 1735     case Op_VectorMaskCast:
 1736       if (size_in_bits == 256 && UseAVX < 2) {
 1737         return false; // Implementation limitation
 1738       }
 1739       // fallthrough
 1740     case Op_VectorStoreMask:
 1741       if (vlen == 2) {
 1742         return false; // Implementation limitation
 1743       }
 1744       break;
 1745     case Op_PopulateIndex:
 1746       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1747         return false;
 1748       }
 1749       break;
 1750     case Op_VectorCastB2X:
 1751     case Op_VectorCastS2X:
 1752     case Op_VectorCastI2X:
 1753       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_VectorCastL2X:
 1758       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1759         return false;
 1760       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1761         return false;
 1762       }
 1763       break;
 1764     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1768         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1769         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1770           return false;
 1771         }
 1772       }
 1773       // fallthrough
 1774     case Op_VectorCastD2X:
 1775       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_VectorCastF2HF:
 1780     case Op_VectorCastHF2F:
 1781       if (!VM_Version::supports_f16c() &&
 1782          ((!VM_Version::supports_evex() ||
 1783          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1784         return false;
 1785       }
 1786       break;
 1787     case Op_RoundVD:
 1788       if (!VM_Version::supports_avx512dq()) {
 1789         return false;
 1790       }
 1791       break;
 1792     case Op_MulReductionVI:
 1793       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1794         return false;
 1795       }
 1796       break;
 1797     case Op_LoadVectorGatherMasked:
 1798       if (!is_subword_type(bt) && size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1799         return false;
 1800       }
 1801       if (is_subword_type(bt) &&
 1802          ((size_in_bits > 256 && !VM_Version::supports_avx512bw()) ||
 1803           (size_in_bits < 64)                                      ||
 1804           (bt == T_SHORT && !VM_Version::supports_bmi2()))) {
 1805         return false;
 1806       }
 1807       break;
 1808     case Op_StoreVectorScatterMasked:
 1809     case Op_StoreVectorScatter:
 1810       if (is_subword_type(bt)) {
 1811         return false;
 1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1813         return false;
 1814       }
 1815       // fallthrough
 1816     case Op_LoadVectorGather:
 1817       if (!is_subword_type(bt) && size_in_bits == 64) {
 1818         return false;
 1819       }
 1820       if (is_subword_type(bt) && size_in_bits < 64) {
 1821         return false;
 1822       }
 1823       break;
 1824     case Op_SaturatingAddV:
 1825     case Op_SaturatingSubV:
 1826       if (UseAVX < 1) {
 1827         return false; // Implementation limitation
 1828       }
 1829       if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1830         return false;
 1831       }
 1832       break;
 1833     case Op_SelectFromTwoVector:
 1834        if (size_in_bits < 128) {
 1835          return false;
 1836        }
 1837        if ((size_in_bits < 512 && !VM_Version::supports_avx512vl())) {
 1838          return false;
 1839        }
 1840        if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 1841          return false;
 1842        }
 1843        if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 1844          return false;
 1845        }
 1846        if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
 1847          return false;
 1848        }
 1849        break;
 1850     case Op_MaskAll:
 1851       if (!VM_Version::supports_evex()) {
 1852         return false;
 1853       }
 1854       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1855         return false;
 1856       }
 1857       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1858         return false;
 1859       }
 1860       break;
 1861     case Op_VectorMaskCmp:
 1862       if (vlen < 2 || size_in_bits < 32) {
 1863         return false;
 1864       }
 1865       break;
 1866     case Op_CompressM:
 1867       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1868         return false;
 1869       }
 1870       break;
 1871     case Op_CompressV:
 1872     case Op_ExpandV:
 1873       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1874         return false;
 1875       }
      if (size_in_bits < 128) {
 1877         return false;
 1878       }
 1879     case Op_VectorLongToMask:
 1880       if (UseAVX < 1) {
 1881         return false;
 1882       }
 1883       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1884         return false;
 1885       }
 1886       break;
 1887     case Op_SignumVD:
 1888     case Op_SignumVF:
 1889       if (UseAVX < 1) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_PopCountVI:
 1894     case Op_PopCountVL: {
 1895         if (!is_pop_count_instr_target(bt) &&
 1896             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1897           return false;
 1898         }
 1899       }
 1900       break;
 1901     case Op_ReverseV:
 1902     case Op_ReverseBytesV:
 1903       if (UseAVX < 2) {
 1904         return false;
 1905       }
 1906       break;
 1907     case Op_CountTrailingZerosV:
 1908     case Op_CountLeadingZerosV:
 1909       if (UseAVX < 2) {
 1910         return false;
 1911       }
 1912       break;
 1913   }
  return true;  // Match rules are supported by default.
 1915 }
 1916 
 1917 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most of the unary/binary/ternary masked operations share
  // the IR nodes of their non-masked counterparts, with the mask edge being the
  // differentiator. This routine does a strict check on the existence of masked
  // operation patterns by returning false for all opcodes apart from the ones whose
  // masked instruction patterns are defined in this file.
 1924   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1925     return false;
 1926   }
 1927 
 1928   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1929   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1930     return false;
 1931   }
 1932   switch(opcode) {
 1933     // Unary masked operations
 1934     case Op_AbsVB:
 1935     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 1937         return false;  // Implementation limitation
 1938       }
 1939     case Op_AbsVI:
 1940     case Op_AbsVL:
 1941       return true;
 1942 
 1943     // Ternary masked operations
 1944     case Op_FmaVF:
 1945     case Op_FmaVD:
 1946       return true;
 1947 
 1948     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 1950         return false;
 1951       }
 1952       return true;
 1953 
 1954     // Binary masked operations
 1955     case Op_AddVB:
 1956     case Op_AddVS:
 1957     case Op_SubVB:
 1958     case Op_SubVS:
 1959     case Op_MulVS:
 1960     case Op_LShiftVS:
 1961     case Op_RShiftVS:
 1962     case Op_URShiftVS:
 1963       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1964       if (!VM_Version::supports_avx512bw()) {
 1965         return false;  // Implementation limitation
 1966       }
 1967       return true;
 1968 
 1969     case Op_MulVL:
 1970       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1971       if (!VM_Version::supports_avx512dq()) {
 1972         return false;  // Implementation limitation
 1973       }
 1974       return true;
 1975 
 1976     case Op_AndV:
 1977     case Op_OrV:
 1978     case Op_XorV:
 1979     case Op_RotateRightV:
 1980     case Op_RotateLeftV:
 1981       if (bt != T_INT && bt != T_LONG) {
 1982         return false; // Implementation limitation
 1983       }
 1984       return true;
 1985 
 1986     case Op_VectorLoadMask:
 1987       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 1988       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 1989         return false;
 1990       }
 1991       return true;
 1992 
 1993     case Op_AddVI:
 1994     case Op_AddVL:
 1995     case Op_AddVF:
 1996     case Op_AddVD:
 1997     case Op_SubVI:
 1998     case Op_SubVL:
 1999     case Op_SubVF:
 2000     case Op_SubVD:
 2001     case Op_MulVI:
 2002     case Op_MulVF:
 2003     case Op_MulVD:
 2004     case Op_DivVF:
 2005     case Op_DivVD:
 2006     case Op_SqrtVF:
 2007     case Op_SqrtVD:
 2008     case Op_LShiftVI:
 2009     case Op_LShiftVL:
 2010     case Op_RShiftVI:
 2011     case Op_RShiftVL:
 2012     case Op_URShiftVI:
 2013     case Op_URShiftVL:
 2014     case Op_LoadVectorMasked:
 2015     case Op_StoreVectorMasked:
 2016     case Op_LoadVectorGatherMasked:
 2017     case Op_StoreVectorScatterMasked:
 2018       return true;
 2019 
 2020     case Op_UMinV:
 2021     case Op_UMaxV:
 2022       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 2023         return false;
 2024       } // fallthrough
 2025     case Op_MaxV:
 2026     case Op_MinV:
 2027       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2028         return false; // Implementation limitation
 2029       }
 2030       if (is_floating_point_type(bt) && !VM_Version::supports_avx10_2()) {
 2031         return false; // Implementation limitation
 2032       }
 2033       return true;
 2034     case Op_SaturatingAddV:
 2035     case Op_SaturatingSubV:
 2036       if (!is_subword_type(bt)) {
 2037         return false;
 2038       }
 2039       if (size_in_bits < 128 || !VM_Version::supports_avx512bw()) {
 2040         return false; // Implementation limitation
 2041       }
 2042       return true;
 2043 
 2044     case Op_VectorMaskCmp:
 2045       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2046         return false; // Implementation limitation
 2047       }
 2048       return true;
 2049 
 2050     case Op_VectorRearrange:
 2051       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2052         return false; // Implementation limitation
 2053       }
 2054       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2055         return false; // Implementation limitation
 2056       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2057         return false; // Implementation limitation
 2058       }
 2059       return true;
 2060 
 2061     // Binary Logical operations
 2062     case Op_AndVMask:
 2063     case Op_OrVMask:
 2064     case Op_XorVMask:
 2065       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_PopCountVI:
 2071     case Op_PopCountVL:
 2072       if (!is_pop_count_instr_target(bt)) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_MaskAll:
 2078       return true;
 2079 
 2080     case Op_CountLeadingZerosV:
 2081       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2082         return true;
 2083       }
 2084     default:
 2085       return false;
 2086   }
 2087 }
 2088 
 2089 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2090   return false;
 2091 }
 2092 
 2093 // Return true if Vector::rearrange needs preparation of the shuffle argument
 2094 bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
 2095   switch (elem_bt) {
 2096     case T_BYTE:  return false;
 2097     case T_SHORT: return !VM_Version::supports_avx512bw();
 2098     case T_INT:   return !VM_Version::supports_avx();
 2099     case T_LONG:  return vlen < 8 && !VM_Version::supports_avx512vl();
 2100     default:
 2101       ShouldNotReachHere();
 2102       return false;
 2103   }
 2104 }
 2105 
 2106 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2107   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2108   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2109   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2110       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2111     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2112     return new legVecZOper();
 2113   }
 2114   if (legacy) {
 2115     switch (ideal_reg) {
 2116       case Op_VecS: return new legVecSOper();
 2117       case Op_VecD: return new legVecDOper();
 2118       case Op_VecX: return new legVecXOper();
 2119       case Op_VecY: return new legVecYOper();
 2120       case Op_VecZ: return new legVecZOper();
 2121     }
 2122   } else {
 2123     switch (ideal_reg) {
 2124       case Op_VecS: return new vecSOper();
 2125       case Op_VecD: return new vecDOper();
 2126       case Op_VecX: return new vecXOper();
 2127       case Op_VecY: return new vecYOper();
 2128       case Op_VecZ: return new vecZOper();
 2129     }
 2130   }
 2131   ShouldNotReachHere();
 2132   return nullptr;
 2133 }
 2134 
 2135 bool Matcher::is_reg2reg_move(MachNode* m) {
 2136   switch (m->rule()) {
 2137     case MoveVec2Leg_rule:
 2138     case MoveLeg2Vec_rule:
 2139     case MoveF2VL_rule:
 2140     case MoveF2LEG_rule:
 2141     case MoveVL2F_rule:
 2142     case MoveLEG2F_rule:
 2143     case MoveD2VL_rule:
 2144     case MoveD2LEG_rule:
 2145     case MoveVL2D_rule:
 2146     case MoveLEG2D_rule:
 2147       return true;
 2148     default:
 2149       return false;
 2150   }
 2151 }
 2152 
 2153 bool Matcher::is_generic_vector(MachOper* opnd) {
 2154   switch (opnd->opcode()) {
 2155     case VEC:
 2156     case LEGVEC:
 2157       return true;
 2158     default:
 2159       return false;
 2160   }
 2161 }
 2162 
 2163 //------------------------------------------------------------------------
 2164 
 2165 const RegMask* Matcher::predicate_reg_mask(void) {
 2166   return &_VECTMASK_REG_mask;
 2167 }
 2168 
 2169 // Max vector size in bytes. 0 if not supported.
 2170 int Matcher::vector_width_in_bytes(BasicType bt) {
 2171   assert(is_java_primitive(bt), "only primitive type vectors");
 2172   // SSE2 supports 128bit vectors for all types.
 2173   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2175   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
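  // For example, UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (256-bit vectors),
  // UseAVX == 3 gives 64 bytes, and UseAVX <= 1 starts from 16 bytes.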
 2176   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2177   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2178     size = (UseAVX > 2) ? 64 : 32;
 2179   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2180     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2181   // Use flag to limit vector size.
 2182   size = MIN2(size,(int)MaxVectorSize);
 2183   // Minimum 2 values in vector (or 4 for bytes).
 2184   switch (bt) {
 2185   case T_DOUBLE:
 2186   case T_LONG:
 2187     if (size < 16) return 0;
 2188     break;
 2189   case T_FLOAT:
 2190   case T_INT:
 2191     if (size < 8) return 0;
 2192     break;
 2193   case T_BOOLEAN:
 2194     if (size < 4) return 0;
 2195     break;
 2196   case T_CHAR:
 2197     if (size < 4) return 0;
 2198     break;
 2199   case T_BYTE:
 2200     if (size < 4) return 0;
 2201     break;
 2202   case T_SHORT:
 2203     if (size < 4) return 0;
 2204     break;
 2205   default:
 2206     ShouldNotReachHere();
 2207   }
 2208   return size;
 2209 }
 2210 
 2211 // Limits on vector size (number of elements) loaded into vector.
 2212 int Matcher::max_vector_size(const BasicType bt) {
 2213   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2214 }
 2215 int Matcher::min_vector_size(const BasicType bt) {
 2216   int max_size = max_vector_size(bt);
 2217   // Min size which can be loaded into vector is 4 bytes.
 2218   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element double vectors to support calling SVML double64 routines.
 2220   if (bt == T_DOUBLE) {
 2221     size = 1;
 2222   }
 2223   return MIN2(size,max_size);
 2224 }
 2225 
 2226 int Matcher::max_vector_size_auto_vectorization(const BasicType bt) {
 2227   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2228   // by default on Cascade Lake
 2229   if (VM_Version::is_default_intel_cascade_lake()) {
 2230     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2231   }
 2232   return Matcher::max_vector_size(bt);
 2233 }
 2234 
 2235 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2236   return -1;
 2237 }
 2238 
 2239 // Vector ideal reg corresponding to specified size in bytes
 2240 uint Matcher::vector_ideal_reg(int size) {
 2241   assert(MaxVectorSize >= size, "");
 2242   switch(size) {
 2243     case  4: return Op_VecS;
 2244     case  8: return Op_VecD;
 2245     case 16: return Op_VecX;
 2246     case 32: return Op_VecY;
 2247     case 64: return Op_VecZ;
 2248   }
 2249   ShouldNotReachHere();
 2250   return 0;
 2251 }
 2252 
 2253 // Check for shift by small constant as well
 2254 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2255   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2256       shift->in(2)->get_int() <= 3 &&
 2257       // Are there other uses besides address expressions?
 2258       !matcher->is_visited(shift)) {
 2259     address_visited.set(shift->_idx); // Flag as address_visited
 2260     mstack.push(shift->in(2), Matcher::Visit);
 2261     Node *conv = shift->in(1);
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
 2265     if (conv->Opcode() == Op_ConvI2L &&
 2266         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2267         // Are there other uses besides address expressions?
 2268         !matcher->is_visited(conv)) {
 2269       address_visited.set(conv->_idx); // Flag as address_visited
 2270       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2271     } else {
 2272       mstack.push(conv, Matcher::Pre_Visit);
 2273     }
 2274     return true;
 2275   }
 2276   return false;
 2277 }
 2278 
// The code below identifies sub-graphs in which a 'load' node is
// an input to two different nodes, such that the pair can be matched
// with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2285 //
 2286 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2287 // This is a temporary solution until we make DAGs expressible in ADL.
 2288 template<typename ConType>
 2289 class FusedPatternMatcher {
 2290   Node* _op1_node;
 2291   Node* _mop_node;
 2292   int _con_op;
 2293 
 2294   static int match_next(Node* n, int next_op, int next_op_idx) {
 2295     if (n->in(1) == nullptr || n->in(2) == nullptr) {
 2296       return -1;
 2297     }
 2298 
 2299     if (next_op_idx == -1) { // n is commutative, try rotations
 2300       if (n->in(1)->Opcode() == next_op) {
 2301         return 1;
 2302       } else if (n->in(2)->Opcode() == next_op) {
 2303         return 2;
 2304       }
 2305     } else {
 2306       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2307       if (n->in(next_op_idx)->Opcode() == next_op) {
 2308         return next_op_idx;
 2309       }
 2310     }
 2311     return -1;
 2312   }
 2313 
 2314  public:
 2315   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2316     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2317 
 2318   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2319              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2320              typename ConType::NativeType con_value) {
 2321     if (_op1_node->Opcode() != op1) {
 2322       return false;
 2323     }
 2324     if (_mop_node->outcnt() > 2) {
 2325       return false;
 2326     }
 2327     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2328     if (op1_op2_idx == -1) {
 2329       return false;
 2330     }
 2331     // Memory operation must be the other edge
 2332     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2333 
 2334     // Check that the mop node is really what we want
 2335     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2336       Node* op2_node = _op1_node->in(op1_op2_idx);
 2337       if (op2_node->outcnt() > 1) {
 2338         return false;
 2339       }
 2340       assert(op2_node->Opcode() == op2, "Should be");
 2341       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2342       if (op2_con_idx == -1) {
 2343         return false;
 2344       }
 2345       // Memory operation must be the other edge
 2346       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2347       // Check that the memory operation is the same node
 2348       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2349         // Now check the constant
 2350         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2351         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2352           return true;
 2353         }
 2354       }
 2355     }
 2356     return false;
 2357   }
 2358 };
 2359 
 2360 static bool is_bmi_pattern(Node* n, Node* m) {
 2361   assert(UseBMI1Instructions, "sanity");
 2362   if (n != nullptr && m != nullptr) {
 2363     if (m->Opcode() == Op_LoadI) {
 2364       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2365       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2366              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2367              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2368     } else if (m->Opcode() == Op_LoadL) {
 2369       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2370       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2371              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2372              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2373     }
 2374   }
 2375   return false;
 2376 }
 2377 
 2378 // Should the matcher clone input 'm' of node 'n'?
 2379 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2380   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2381   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2382     mstack.push(m, Visit);
 2383     return true;
 2384   }
 2385   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2386     mstack.push(m, Visit);           // m = ShiftCntV
 2387     return true;
 2388   }
 2389   if (is_encode_and_store_pattern(n, m)) {
 2390     mstack.push(m, Visit);
 2391     return true;
 2392   }
 2393   return false;
 2394 }
 2395 
 2396 // Should the Matcher clone shifts on addressing modes, expecting them
 2397 // to be subsumed into complex addressing expressions or compute them
 2398 // into registers?
 2399 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2400   Node *off = m->in(AddPNode::Offset);
 2401   if (off->is_Con()) {
 2402     address_visited.test_set(m->_idx); // Flag as address_visited
 2403     Node *adr = m->in(AddPNode::Address);
 2404 
 2405     // Intel can handle 2 adds in addressing mode, with one of them using an immediate offset.
 2406     // AtomicAdd is not an addressing expression.
 2407     // Cheap to find it by looking for screwy base.
 2408     if (adr->is_AddP() &&
 2409         !adr->in(AddPNode::Base)->is_top() &&
 2410         !adr->in(AddPNode::Offset)->is_Con() &&
 2411         off->get_long() == (int) (off->get_long()) && // immL32
 2412         // Are there other uses besides address expressions?
 2413         !is_visited(adr)) {
 2414       address_visited.set(adr->_idx); // Flag as address_visited
 2415       Node *shift = adr->in(AddPNode::Offset);
 2416       if (!clone_shift(shift, this, mstack, address_visited)) {
 2417         mstack.push(shift, Pre_Visit);
 2418       }
 2419       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2420       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2421     } else {
 2422       mstack.push(adr, Pre_Visit);
 2423     }
 2424 
 2425     // Clone X+offset as it also folds into most addressing expressions
 2426     mstack.push(off, Visit);
 2427     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2428     return true;
 2429   } else if (clone_shift(off, this, mstack, address_visited)) {
 2430     address_visited.test_set(m->_idx); // Flag as address_visited
 2431     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2432     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2433     return true;
 2434   }
 2435   return false;
 2436 }
 2437 
 2438 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2439   switch (bt) {
 2440     case BoolTest::eq:
 2441       return Assembler::eq;
 2442     case BoolTest::ne:
 2443       return Assembler::neq;
 2444     case BoolTest::le:
 2445     case BoolTest::ule:
 2446       return Assembler::le;
 2447     case BoolTest::ge:
 2448     case BoolTest::uge:
 2449       return Assembler::nlt;
 2450     case BoolTest::lt:
 2451     case BoolTest::ult:
 2452       return Assembler::lt;
 2453     case BoolTest::gt:
 2454     case BoolTest::ugt:
 2455       return Assembler::nle;
    default: ShouldNotReachHere(); return Assembler::_false;
 2457   }
 2458 }
 2459 
 2460 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2461   switch (bt) {
 2462   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2463   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2464   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2465   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2466   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2467   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2468   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2469   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2470   }
 2471 }
 2472 
 2473 // Helper methods for MachSpillCopyNode::implementation().
 2474 static void vec_mov_helper(C2_MacroAssembler *masm, int src_lo, int dst_lo,
 2475                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2476   assert(ireg == Op_VecS || // 32bit vector
 2477          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2478           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
 2479          "no non-adjacent vector moves" );
 2480   if (masm) {
 2481     switch (ireg) {
 2482     case Op_VecS: // copy whole register
 2483     case Op_VecD:
 2484     case Op_VecX:
 2485       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2486         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2487       } else {
 2488         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2489      }
 2490       break;
 2491     case Op_VecY:
 2492       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2493         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2494       } else {
 2495         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2496      }
 2497       break;
 2498     case Op_VecZ:
 2499       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2500       break;
 2501     default:
 2502       ShouldNotReachHere();
 2503     }
 2504 #ifndef PRODUCT
 2505   } else {
 2506     switch (ireg) {
 2507     case Op_VecS:
 2508     case Op_VecD:
 2509     case Op_VecX:
 2510       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2511       break;
 2512     case Op_VecY:
 2513     case Op_VecZ:
 2514       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2515       break;
 2516     default:
 2517       ShouldNotReachHere();
 2518     }
 2519 #endif
 2520   }
 2521 }
 2522 
 2523 void vec_spill_helper(C2_MacroAssembler *masm, bool is_load,
 2524                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2525   if (masm) {
 2526     if (is_load) {
 2527       switch (ireg) {
 2528       case Op_VecS:
 2529         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2530         break;
 2531       case Op_VecD:
 2532         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2533         break;
 2534       case Op_VecX:
 2535         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2536           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2537         } else {
 2538           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2539           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2540         }
 2541         break;
 2542       case Op_VecY:
 2543         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2544           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2545         } else {
 2546           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2547           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2548         }
 2549         break;
 2550       case Op_VecZ:
 2551         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2552         break;
 2553       default:
 2554         ShouldNotReachHere();
 2555       }
 2556     } else { // store
 2557       switch (ireg) {
 2558       case Op_VecS:
 2559         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2560         break;
 2561       case Op_VecD:
 2562         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2563         break;
 2564       case Op_VecX:
 2565         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2566           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2567         }
 2568         else {
 2569           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2570         }
 2571         break;
 2572       case Op_VecY:
 2573         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2574           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2575         }
 2576         else {
 2577           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2578         }
 2579         break;
 2580       case Op_VecZ:
 2581         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2582         break;
 2583       default:
 2584         ShouldNotReachHere();
 2585       }
 2586     }
 2587 #ifndef PRODUCT
 2588   } else {
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2593         break;
 2594       case Op_VecD:
 2595         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2596         break;
      case Op_VecX:
 2598         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2599         break;
 2600       case Op_VecY:
 2601       case Op_VecZ:
 2602         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2603         break;
 2604       default:
 2605         ShouldNotReachHere();
 2606       }
 2607     } else { // store
 2608       switch (ireg) {
 2609       case Op_VecS:
 2610         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2611         break;
 2612       case Op_VecD:
 2613         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2614         break;
      case Op_VecX:
 2616         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2617         break;
 2618       case Op_VecY:
 2619       case Op_VecZ:
 2620         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2621         break;
 2622       default:
 2623         ShouldNotReachHere();
 2624       }
 2625     }
 2626 #endif
 2627   }
 2628 }
 2629 
 2630 template <class T>
 2631 static inline GrowableArray<jbyte>* vreplicate_imm(BasicType bt, T con, int len) {
 2632   int size = type2aelembytes(bt) * len;
 2633   GrowableArray<jbyte>* val = new GrowableArray<jbyte>(size, size, 0);
 2634   for (int i = 0; i < len; i++) {
 2635     int offset = i * type2aelembytes(bt);
 2636     switch (bt) {
 2637       case T_BYTE: val->at(i) = con; break;
 2638       case T_SHORT: {
 2639         jshort c = con;
 2640         memcpy(val->adr_at(offset), &c, sizeof(jshort));
 2641         break;
 2642       }
 2643       case T_INT: {
 2644         jint c = con;
 2645         memcpy(val->adr_at(offset), &c, sizeof(jint));
 2646         break;
 2647       }
 2648       case T_LONG: {
 2649         jlong c = con;
 2650         memcpy(val->adr_at(offset), &c, sizeof(jlong));
 2651         break;
 2652       }
 2653       case T_FLOAT: {
 2654         jfloat c = con;
 2655         memcpy(val->adr_at(offset), &c, sizeof(jfloat));
 2656         break;
 2657       }
 2658       case T_DOUBLE: {
 2659         jdouble c = con;
 2660         memcpy(val->adr_at(offset), &c, sizeof(jdouble));
 2661         break;
 2662       }
 2663       default: assert(false, "%s", type2name(bt));
 2664     }
 2665   }
 2666   return val;
 2667 }
 2668 
 2669 static inline jlong high_bit_set(BasicType bt) {
 2670   switch (bt) {
 2671     case T_BYTE:  return 0x8080808080808080;
 2672     case T_SHORT: return 0x8000800080008000;
 2673     case T_INT:   return 0x8000000080000000;
 2674     case T_LONG:  return 0x8000000000000000;
 2675     default:
 2676       ShouldNotReachHere();
 2677       return 0;
 2678   }
 2679 }
 2680 
 2681 #ifndef PRODUCT
 2682   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2683     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2684   }
 2685 #endif
 2686 
 2687   void MachNopNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc*) const {
 2688     __ nop(_count);
 2689   }
 2690 
 2691   uint MachNopNode::size(PhaseRegAlloc*) const {
 2692     return _count;
 2693   }
 2694 
 2695 #ifndef PRODUCT
 2696   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2697     st->print("# breakpoint");
 2698   }
 2699 #endif
 2700 
 2701   void MachBreakpointNode::emit(C2_MacroAssembler *masm, PhaseRegAlloc* ra_) const {
 2702     __ int3();
 2703   }
 2704 
 2705   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2706     return MachNode::size(ra_);
 2707   }
 2708 
 2709 %}
 2710 
 2711 encode %{
 2712 
 2713   enc_class call_epilog %{
 2714     if (VerifyStackAtCalls) {
 2715       // Check that stack depth is unchanged: find majik cookie on stack
 2716       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2717       Label L;
 2718       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2719       __ jccb(Assembler::equal, L);
 2720       // Die if stack mismatch
 2721       __ int3();
 2722       __ bind(L);
 2723     }
 2724   %}
 2725 
 2726 %}
 2727 
// Operands for bound floating-point register arguments
 2729 operand rxmm0() %{
 2730   constraint(ALLOC_IN_RC(xmm0_reg));
 2731   match(VecX);
  format %{ %}
 2733   interface(REG_INTER);
 2734 %}
 2735 
 2736 //----------OPERANDS-----------------------------------------------------------
 2737 // Operand definitions must precede instruction definitions for correct parsing
 2738 // in the ADLC because operands constitute user defined types which are used in
 2739 // instruction definitions.
 2740 
 2741 // Vectors
 2742 
// Dummy generic vector class. Should be used for all vector operands.
// Replaced with vec[SDXYZ] during post-selection cleanup.
 2745 operand vec() %{
 2746   constraint(ALLOC_IN_RC(dynamic));
 2747   match(VecX);
 2748   match(VecY);
 2749   match(VecZ);
 2750   match(VecS);
 2751   match(VecD);
 2752 
 2753   format %{ %}
 2754   interface(REG_INTER);
 2755 %}
 2756 
 2757 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2758 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2759 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2760 // runtime code generation via reg_class_dynamic.
 2761 operand legVec() %{
 2762   constraint(ALLOC_IN_RC(dynamic));
 2763   match(VecX);
 2764   match(VecY);
 2765   match(VecZ);
 2766   match(VecS);
 2767   match(VecD);
 2768 
 2769   format %{ %}
 2770   interface(REG_INTER);
 2771 %}
 2772 
 2773 // Replaces vec during post-selection cleanup. See above.
 2774 operand vecS() %{
 2775   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2776   match(VecS);
 2777 
 2778   format %{ %}
 2779   interface(REG_INTER);
 2780 %}
 2781 
 2782 // Replaces legVec during post-selection cleanup. See above.
 2783 operand legVecS() %{
 2784   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2785   match(VecS);
 2786 
 2787   format %{ %}
 2788   interface(REG_INTER);
 2789 %}
 2790 
 2791 // Replaces vec during post-selection cleanup. See above.
 2792 operand vecD() %{
 2793   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2794   match(VecD);
 2795 
 2796   format %{ %}
 2797   interface(REG_INTER);
 2798 %}
 2799 
 2800 // Replaces legVec during post-selection cleanup. See above.
 2801 operand legVecD() %{
 2802   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2803   match(VecD);
 2804 
 2805   format %{ %}
 2806   interface(REG_INTER);
 2807 %}
 2808 
 2809 // Replaces vec during post-selection cleanup. See above.
 2810 operand vecX() %{
 2811   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2812   match(VecX);
 2813 
 2814   format %{ %}
 2815   interface(REG_INTER);
 2816 %}
 2817 
 2818 // Replaces legVec during post-selection cleanup. See above.
 2819 operand legVecX() %{
 2820   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2821   match(VecX);
 2822 
 2823   format %{ %}
 2824   interface(REG_INTER);
 2825 %}
 2826 
 2827 // Replaces vec during post-selection cleanup. See above.
 2828 operand vecY() %{
 2829   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2830   match(VecY);
 2831 
 2832   format %{ %}
 2833   interface(REG_INTER);
 2834 %}
 2835 
 2836 // Replaces legVec during post-selection cleanup. See above.
 2837 operand legVecY() %{
 2838   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2839   match(VecY);
 2840 
 2841   format %{ %}
 2842   interface(REG_INTER);
 2843 %}
 2844 
 2845 // Replaces vec during post-selection cleanup. See above.
 2846 operand vecZ() %{
 2847   constraint(ALLOC_IN_RC(vectorz_reg));
 2848   match(VecZ);
 2849 
 2850   format %{ %}
 2851   interface(REG_INTER);
 2852 %}
 2853 
 2854 // Replaces legVec during post-selection cleanup. See above.
 2855 operand legVecZ() %{
 2856   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2857   match(VecZ);
 2858 
 2859   format %{ %}
 2860   interface(REG_INTER);
 2861 %}
 2862 
 2863 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2864 
 2865 // ============================================================================
 2866 
 2867 instruct ShouldNotReachHere() %{
 2868   match(Halt);
 2869   format %{ "stop\t# ShouldNotReachHere" %}
 2870   ins_encode %{
 2871     if (is_reachable()) {
 2872       const char* str = __ code_string(_halt_reason);
 2873       __ stop(str);
 2874     }
 2875   %}
 2876   ins_pipe(pipe_slow);
 2877 %}
 2878 
 2879 // ============================================================================
 2880 
 2881 instruct addF_reg(regF dst, regF src) %{
 2882   predicate(UseAVX == 0);
 2883   match(Set dst (AddF dst src));
 2884 
 2885   format %{ "addss   $dst, $src" %}
 2886   ins_cost(150);
 2887   ins_encode %{
 2888     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2889   %}
 2890   ins_pipe(pipe_slow);
 2891 %}
 2892 
 2893 instruct addF_mem(regF dst, memory src) %{
 2894   predicate(UseAVX == 0);
 2895   match(Set dst (AddF dst (LoadF src)));
 2896 
 2897   format %{ "addss   $dst, $src" %}
 2898   ins_cost(150);
 2899   ins_encode %{
 2900     __ addss($dst$$XMMRegister, $src$$Address);
 2901   %}
 2902   ins_pipe(pipe_slow);
 2903 %}
 2904 
 2905 instruct addF_imm(regF dst, immF con) %{
 2906   predicate(UseAVX == 0);
 2907   match(Set dst (AddF dst con));
 2908   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2909   ins_cost(150);
 2910   ins_encode %{
 2911     __ addss($dst$$XMMRegister, $constantaddress($con));
 2912   %}
 2913   ins_pipe(pipe_slow);
 2914 %}
 2915 
 2916 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2917   predicate(UseAVX > 0);
 2918   match(Set dst (AddF src1 src2));
 2919 
 2920   format %{ "vaddss  $dst, $src1, $src2" %}
 2921   ins_cost(150);
 2922   ins_encode %{
 2923     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2924   %}
 2925   ins_pipe(pipe_slow);
 2926 %}
 2927 
 2928 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2929   predicate(UseAVX > 0);
 2930   match(Set dst (AddF src1 (LoadF src2)));
 2931 
 2932   format %{ "vaddss  $dst, $src1, $src2" %}
 2933   ins_cost(150);
 2934   ins_encode %{
 2935     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2936   %}
 2937   ins_pipe(pipe_slow);
 2938 %}
 2939 
 2940 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2941   predicate(UseAVX > 0);
 2942   match(Set dst (AddF src con));
 2943 
 2944   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2945   ins_cost(150);
 2946   ins_encode %{
 2947     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2948   %}
 2949   ins_pipe(pipe_slow);
 2950 %}
 2951 
 2952 instruct addD_reg(regD dst, regD src) %{
 2953   predicate(UseAVX == 0);
 2954   match(Set dst (AddD dst src));
 2955 
 2956   format %{ "addsd   $dst, $src" %}
 2957   ins_cost(150);
 2958   ins_encode %{
 2959     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2960   %}
 2961   ins_pipe(pipe_slow);
 2962 %}
 2963 
 2964 instruct addD_mem(regD dst, memory src) %{
 2965   predicate(UseAVX == 0);
 2966   match(Set dst (AddD dst (LoadD src)));
 2967 
 2968   format %{ "addsd   $dst, $src" %}
 2969   ins_cost(150);
 2970   ins_encode %{
 2971     __ addsd($dst$$XMMRegister, $src$$Address);
 2972   %}
 2973   ins_pipe(pipe_slow);
 2974 %}
 2975 
 2976 instruct addD_imm(regD dst, immD con) %{
 2977   predicate(UseAVX == 0);
 2978   match(Set dst (AddD dst con));
 2979   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 2980   ins_cost(150);
 2981   ins_encode %{
 2982     __ addsd($dst$$XMMRegister, $constantaddress($con));
 2983   %}
 2984   ins_pipe(pipe_slow);
 2985 %}
 2986 
 2987 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 2988   predicate(UseAVX > 0);
 2989   match(Set dst (AddD src1 src2));
 2990 
 2991   format %{ "vaddsd  $dst, $src1, $src2" %}
 2992   ins_cost(150);
 2993   ins_encode %{
 2994     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2995   %}
 2996   ins_pipe(pipe_slow);
 2997 %}
 2998 
 2999 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3000   predicate(UseAVX > 0);
 3001   match(Set dst (AddD src1 (LoadD src2)));
 3002 
 3003   format %{ "vaddsd  $dst, $src1, $src2" %}
 3004   ins_cost(150);
 3005   ins_encode %{
 3006     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3007   %}
 3008   ins_pipe(pipe_slow);
 3009 %}
 3010 
 3011 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3012   predicate(UseAVX > 0);
 3013   match(Set dst (AddD src con));
 3014 
 3015   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3016   ins_cost(150);
 3017   ins_encode %{
 3018     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3019   %}
 3020   ins_pipe(pipe_slow);
 3021 %}
 3022 
 3023 instruct subF_reg(regF dst, regF src) %{
 3024   predicate(UseAVX == 0);
 3025   match(Set dst (SubF dst src));
 3026 
 3027   format %{ "subss   $dst, $src" %}
 3028   ins_cost(150);
 3029   ins_encode %{
 3030     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3031   %}
 3032   ins_pipe(pipe_slow);
 3033 %}
 3034 
 3035 instruct subF_mem(regF dst, memory src) %{
 3036   predicate(UseAVX == 0);
 3037   match(Set dst (SubF dst (LoadF src)));
 3038 
 3039   format %{ "subss   $dst, $src" %}
 3040   ins_cost(150);
 3041   ins_encode %{
 3042     __ subss($dst$$XMMRegister, $src$$Address);
 3043   %}
 3044   ins_pipe(pipe_slow);
 3045 %}
 3046 
 3047 instruct subF_imm(regF dst, immF con) %{
 3048   predicate(UseAVX == 0);
 3049   match(Set dst (SubF dst con));
 3050   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3051   ins_cost(150);
 3052   ins_encode %{
 3053     __ subss($dst$$XMMRegister, $constantaddress($con));
 3054   %}
 3055   ins_pipe(pipe_slow);
 3056 %}
 3057 
 3058 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3059   predicate(UseAVX > 0);
 3060   match(Set dst (SubF src1 src2));
 3061 
 3062   format %{ "vsubss  $dst, $src1, $src2" %}
 3063   ins_cost(150);
 3064   ins_encode %{
 3065     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3066   %}
 3067   ins_pipe(pipe_slow);
 3068 %}
 3069 
 3070 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3071   predicate(UseAVX > 0);
 3072   match(Set dst (SubF src1 (LoadF src2)));
 3073 
 3074   format %{ "vsubss  $dst, $src1, $src2" %}
 3075   ins_cost(150);
 3076   ins_encode %{
 3077     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3078   %}
 3079   ins_pipe(pipe_slow);
 3080 %}
 3081 
 3082 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3083   predicate(UseAVX > 0);
 3084   match(Set dst (SubF src con));
 3085 
 3086   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3087   ins_cost(150);
 3088   ins_encode %{
 3089     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3090   %}
 3091   ins_pipe(pipe_slow);
 3092 %}
 3093 
 3094 instruct subD_reg(regD dst, regD src) %{
 3095   predicate(UseAVX == 0);
 3096   match(Set dst (SubD dst src));
 3097 
 3098   format %{ "subsd   $dst, $src" %}
 3099   ins_cost(150);
 3100   ins_encode %{
 3101     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3102   %}
 3103   ins_pipe(pipe_slow);
 3104 %}
 3105 
 3106 instruct subD_mem(regD dst, memory src) %{
 3107   predicate(UseAVX == 0);
 3108   match(Set dst (SubD dst (LoadD src)));
 3109 
 3110   format %{ "subsd   $dst, $src" %}
 3111   ins_cost(150);
 3112   ins_encode %{
 3113     __ subsd($dst$$XMMRegister, $src$$Address);
 3114   %}
 3115   ins_pipe(pipe_slow);
 3116 %}
 3117 
 3118 instruct subD_imm(regD dst, immD con) %{
 3119   predicate(UseAVX == 0);
 3120   match(Set dst (SubD dst con));
 3121   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3122   ins_cost(150);
 3123   ins_encode %{
 3124     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3125   %}
 3126   ins_pipe(pipe_slow);
 3127 %}
 3128 
 3129 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3130   predicate(UseAVX > 0);
 3131   match(Set dst (SubD src1 src2));
 3132 
 3133   format %{ "vsubsd  $dst, $src1, $src2" %}
 3134   ins_cost(150);
 3135   ins_encode %{
 3136     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3137   %}
 3138   ins_pipe(pipe_slow);
 3139 %}
 3140 
 3141 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3142   predicate(UseAVX > 0);
 3143   match(Set dst (SubD src1 (LoadD src2)));
 3144 
 3145   format %{ "vsubsd  $dst, $src1, $src2" %}
 3146   ins_cost(150);
 3147   ins_encode %{
 3148     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3149   %}
 3150   ins_pipe(pipe_slow);
 3151 %}
 3152 
 3153 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3154   predicate(UseAVX > 0);
 3155   match(Set dst (SubD src con));
 3156 
 3157   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3158   ins_cost(150);
 3159   ins_encode %{
 3160     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3161   %}
 3162   ins_pipe(pipe_slow);
 3163 %}
 3164 
 3165 instruct mulF_reg(regF dst, regF src) %{
 3166   predicate(UseAVX == 0);
 3167   match(Set dst (MulF dst src));
 3168 
 3169   format %{ "mulss   $dst, $src" %}
 3170   ins_cost(150);
 3171   ins_encode %{
 3172     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3173   %}
 3174   ins_pipe(pipe_slow);
 3175 %}
 3176 
 3177 instruct mulF_mem(regF dst, memory src) %{
 3178   predicate(UseAVX == 0);
 3179   match(Set dst (MulF dst (LoadF src)));
 3180 
 3181   format %{ "mulss   $dst, $src" %}
 3182   ins_cost(150);
 3183   ins_encode %{
 3184     __ mulss($dst$$XMMRegister, $src$$Address);
 3185   %}
 3186   ins_pipe(pipe_slow);
 3187 %}
 3188 
 3189 instruct mulF_imm(regF dst, immF con) %{
 3190   predicate(UseAVX == 0);
 3191   match(Set dst (MulF dst con));
 3192   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3193   ins_cost(150);
 3194   ins_encode %{
 3195     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3196   %}
 3197   ins_pipe(pipe_slow);
 3198 %}
 3199 
 3200 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3201   predicate(UseAVX > 0);
 3202   match(Set dst (MulF src1 src2));
 3203 
 3204   format %{ "vmulss  $dst, $src1, $src2" %}
 3205   ins_cost(150);
 3206   ins_encode %{
 3207     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3208   %}
 3209   ins_pipe(pipe_slow);
 3210 %}
 3211 
 3212 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3213   predicate(UseAVX > 0);
 3214   match(Set dst (MulF src1 (LoadF src2)));
 3215 
 3216   format %{ "vmulss  $dst, $src1, $src2" %}
 3217   ins_cost(150);
 3218   ins_encode %{
 3219     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3220   %}
 3221   ins_pipe(pipe_slow);
 3222 %}
 3223 
 3224 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3225   predicate(UseAVX > 0);
 3226   match(Set dst (MulF src con));
 3227 
 3228   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3229   ins_cost(150);
 3230   ins_encode %{
 3231     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3232   %}
 3233   ins_pipe(pipe_slow);
 3234 %}
 3235 
 3236 instruct mulD_reg(regD dst, regD src) %{
 3237   predicate(UseAVX == 0);
 3238   match(Set dst (MulD dst src));
 3239 
 3240   format %{ "mulsd   $dst, $src" %}
 3241   ins_cost(150);
 3242   ins_encode %{
 3243     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3244   %}
 3245   ins_pipe(pipe_slow);
 3246 %}
 3247 
 3248 instruct mulD_mem(regD dst, memory src) %{
 3249   predicate(UseAVX == 0);
 3250   match(Set dst (MulD dst (LoadD src)));
 3251 
 3252   format %{ "mulsd   $dst, $src" %}
 3253   ins_cost(150);
 3254   ins_encode %{
 3255     __ mulsd($dst$$XMMRegister, $src$$Address);
 3256   %}
 3257   ins_pipe(pipe_slow);
 3258 %}
 3259 
 3260 instruct mulD_imm(regD dst, immD con) %{
 3261   predicate(UseAVX == 0);
 3262   match(Set dst (MulD dst con));
 3263   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3264   ins_cost(150);
 3265   ins_encode %{
 3266     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3267   %}
 3268   ins_pipe(pipe_slow);
 3269 %}
 3270 
 3271 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3272   predicate(UseAVX > 0);
 3273   match(Set dst (MulD src1 src2));
 3274 
 3275   format %{ "vmulsd  $dst, $src1, $src2" %}
 3276   ins_cost(150);
 3277   ins_encode %{
 3278     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3279   %}
 3280   ins_pipe(pipe_slow);
 3281 %}
 3282 
 3283 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3284   predicate(UseAVX > 0);
 3285   match(Set dst (MulD src1 (LoadD src2)));
 3286 
 3287   format %{ "vmulsd  $dst, $src1, $src2" %}
 3288   ins_cost(150);
 3289   ins_encode %{
 3290     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3291   %}
 3292   ins_pipe(pipe_slow);
 3293 %}
 3294 
 3295 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3296   predicate(UseAVX > 0);
 3297   match(Set dst (MulD src con));
 3298 
 3299   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3300   ins_cost(150);
 3301   ins_encode %{
 3302     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3303   %}
 3304   ins_pipe(pipe_slow);
 3305 %}
 3306 
 3307 instruct divF_reg(regF dst, regF src) %{
 3308   predicate(UseAVX == 0);
 3309   match(Set dst (DivF dst src));
 3310 
 3311   format %{ "divss   $dst, $src" %}
 3312   ins_cost(150);
 3313   ins_encode %{
 3314     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3315   %}
 3316   ins_pipe(pipe_slow);
 3317 %}
 3318 
 3319 instruct divF_mem(regF dst, memory src) %{
 3320   predicate(UseAVX == 0);
 3321   match(Set dst (DivF dst (LoadF src)));
 3322 
 3323   format %{ "divss   $dst, $src" %}
 3324   ins_cost(150);
 3325   ins_encode %{
 3326     __ divss($dst$$XMMRegister, $src$$Address);
 3327   %}
 3328   ins_pipe(pipe_slow);
 3329 %}
 3330 
 3331 instruct divF_imm(regF dst, immF con) %{
 3332   predicate(UseAVX == 0);
 3333   match(Set dst (DivF dst con));
 3334   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3335   ins_cost(150);
 3336   ins_encode %{
 3337     __ divss($dst$$XMMRegister, $constantaddress($con));
 3338   %}
 3339   ins_pipe(pipe_slow);
 3340 %}
 3341 
 3342 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3343   predicate(UseAVX > 0);
 3344   match(Set dst (DivF src1 src2));
 3345 
 3346   format %{ "vdivss  $dst, $src1, $src2" %}
 3347   ins_cost(150);
 3348   ins_encode %{
 3349     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3350   %}
 3351   ins_pipe(pipe_slow);
 3352 %}
 3353 
 3354 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3355   predicate(UseAVX > 0);
 3356   match(Set dst (DivF src1 (LoadF src2)));
 3357 
 3358   format %{ "vdivss  $dst, $src1, $src2" %}
 3359   ins_cost(150);
 3360   ins_encode %{
 3361     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3362   %}
 3363   ins_pipe(pipe_slow);
 3364 %}
 3365 
 3366 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3367   predicate(UseAVX > 0);
 3368   match(Set dst (DivF src con));
 3369 
 3370   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3371   ins_cost(150);
 3372   ins_encode %{
 3373     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3374   %}
 3375   ins_pipe(pipe_slow);
 3376 %}
 3377 
 3378 instruct divD_reg(regD dst, regD src) %{
 3379   predicate(UseAVX == 0);
 3380   match(Set dst (DivD dst src));
 3381 
 3382   format %{ "divsd   $dst, $src" %}
 3383   ins_cost(150);
 3384   ins_encode %{
 3385     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3386   %}
 3387   ins_pipe(pipe_slow);
 3388 %}
 3389 
 3390 instruct divD_mem(regD dst, memory src) %{
 3391   predicate(UseAVX == 0);
 3392   match(Set dst (DivD dst (LoadD src)));
 3393 
 3394   format %{ "divsd   $dst, $src" %}
 3395   ins_cost(150);
 3396   ins_encode %{
 3397     __ divsd($dst$$XMMRegister, $src$$Address);
 3398   %}
 3399   ins_pipe(pipe_slow);
 3400 %}
 3401 
 3402 instruct divD_imm(regD dst, immD con) %{
 3403   predicate(UseAVX == 0);
 3404   match(Set dst (DivD dst con));
 3405   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3406   ins_cost(150);
 3407   ins_encode %{
 3408     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3409   %}
 3410   ins_pipe(pipe_slow);
 3411 %}
 3412 
 3413 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3414   predicate(UseAVX > 0);
 3415   match(Set dst (DivD src1 src2));
 3416 
 3417   format %{ "vdivsd  $dst, $src1, $src2" %}
 3418   ins_cost(150);
 3419   ins_encode %{
 3420     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3421   %}
 3422   ins_pipe(pipe_slow);
 3423 %}
 3424 
 3425 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3426   predicate(UseAVX > 0);
 3427   match(Set dst (DivD src1 (LoadD src2)));
 3428 
 3429   format %{ "vdivsd  $dst, $src1, $src2" %}
 3430   ins_cost(150);
 3431   ins_encode %{
 3432     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3433   %}
 3434   ins_pipe(pipe_slow);
 3435 %}
 3436 
 3437 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3438   predicate(UseAVX > 0);
 3439   match(Set dst (DivD src con));
 3440 
 3441   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3442   ins_cost(150);
 3443   ins_encode %{
 3444     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3445   %}
 3446   ins_pipe(pipe_slow);
 3447 %}
 3448 
 3449 instruct absF_reg(regF dst) %{
 3450   predicate(UseAVX == 0);
 3451   match(Set dst (AbsF dst));
 3452   ins_cost(150);
 3453   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3454   ins_encode %{
 3455     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3456   %}
 3457   ins_pipe(pipe_slow);
 3458 %}
 3459 
 3460 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3461   predicate(UseAVX > 0);
 3462   match(Set dst (AbsF src));
 3463   ins_cost(150);
 3464   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3465   ins_encode %{
 3466     int vlen_enc = Assembler::AVX_128bit;
 3467     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3468               ExternalAddress(float_signmask()), vlen_enc);
 3469   %}
 3470   ins_pipe(pipe_slow);
 3471 %}
 3472 
 3473 instruct absD_reg(regD dst) %{
 3474   predicate(UseAVX == 0);
 3475   match(Set dst (AbsD dst));
 3476   ins_cost(150);
 3477   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3478             "# abs double by sign masking" %}
 3479   ins_encode %{
 3480     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3481   %}
 3482   ins_pipe(pipe_slow);
 3483 %}
 3484 
 3485 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3486   predicate(UseAVX > 0);
 3487   match(Set dst (AbsD src));
 3488   ins_cost(150);
 3489   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3490             "# abs double by sign masking" %}
 3491   ins_encode %{
 3492     int vlen_enc = Assembler::AVX_128bit;
 3493     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3494               ExternalAddress(double_signmask()), vlen_enc);
 3495   %}
 3496   ins_pipe(pipe_slow);
 3497 %}
 3498 
 3499 instruct negF_reg(regF dst) %{
 3500   predicate(UseAVX == 0);
 3501   match(Set dst (NegF dst));
 3502   ins_cost(150);
 3503   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3504   ins_encode %{
 3505     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3506   %}
 3507   ins_pipe(pipe_slow);
 3508 %}
 3509 
 3510 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3511   predicate(UseAVX > 0);
 3512   match(Set dst (NegF src));
 3513   ins_cost(150);
 3514   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3515   ins_encode %{
 3516     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3517                  ExternalAddress(float_signflip()));
 3518   %}
 3519   ins_pipe(pipe_slow);
 3520 %}
 3521 
 3522 instruct negD_reg(regD dst) %{
 3523   predicate(UseAVX == 0);
 3524   match(Set dst (NegD dst));
 3525   ins_cost(150);
 3526   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3527             "# neg double by sign flipping" %}
 3528   ins_encode %{
 3529     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3530   %}
 3531   ins_pipe(pipe_slow);
 3532 %}
 3533 
 3534 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3535   predicate(UseAVX > 0);
 3536   match(Set dst (NegD src));
 3537   ins_cost(150);
 3538   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3539             "# neg double by sign flipping" %}
 3540   ins_encode %{
 3541     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3542                  ExternalAddress(double_signflip()));
 3543   %}
 3544   ins_pipe(pipe_slow);
 3545 %}
 3546 
// The sqrtss instruction needs the destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3549 instruct sqrtF_reg(regF dst) %{
 3550   match(Set dst (SqrtF dst));
 3551   format %{ "sqrtss  $dst, $dst" %}
 3552   ins_encode %{
 3553     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3554   %}
 3555   ins_pipe(pipe_slow);
 3556 %}
 3557 
// The sqrtsd instruction needs the destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3560 instruct sqrtD_reg(regD dst) %{
 3561   match(Set dst (SqrtD dst));
 3562   format %{ "sqrtsd  $dst, $dst" %}
 3563   ins_encode %{
 3564     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3565   %}
 3566   ins_pipe(pipe_slow);
 3567 %}
 3568 
 3569 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3570   effect(TEMP tmp);
 3571   match(Set dst (ConvF2HF src));
 3572   ins_cost(125);
  format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3574   ins_encode %{
 3575     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3576   %}
 3577   ins_pipe( pipe_slow );
 3578 %}
 3579 
 3580 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3581   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3582   effect(TEMP ktmp, TEMP rtmp);
 3583   match(Set mem (StoreC mem (ConvF2HF src)));
 3584   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3585   ins_encode %{
 3586     __ movl($rtmp$$Register, 0x1);
 3587     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3588     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3589   %}
 3590   ins_pipe( pipe_slow );
 3591 %}
 3592 
 3593 instruct vconvF2HF(vec dst, vec src) %{
 3594   match(Set dst (VectorCastF2HF src));
 3595   format %{ "vector_conv_F2HF $dst $src" %}
 3596   ins_encode %{
 3597     int vlen_enc = vector_length_encoding(this, $src);
 3598     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3599   %}
 3600   ins_pipe( pipe_slow );
 3601 %}
 3602 
 3603 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3604   predicate(n->as_StoreVector()->memory_size() >= 16);
 3605   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3606   format %{ "vcvtps2ph $mem,$src" %}
 3607   ins_encode %{
 3608     int vlen_enc = vector_length_encoding(this, $src);
 3609     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3610   %}
 3611   ins_pipe( pipe_slow );
 3612 %}
 3613 
 3614 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3615   match(Set dst (ConvHF2F src));
 3616   format %{ "vcvtph2ps $dst,$src" %}
 3617   ins_encode %{
 3618     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3619   %}
 3620   ins_pipe( pipe_slow );
 3621 %}
 3622 
 3623 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3624   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3625   format %{ "vcvtph2ps $dst,$mem" %}
 3626   ins_encode %{
 3627     int vlen_enc = vector_length_encoding(this);
 3628     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3629   %}
 3630   ins_pipe( pipe_slow );
 3631 %}
 3632 
 3633 instruct vconvHF2F(vec dst, vec src) %{
 3634   match(Set dst (VectorCastHF2F src));
 3635   ins_cost(125);
 3636   format %{ "vector_conv_HF2F $dst,$src" %}
 3637   ins_encode %{
 3638     int vlen_enc = vector_length_encoding(this);
 3639     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3640   %}
 3641   ins_pipe( pipe_slow );
 3642 %}
 3643 
 3644 // ---------------------------------------- VectorReinterpret ------------------------------------
 3645 instruct reinterpret_mask(kReg dst) %{
 3646   predicate(n->bottom_type()->isa_vectmask() &&
 3647             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3648   match(Set dst (VectorReinterpret dst));
 3649   ins_cost(125);
 3650   format %{ "vector_reinterpret $dst\t!" %}
 3651   ins_encode %{
 3652     // empty
 3653   %}
 3654   ins_pipe( pipe_slow );
 3655 %}
 3656 
 3657 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3658   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3659             n->bottom_type()->isa_vectmask() &&
 3660             n->in(1)->bottom_type()->isa_vectmask() &&
 3661             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3662             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3663   match(Set dst (VectorReinterpret src));
 3664   effect(TEMP xtmp);
 3665   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3666   ins_encode %{
 3667      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3668      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
     int vlen_enc = vector_length_encoding(src_sz);
     __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3673   %}
 3674   ins_pipe( pipe_slow );
 3675 %}
 3676 
 3677 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3678   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3679             n->bottom_type()->isa_vectmask() &&
 3680             n->in(1)->bottom_type()->isa_vectmask() &&
 3681             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3682              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3683             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3684   match(Set dst (VectorReinterpret src));
 3685   effect(TEMP xtmp);
 3686   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3687   ins_encode %{
 3688      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3689      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
     int vlen_enc = vector_length_encoding(src_sz);
     __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3694   %}
 3695   ins_pipe( pipe_slow );
 3696 %}
 3697 
 3698 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3699   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3700             n->bottom_type()->isa_vectmask() &&
 3701             n->in(1)->bottom_type()->isa_vectmask() &&
 3702             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3703              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3704             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3705   match(Set dst (VectorReinterpret src));
 3706   effect(TEMP xtmp);
 3707   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3708   ins_encode %{
 3709      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3710      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
     assert(src_sz == dst_sz, "src and dst size mismatch");
     int vlen_enc = vector_length_encoding(src_sz);
     __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
     __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3715   %}
 3716   ins_pipe( pipe_slow );
 3717 %}
 3718 
 3719 instruct reinterpret(vec dst) %{
 3720   predicate(!n->bottom_type()->isa_vectmask() &&
 3721             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3722   match(Set dst (VectorReinterpret dst));
 3723   ins_cost(125);
 3724   format %{ "vector_reinterpret $dst\t!" %}
 3725   ins_encode %{
 3726     // empty
 3727   %}
 3728   ins_pipe( pipe_slow );
 3729 %}
 3730 
 3731 instruct reinterpret_expand(vec dst, vec src) %{
 3732   predicate(UseAVX == 0 &&
 3733             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3734   match(Set dst (VectorReinterpret src));
 3735   ins_cost(125);
 3736   effect(TEMP dst);
 3737   format %{ "vector_reinterpret_expand $dst,$src" %}
 3738   ins_encode %{
 3739     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3740     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3741 
 3742     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3743     if (src_vlen_in_bytes == 4) {
 3744       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3745     } else {
 3746       assert(src_vlen_in_bytes == 8, "");
 3747       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3748     }
 3749     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3750   %}
 3751   ins_pipe( pipe_slow );
 3752 %}
 3753 
 3754 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3755   predicate(UseAVX > 0 &&
 3756             !n->bottom_type()->isa_vectmask() &&
 3757             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3758             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3759   match(Set dst (VectorReinterpret src));
 3760   ins_cost(125);
 3761   format %{ "vector_reinterpret_expand $dst,$src" %}
 3762   ins_encode %{
 3763     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3764   %}
 3765   ins_pipe( pipe_slow );
 3766 %}
 3767 
 3768 
 3769 instruct vreinterpret_expand(legVec dst, vec src) %{
 3770   predicate(UseAVX > 0 &&
 3771             !n->bottom_type()->isa_vectmask() &&
 3772             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3773             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3774   match(Set dst (VectorReinterpret src));
 3775   ins_cost(125);
 3776   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3777   ins_encode %{
 3778     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3779       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3780       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3781       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3782       default: ShouldNotReachHere();
 3783     }
 3784   %}
 3785   ins_pipe( pipe_slow );
 3786 %}
 3787 
 3788 instruct reinterpret_shrink(vec dst, legVec src) %{
 3789   predicate(!n->bottom_type()->isa_vectmask() &&
 3790             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3791   match(Set dst (VectorReinterpret src));
 3792   ins_cost(125);
 3793   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3794   ins_encode %{
 3795     switch (Matcher::vector_length_in_bytes(this)) {
 3796       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3797       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3798       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3799       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3800       default: ShouldNotReachHere();
 3801     }
 3802   %}
 3803   ins_pipe( pipe_slow );
 3804 %}
 3805 
 3806 // ----------------------------------------------------------------------------------------------------
 3807 
 3808 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3809   match(Set dst (RoundDoubleMode src rmode));
 3810   format %{ "roundsd $dst,$src" %}
 3811   ins_cost(150);
 3812   ins_encode %{
 3813     assert(UseSSE >= 4, "required");
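          // roundsd writes only the low 64 bits of dst and leaves the rest unchanged; when
          // dst != src under plain SSE, clear dst first so the merge does not carry a
          // dependency on its stale upper bits.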
 3814     if ((UseAVX == 0) && ($dst$$XMMRegister != $src$$XMMRegister)) {
 3815       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 3816     }
 3817     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3818   %}
 3819   ins_pipe(pipe_slow);
 3820 %}
 3821 
 3822 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3823   match(Set dst (RoundDoubleMode con rmode));
 3824   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3825   ins_cost(150);
 3826   ins_encode %{
 3827     assert(UseSSE >= 4, "required");
 3828     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3829   %}
 3830   ins_pipe(pipe_slow);
 3831 %}
 3832 
 3833 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3834   predicate(Matcher::vector_length(n) < 8);
 3835   match(Set dst (RoundDoubleModeV src rmode));
 3836   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3837   ins_encode %{
 3838     assert(UseAVX > 0, "required");
 3839     int vlen_enc = vector_length_encoding(this);
 3840     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3841   %}
 3842   ins_pipe( pipe_slow );
 3843 %}
 3844 
 3845 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3846   predicate(Matcher::vector_length(n) == 8);
 3847   match(Set dst (RoundDoubleModeV src rmode));
 3848   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3849   ins_encode %{
 3850     assert(UseAVX > 2, "required");
 3851     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3852   %}
 3853   ins_pipe( pipe_slow );
 3854 %}
 3855 
 3856 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3857   predicate(Matcher::vector_length(n) < 8);
 3858   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3859   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3860   ins_encode %{
 3861     assert(UseAVX > 0, "required");
 3862     int vlen_enc = vector_length_encoding(this);
 3863     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3864   %}
 3865   ins_pipe( pipe_slow );
 3866 %}
 3867 
 3868 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3869   predicate(Matcher::vector_length(n) == 8);
 3870   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3871   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3872   ins_encode %{
 3873     assert(UseAVX > 2, "required");
 3874     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3875   %}
 3876   ins_pipe( pipe_slow );
 3877 %}
 3878 
 3879 instruct onspinwait() %{
 3880   match(OnSpinWait);
 3881   ins_cost(200);
 3882 
 3883   format %{
 3884     $$template
 3885     $$emit$$"pause\t! membar_onspinwait"
 3886   %}
 3887   ins_encode %{
 3888     __ pause();
 3889   %}
 3890   ins_pipe(pipe_slow);
 3891 %}
 3892 
 3893 // a * b + c
 3894 instruct fmaD_reg(regD a, regD b, regD c) %{
 3895   match(Set c (FmaD  c (Binary a b)));
 3896   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3897   ins_cost(150);
 3898   ins_encode %{
 3899     assert(UseFMA, "Needs FMA instructions support.");
 3900     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3901   %}
 3902   ins_pipe( pipe_slow );
 3903 %}
 3904 
 3905 // a * b + c
 3906 instruct fmaF_reg(regF a, regF b, regF c) %{
 3907   match(Set c (FmaF  c (Binary a b)));
 3908   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3909   ins_cost(150);
 3910   ins_encode %{
 3911     assert(UseFMA, "Needs FMA instructions support.");
 3912     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3913   %}
 3914   ins_pipe( pipe_slow );
 3915 %}
 3916 
 3917 // ====================VECTOR INSTRUCTIONS=====================================
 3918 
 3919 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3920 instruct MoveVec2Leg(legVec dst, vec src) %{
 3921   match(Set dst src);
 3922   format %{ "" %}
 3923   ins_encode %{
 3924     ShouldNotReachHere();
 3925   %}
 3926   ins_pipe( fpu_reg_reg );
 3927 %}
 3928 
 3929 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3930   match(Set dst src);
 3931   format %{ "" %}
 3932   ins_encode %{
 3933     ShouldNotReachHere();
 3934   %}
 3935   ins_pipe( fpu_reg_reg );
 3936 %}
 3937 
 3938 // ============================================================================
 3939 
 3940 // Load vectors generic operand pattern
 3941 instruct loadV(vec dst, memory mem) %{
 3942   match(Set dst (LoadVector mem));
 3943   ins_cost(125);
 3944   format %{ "load_vector $dst,$mem" %}
 3945   ins_encode %{
 3946     BasicType bt = Matcher::vector_element_basic_type(this);
 3947     __ load_vector(bt, $dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3948   %}
 3949   ins_pipe( pipe_slow );
 3950 %}
 3951 
 3952 // Store vectors generic operand pattern.
 3953 instruct storeV(memory mem, vec src) %{
 3954   match(Set mem (StoreVector mem src));
 3955   ins_cost(145);
 3956   format %{ "store_vector $mem,$src\n\t" %}
 3957   ins_encode %{
 3958     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3959       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3960       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3961       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3962       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3963       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3964       default: ShouldNotReachHere();
 3965     }
 3966   %}
 3967   ins_pipe( pipe_slow );
 3968 %}
 3969 
 3970 // ---------------------------------------- Gather ------------------------------------
 3971 
 3972 // Gather BYTE, SHORT, INT, LONG, FLOAT, DOUBLE
 3973 
 3974 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 3975   predicate(!VM_Version::supports_avx512vl() && !is_subword_type(Matcher::vector_element_basic_type(n)) &&
 3976             Matcher::vector_length_in_bytes(n) <= 32);
 3977   match(Set dst (LoadVectorGather mem idx));
 3978   effect(TEMP dst, TEMP tmp, TEMP mask);
 3979   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 3980   ins_encode %{
 3981     int vlen_enc = vector_length_encoding(this);
 3982     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3983     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
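          // vpcmpeqd of a register with itself yields all-ones, i.e. an all-lanes gather mask.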
 3984     __ vpcmpeqd($mask$$XMMRegister, $mask$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3985     __ lea($tmp$$Register, $mem$$Address);
 3986     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3987   %}
 3988   ins_pipe( pipe_slow );
 3989 %}
 3990 
 3991 
 3992 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 3993   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 3994             !is_subword_type(Matcher::vector_element_basic_type(n)));
 3995   match(Set dst (LoadVectorGather mem idx));
 3996   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 3997   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 3998   ins_encode %{
 3999     int vlen_enc = vector_length_encoding(this);
 4000     BasicType elem_bt = Matcher::vector_element_basic_type(this);
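          // kxnor of a mask register with itself sets every mask bit, enabling all gather lanes.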
 4001     __ kxnorwl($ktmp$$KRegister, $ktmp$$KRegister, $ktmp$$KRegister);
 4002     __ lea($tmp$$Register, $mem$$Address);
 4003     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4004   %}
 4005   ins_pipe( pipe_slow );
 4006 %}
 4007 
 4008 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4009   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64) &&
 4010             !is_subword_type(Matcher::vector_element_basic_type(n)));
 4011   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4012   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4013   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4014   ins_encode %{
 4015     assert(UseAVX > 2, "sanity");
 4016     int vlen_enc = vector_length_encoding(this);
 4017     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4018     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4019     // Note: Since the gather instruction partially updates the opmask register used
 4020     // for predication, move the mask operand into a temporary.
 4021     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4022     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4023     __ lea($tmp$$Register, $mem$$Address);
 4024     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4025   %}
 4026   ins_pipe( pipe_slow );
 4027 %}
 4028 
 4029 instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{
 4030   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4031   match(Set dst (LoadVectorGather mem idx_base));
 4032   effect(TEMP tmp, TEMP rtmp);
 4033   format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %}
 4034   ins_encode %{
 4035     int vlen_enc = vector_length_encoding(this);
 4036     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4037     __ lea($tmp$$Register, $mem$$Address);
 4038     __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc);
 4039   %}
 4040   ins_pipe( pipe_slow );
 4041 %}
 4042 
 4043 instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp,
 4044                              vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{
 4045   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4046   match(Set dst (LoadVectorGather mem idx_base));
 4047   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr);
 4048   format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %}
 4049   ins_encode %{
 4050     int vlen_enc = vector_length_encoding(this);
 4051     int vector_len = Matcher::vector_length(this);
 4052     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4053     __ lea($tmp$$Register, $mem$$Address);
 4054     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4055     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister,
 4056                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc);
 4057   %}
 4058   ins_pipe( pipe_slow );
 4059 %}
 4060 
 4061 instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{
 4062   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4063   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4064   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4065   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4066   ins_encode %{
 4067     int vlen_enc = vector_length_encoding(this);
 4068     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4069     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4072     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4073   %}
 4074   ins_pipe( pipe_slow );
 4075 %}
 4076 
 4077 instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp,
 4078                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{
 4079   predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4080   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4081   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4082   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4083   ins_encode %{
 4084     int vlen_enc = vector_length_encoding(this);
 4085     int vector_len = Matcher::vector_length(this);
 4086     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4087     __ xorq($mask_idx$$Register, $mask_idx$$Register);
 4088     __ lea($tmp$$Register, $mem$$Address);
 4089     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4090     __ kmovql($rtmp2$$Register, $mask$$KRegister);
 4091     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4092                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4093   %}
 4094   ins_pipe( pipe_slow );
 4095 %}
 4096 
 4097 instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{
 4098   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8);
 4099   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4100   effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr);
 4101   format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %}
 4102   ins_encode %{
 4103     int vlen_enc = vector_length_encoding(this);
 4104     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4105     __ lea($tmp$$Register, $mem$$Address);
 4106     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
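          // vpmovmskb produces one mask bit per byte; for shorts, pext with 0x55555555
          // compresses that to one bit per 16-bit element.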
 4107     if (elem_bt == T_SHORT) {
 4108       __ movl($mask_idx$$Register, 0x55555555);
 4109       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4110     }
 4111     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4112     __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc);
 4113   %}
 4114   ins_pipe( pipe_slow );
 4115 %}
 4116 
 4117 instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp,
 4118                                          vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{
 4119   predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8);
 4120   match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask)));
 4121   effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr);
 4122   format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %}
 4123   ins_encode %{
 4124     int vlen_enc = vector_length_encoding(this);
 4125     int vector_len = Matcher::vector_length(this);
 4126     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4127     __ lea($tmp$$Register, $mem$$Address);
 4128     __ movptr($idx_base_temp$$Register, $idx_base$$Register);
 4129     __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc);
 4130     if (elem_bt == T_SHORT) {
 4131       __ movl($mask_idx$$Register, 0x55555555);
 4132       __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register);
 4133     }
 4134     __ xorl($mask_idx$$Register, $mask_idx$$Register);
 4135     __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister,
 4136                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc);
 4137   %}
 4138   ins_pipe( pipe_slow );
 4139 %}
 4140 
 4141 // ====================Scatter=======================================
 4142 
 4143 // Scatter INT, LONG, FLOAT, DOUBLE
 4144 
 4145 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4146   predicate(UseAVX > 2);
 4147   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4148   effect(TEMP tmp, TEMP ktmp);
 4149   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4150   ins_encode %{
 4151     int vlen_enc = vector_length_encoding(this, $src);
 4152     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4153 
 4154     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4155     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4156 
 4157     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4158     __ lea($tmp$$Register, $mem$$Address);
 4159     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4160   %}
 4161   ins_pipe( pipe_slow );
 4162 %}
 4163 
 4164 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4165   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4166   effect(TEMP tmp, TEMP ktmp);
 4167   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4168   ins_encode %{
 4169     int vlen_enc = vector_length_encoding(this, $src);
 4170     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4171     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4172     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4173     // Note: Since the scatter instruction partially updates the opmask register used
 4174     // for predication, move the mask operand into a temporary.
 4175     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4176     __ lea($tmp$$Register, $mem$$Address);
 4177     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4178   %}
 4179   ins_pipe( pipe_slow );
 4180 %}
 4181 
 4182 // ====================REPLICATE=======================================
 4183 
 4184 // Replicate byte scalar to be vector
 4185 instruct vReplB_reg(vec dst, rRegI src) %{
 4186   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4187   match(Set dst (Replicate src));
 4188   format %{ "replicateB $dst,$src" %}
 4189   ins_encode %{
 4190     uint vlen = Matcher::vector_length(this);
 4191     if (UseAVX >= 2) {
 4192       int vlen_enc = vector_length_encoding(this);
 4193       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4194         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4195         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4196       } else {
 4197         __ movdl($dst$$XMMRegister, $src$$Register);
 4198         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4199       }
 4200     } else {
 4201       assert(UseAVX < 2, "");
 4202       __ movdl($dst$$XMMRegister, $src$$Register);
 4203       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4204       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4205       if (vlen >= 16) {
 4206         assert(vlen == 16, "");
 4207         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4208       }
 4209     }
 4210   %}
 4211   ins_pipe( pipe_slow );
 4212 %}
 4213 
 4214 instruct ReplB_mem(vec dst, memory mem) %{
 4215   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4216   match(Set dst (Replicate (LoadB mem)));
 4217   format %{ "replicateB $dst,$mem" %}
 4218   ins_encode %{
 4219     int vlen_enc = vector_length_encoding(this);
 4220     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4221   %}
 4222   ins_pipe( pipe_slow );
 4223 %}
 4224 
 4225 // ====================ReplicateS=======================================
 4226 
 4227 instruct vReplS_reg(vec dst, rRegI src) %{
 4228   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4229   match(Set dst (Replicate src));
 4230   format %{ "replicateS $dst,$src" %}
 4231   ins_encode %{
 4232     uint vlen = Matcher::vector_length(this);
 4233     int vlen_enc = vector_length_encoding(this);
 4234     if (UseAVX >= 2) {
 4235       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4236         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4237         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4238       } else {
 4239         __ movdl($dst$$XMMRegister, $src$$Register);
 4240         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4241       }
 4242     } else {
 4243       assert(UseAVX < 2, "");
 4244       __ movdl($dst$$XMMRegister, $src$$Register);
 4245       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4246       if (vlen >= 8) {
 4247         assert(vlen == 8, "");
 4248         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4249       }
 4250     }
 4251   %}
 4252   ins_pipe( pipe_slow );
 4253 %}
 4254 
 4255 instruct ReplHF_imm(vec dst, immH con, rRegI rtmp) %{
 4256   match(Set dst (Replicate con));
 4257   effect(TEMP rtmp);
 4258   format %{ "replicateHF $dst, $con \t! using $rtmp as TEMP" %}
 4259   ins_encode %{
 4260     int vlen_enc = vector_length_encoding(this);
 4261     BasicType bt = Matcher::vector_element_basic_type(this);
 4262     assert(VM_Version::supports_avx512_fp16() && bt == T_SHORT, "");
 4263     __ movl($rtmp$$Register, $con$$constant);
 4264     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4265   %}
 4266   ins_pipe( pipe_slow );
 4267 %}
 4268 
 4269 instruct ReplHF_reg(vec dst, regF src, rRegI rtmp) %{
 4270   predicate(VM_Version::supports_avx512_fp16() && Matcher::vector_element_basic_type(n) == T_SHORT);
 4271   match(Set dst (Replicate src));
 4272   effect(TEMP rtmp);
 4273   format %{ "replicateHF $dst, $src \t! using $rtmp as TEMP" %}
 4274   ins_encode %{
 4275     int vlen_enc = vector_length_encoding(this);
 4276     __ vmovw($rtmp$$Register, $src$$XMMRegister);
 4277     __ evpbroadcastw($dst$$XMMRegister, $rtmp$$Register, vlen_enc);
 4278   %}
 4279   ins_pipe( pipe_slow );
 4280 %}
 4281 
 4282 instruct ReplS_mem(vec dst, memory mem) %{
 4283   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4284   match(Set dst (Replicate (LoadS mem)));
 4285   format %{ "replicateS $dst,$mem" %}
 4286   ins_encode %{
 4287     int vlen_enc = vector_length_encoding(this);
 4288     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4289   %}
 4290   ins_pipe( pipe_slow );
 4291 %}
 4292 
 4293 // ====================ReplicateI=======================================
 4294 
 4295 instruct ReplI_reg(vec dst, rRegI src) %{
 4296   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4297   match(Set dst (Replicate src));
 4298   format %{ "replicateI $dst,$src" %}
 4299   ins_encode %{
 4300     uint vlen = Matcher::vector_length(this);
 4301     int vlen_enc = vector_length_encoding(this);
 4302     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4303       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4304     } else if (VM_Version::supports_avx2()) {
 4305       __ movdl($dst$$XMMRegister, $src$$Register);
 4306       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4307     } else {
 4308       __ movdl($dst$$XMMRegister, $src$$Register);
 4309       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4310     }
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 instruct ReplI_mem(vec dst, memory mem) %{
 4316   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4317   match(Set dst (Replicate (LoadI mem)));
 4318   format %{ "replicateI $dst,$mem" %}
 4319   ins_encode %{
 4320     int vlen_enc = vector_length_encoding(this);
 4321     if (VM_Version::supports_avx2()) {
 4322       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4323     } else if (VM_Version::supports_avx()) {
 4324       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4325     } else {
 4326       __ movdl($dst$$XMMRegister, $mem$$Address);
 4327       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4328     }
 4329   %}
 4330   ins_pipe( pipe_slow );
 4331 %}
 4332 
 4333 instruct ReplI_imm(vec dst, immI con) %{
 4334   predicate(Matcher::is_non_long_integral_vector(n));
 4335   match(Set dst (Replicate con));
 4336   format %{ "replicateI $dst,$con" %}
 4337   ins_encode %{
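          // The constant table entry needs only enough copies of the immediate to cover the
          // smallest loadable unit: 4 bytes with an AVX broadcast, 8 bytes for SSE3 movddup,
          // otherwise a full 16-byte entry.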
 4338     InternalAddress addr = $constantaddress(vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4339                                                            (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 16) /
 4340                                                                    type2aelembytes(Matcher::vector_element_basic_type(this))));
 4341     BasicType bt = Matcher::vector_element_basic_type(this);
 4342     int vlen = Matcher::vector_length_in_bytes(this);
 4343     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4344   %}
 4345   ins_pipe( pipe_slow );
 4346 %}
 4347 
 4348 // Replicate scalar zero to be vector
 4349 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4350   predicate(Matcher::is_non_long_integral_vector(n));
 4351   match(Set dst (Replicate zero));
 4352   format %{ "replicateI $dst,$zero" %}
 4353   ins_encode %{
 4354     int vlen_enc = vector_length_encoding(this);
 4355     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4356       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4357     } else {
 4358       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4359     }
 4360   %}
 4361   ins_pipe( fpu_reg_reg );
 4362 %}
 4363 
 4364 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4365   predicate(Matcher::is_non_long_integral_vector(n));
 4366   match(Set dst (Replicate con));
 4367   format %{ "vallones $dst" %}
 4368   ins_encode %{
 4369     int vector_len = vector_length_encoding(this);
 4370     __ vallones($dst$$XMMRegister, vector_len);
 4371   %}
 4372   ins_pipe( pipe_slow );
 4373 %}
 4374 
 4375 // ====================ReplicateL=======================================
 4376 
 4377 // Replicate long (8 byte) scalar to be vector
 4378 instruct ReplL_reg(vec dst, rRegL src) %{
 4379   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4380   match(Set dst (Replicate src));
 4381   format %{ "replicateL $dst,$src" %}
 4382   ins_encode %{
 4383     int vlen = Matcher::vector_length(this);
 4384     int vlen_enc = vector_length_encoding(this);
 4385     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4386       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4387     } else if (VM_Version::supports_avx2()) {
 4388       __ movdq($dst$$XMMRegister, $src$$Register);
 4389       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4390     } else {
 4391       __ movdq($dst$$XMMRegister, $src$$Register);
 4392       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4393     }
 4394   %}
 4395   ins_pipe( pipe_slow );
 4396 %}
 4397 
 4398 instruct ReplL_mem(vec dst, memory mem) %{
 4399   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4400   match(Set dst (Replicate (LoadL mem)));
 4401   format %{ "replicateL $dst,$mem" %}
 4402   ins_encode %{
 4403     int vlen_enc = vector_length_encoding(this);
 4404     if (VM_Version::supports_avx2()) {
 4405       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4406     } else if (VM_Version::supports_sse3()) {
 4407       __ movddup($dst$$XMMRegister, $mem$$Address);
 4408     } else {
 4409       __ movq($dst$$XMMRegister, $mem$$Address);
 4410       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4411     }
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 
 4416 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4417 instruct ReplL_imm(vec dst, immL con) %{
 4418   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4419   match(Set dst (Replicate con));
 4420   format %{ "replicateL $dst,$con" %}
 4421   ins_encode %{
 4422     InternalAddress addr = $constantaddress(vreplicate_imm(T_LONG, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4423     int vlen = Matcher::vector_length_in_bytes(this);
 4424     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4425   %}
 4426   ins_pipe( pipe_slow );
 4427 %}
 4428 
 4429 instruct ReplL_zero(vec dst, immL0 zero) %{
 4430   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4431   match(Set dst (Replicate zero));
 4432   format %{ "replicateL $dst,$zero" %}
 4433   ins_encode %{
 4434     int vlen_enc = vector_length_encoding(this);
 4435     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4436       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4437     } else {
 4438       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4439     }
 4440   %}
 4441   ins_pipe( fpu_reg_reg );
 4442 %}
 4443 
 4444 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4445   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4446   match(Set dst (Replicate con));
 4447   format %{ "vallones $dst" %}
 4448   ins_encode %{
 4449     int vector_len = vector_length_encoding(this);
 4450     __ vallones($dst$$XMMRegister, vector_len);
 4451   %}
 4452   ins_pipe( pipe_slow );
 4453 %}
 4454 
 4455 // ====================ReplicateF=======================================
 4456 
 4457 instruct vReplF_reg(vec dst, vlRegF src) %{
 4458   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4459   match(Set dst (Replicate src));
 4460   format %{ "replicateF $dst,$src" %}
 4461   ins_encode %{
 4462     uint vlen = Matcher::vector_length(this);
 4463     int vlen_enc = vector_length_encoding(this);
 4464     if (vlen <= 4) {
 4465       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4466     } else if (VM_Version::supports_avx2()) {
 4467       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4468     } else {
 4469       assert(vlen == 8, "sanity");
 4470       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4471       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4472     }
 4473   %}
 4474   ins_pipe( pipe_slow );
 4475 %}
 4476 
 4477 instruct ReplF_reg(vec dst, vlRegF src) %{
 4478   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4479   match(Set dst (Replicate src));
 4480   format %{ "replicateF $dst,$src" %}
 4481   ins_encode %{
 4482     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4483   %}
 4484   ins_pipe( pipe_slow );
 4485 %}
 4486 
 4487 instruct ReplF_mem(vec dst, memory mem) %{
 4488   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4489   match(Set dst (Replicate (LoadF mem)));
 4490   format %{ "replicateF $dst,$mem" %}
 4491   ins_encode %{
 4492     int vlen_enc = vector_length_encoding(this);
 4493     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4494   %}
 4495   ins_pipe( pipe_slow );
 4496 %}
 4497 
 4498 // Replicate float scalar immediate to be vector by loading from const table.
 4499 instruct ReplF_imm(vec dst, immF con) %{
 4500   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4501   match(Set dst (Replicate con));
 4502   format %{ "replicateF $dst,$con" %}
 4503   ins_encode %{
 4504     InternalAddress addr = $constantaddress(vreplicate_imm(T_FLOAT, $con$$constant,
 4505                                                            VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 4));
 4506     int vlen = Matcher::vector_length_in_bytes(this);
 4507     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4508   %}
 4509   ins_pipe( pipe_slow );
 4510 %}
 4511 
 4512 instruct ReplF_zero(vec dst, immF0 zero) %{
 4513   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4514   match(Set dst (Replicate zero));
 4515   format %{ "replicateF $dst,$zero" %}
 4516   ins_encode %{
 4517     int vlen_enc = vector_length_encoding(this);
 4518     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4519       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4520     } else {
 4521       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4522     }
 4523   %}
 4524   ins_pipe( fpu_reg_reg );
 4525 %}
 4526 
 4527 // ====================ReplicateD=======================================
 4528 
 4529 // Replicate double (8 bytes) scalar to be vector
 4530 instruct vReplD_reg(vec dst, vlRegD src) %{
 4531   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4532   match(Set dst (Replicate src));
 4533   format %{ "replicateD $dst,$src" %}
 4534   ins_encode %{
 4535     uint vlen = Matcher::vector_length(this);
 4536     int vlen_enc = vector_length_encoding(this);
 4537     if (vlen <= 2) {
 4538       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4539     } else if (VM_Version::supports_avx2()) {
 4540       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4541     } else {
 4542       assert(vlen == 4, "sanity");
 4543       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4544       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4545     }
 4546   %}
 4547   ins_pipe( pipe_slow );
 4548 %}
 4549 
 4550 instruct ReplD_reg(vec dst, vlRegD src) %{
 4551   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4552   match(Set dst (Replicate src));
 4553   format %{ "replicateD $dst,$src" %}
 4554   ins_encode %{
 4555     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4556   %}
 4557   ins_pipe( pipe_slow );
 4558 %}
 4559 
 4560 instruct ReplD_mem(vec dst, memory mem) %{
 4561   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4562   match(Set dst (Replicate (LoadD mem)));
 4563   format %{ "replicateD $dst,$mem" %}
 4564   ins_encode %{
 4565     if (Matcher::vector_length(this) >= 4) {
 4566       int vlen_enc = vector_length_encoding(this);
 4567       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4568     } else {
 4569       __ movddup($dst$$XMMRegister, $mem$$Address);
 4570     }
 4571   %}
 4572   ins_pipe( pipe_slow );
 4573 %}
 4574 
 4575 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4576 instruct ReplD_imm(vec dst, immD con) %{
 4577   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4578   match(Set dst (Replicate con));
 4579   format %{ "replicateD $dst,$con" %}
 4580   ins_encode %{
 4581     InternalAddress addr = $constantaddress(vreplicate_imm(T_DOUBLE, $con$$constant, VM_Version::supports_sse3() ? 1 : 2));
 4582     int vlen = Matcher::vector_length_in_bytes(this);
 4583     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4584   %}
 4585   ins_pipe( pipe_slow );
 4586 %}
 4587 
 4588 instruct ReplD_zero(vec dst, immD0 zero) %{
 4589   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4590   match(Set dst (Replicate zero));
 4591   format %{ "replicateD $dst,$zero" %}
 4592   ins_encode %{
 4593     int vlen_enc = vector_length_encoding(this);
 4594     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4595       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4596     } else {
 4597       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4598     }
 4599   %}
 4600   ins_pipe( fpu_reg_reg );
 4601 %}
 4602 
 4603 // ====================VECTOR INSERT=======================================
 4604 
 4605 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4606   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4607   match(Set dst (VectorInsert (Binary dst val) idx));
 4608   format %{ "vector_insert $dst,$val,$idx" %}
 4609   ins_encode %{
 4610     assert(UseSSE >= 4, "required");
 4611     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4612 
 4613     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4614 
 4615     assert(is_integral_type(elem_bt), "");
 4616     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4617 
 4618     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4619   %}
 4620   ins_pipe( pipe_slow );
 4621 %}
 4622 
 4623 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4624   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4625   match(Set dst (VectorInsert (Binary src val) idx));
 4626   effect(TEMP vtmp);
 4627   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4628   ins_encode %{
 4629     int vlen_enc = Assembler::AVX_256bit;
 4630     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4631     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4632     int log2epr = log2(elem_per_lane);
 4633 
 4634     assert(is_integral_type(elem_bt), "sanity");
 4635     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4636 
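          // Split the element index into a 128-bit lane selector (y_idx) and the position
          // within that lane (x_idx).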
 4637     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4638     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4639     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4640     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4641     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4642   %}
 4643   ins_pipe( pipe_slow );
 4644 %}
 4645 
 4646 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4647   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4648   match(Set dst (VectorInsert (Binary src val) idx));
 4649   effect(TEMP vtmp);
 4650   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4651   ins_encode %{
 4652     assert(UseAVX > 2, "sanity");
 4653 
 4654     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4655     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4656     int log2epr = log2(elem_per_lane);
 4657 
 4658     assert(is_integral_type(elem_bt), "");
 4659     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4660 
 4661     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4662     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4663     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4664     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4665     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4666   %}
 4667   ins_pipe( pipe_slow );
 4668 %}
 4669 
 4670 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4671   predicate(Matcher::vector_length(n) == 2);
 4672   match(Set dst (VectorInsert (Binary dst val) idx));
 4673   format %{ "vector_insert $dst,$val,$idx" %}
 4674   ins_encode %{
 4675     assert(UseSSE >= 4, "required");
 4676     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4677     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4678 
 4679     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4680   %}
 4681   ins_pipe( pipe_slow );
 4682 %}
 4683 
 4684 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4685   predicate(Matcher::vector_length(n) == 4);
 4686   match(Set dst (VectorInsert (Binary src val) idx));
 4687   effect(TEMP vtmp);
 4688   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4689   ins_encode %{
 4690     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4691     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4692 
 4693     uint x_idx = $idx$$constant & right_n_bits(1);
 4694     uint y_idx = ($idx$$constant >> 1) & 1;
 4695     int vlen_enc = Assembler::AVX_256bit;
 4696     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4697     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4698     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4699   %}
 4700   ins_pipe( pipe_slow );
 4701 %}
 4702 
 4703 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4704   predicate(Matcher::vector_length(n) == 8);
 4705   match(Set dst (VectorInsert (Binary src val) idx));
 4706   effect(TEMP vtmp);
 4707   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4708   ins_encode %{
 4709     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4710     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4711 
 4712     uint x_idx = $idx$$constant & right_n_bits(1);
 4713     uint y_idx = ($idx$$constant >> 1) & 3;
 4714     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4715     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4716     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4717   %}
 4718   ins_pipe( pipe_slow );
 4719 %}
 4720 
 4721 instruct insertF(vec dst, regF val, immU8 idx) %{
 4722   predicate(Matcher::vector_length(n) < 8);
 4723   match(Set dst (VectorInsert (Binary dst val) idx));
 4724   format %{ "vector_insert $dst,$val,$idx" %}
 4725   ins_encode %{
 4726     assert(UseSSE >= 4, "sanity");
 4727 
 4728     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4729     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4730 
 4731     uint x_idx = $idx$$constant & right_n_bits(2);
 4732     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4733   %}
 4734   ins_pipe( pipe_slow );
 4735 %}
 4736 
 4737 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4738   predicate(Matcher::vector_length(n) >= 8);
 4739   match(Set dst (VectorInsert (Binary src val) idx));
 4740   effect(TEMP vtmp);
 4741   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4742   ins_encode %{
 4743     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4744     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4745 
 4746     int vlen = Matcher::vector_length(this);
 4747     uint x_idx = $idx$$constant & right_n_bits(2);
 4748     if (vlen == 8) {
 4749       uint y_idx = ($idx$$constant >> 2) & 1;
 4750       int vlen_enc = Assembler::AVX_256bit;
 4751       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4752       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4753       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4754     } else {
 4755       assert(vlen == 16, "sanity");
 4756       uint y_idx = ($idx$$constant >> 2) & 3;
 4757       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4758       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4759       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4760     }
 4761   %}
 4762   ins_pipe( pipe_slow );
 4763 %}
 4764 
 4765 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4766   predicate(Matcher::vector_length(n) == 2);
 4767   match(Set dst (VectorInsert (Binary dst val) idx));
 4768   effect(TEMP tmp);
 4769   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4770   ins_encode %{
 4771     assert(UseSSE >= 4, "sanity");
 4772     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4773     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4774 
 4775     __ movq($tmp$$Register, $val$$XMMRegister);
 4776     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4777   %}
 4778   ins_pipe( pipe_slow );
 4779 %}
 4780 
 4781 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4782   predicate(Matcher::vector_length(n) == 4);
 4783   match(Set dst (VectorInsert (Binary src val) idx));
 4784   effect(TEMP vtmp, TEMP tmp);
 4785   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4786   ins_encode %{
 4787     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4788     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4789 
 4790     uint x_idx = $idx$$constant & right_n_bits(1);
 4791     uint y_idx = ($idx$$constant >> 1) & 1;
 4792     int vlen_enc = Assembler::AVX_256bit;
 4793     __ movq($tmp$$Register, $val$$XMMRegister);
 4794     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4795     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4796     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4797   %}
 4798   ins_pipe( pipe_slow );
 4799 %}
 4800 
 4801 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4802   predicate(Matcher::vector_length(n) == 8);
 4803   match(Set dst (VectorInsert (Binary src val) idx));
 4804   effect(TEMP tmp, TEMP vtmp);
 4805   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4806   ins_encode %{
 4807     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4808     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4809 
 4810     uint x_idx = $idx$$constant & right_n_bits(1);
 4811     uint y_idx = ($idx$$constant >> 1) & 3;
 4812     __ movq($tmp$$Register, $val$$XMMRegister);
 4813     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4814     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4815     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4816   %}
 4817   ins_pipe( pipe_slow );
 4818 %}
 4819 
 4820 // ====================REDUCTION ARITHMETIC=======================================
 4821 
 4822 // =======================Int Reduction==========================================
 4823 
 4824 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4825   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4826   match(Set dst (AddReductionVI src1 src2));
 4827   match(Set dst (MulReductionVI src1 src2));
 4828   match(Set dst (AndReductionV  src1 src2));
 4829   match(Set dst ( OrReductionV  src1 src2));
 4830   match(Set dst (XorReductionV  src1 src2));
 4831   match(Set dst (MinReductionV  src1 src2));
 4832   match(Set dst (MaxReductionV  src1 src2));
 4833   effect(TEMP vtmp1, TEMP vtmp2);
 4834   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4835   ins_encode %{
 4836     int opcode = this->ideal_Opcode();
 4837     int vlen = Matcher::vector_length(this, $src2);
 4838     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4839   %}
 4840   ins_pipe( pipe_slow );
 4841 %}
 4842 
 4843 // =======================Long Reduction==========================================
 4844 
 4845 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4846   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4847   match(Set dst (AddReductionVL src1 src2));
 4848   match(Set dst (MulReductionVL src1 src2));
 4849   match(Set dst (AndReductionV  src1 src2));
 4850   match(Set dst ( OrReductionV  src1 src2));
 4851   match(Set dst (XorReductionV  src1 src2));
 4852   match(Set dst (MinReductionV  src1 src2));
 4853   match(Set dst (MaxReductionV  src1 src2));
 4854   effect(TEMP vtmp1, TEMP vtmp2);
 4855   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4856   ins_encode %{
 4857     int opcode = this->ideal_Opcode();
 4858     int vlen = Matcher::vector_length(this, $src2);
 4859     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4860   %}
 4861   ins_pipe( pipe_slow );
 4862 %}
 4863 
 4864 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4865   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4866   match(Set dst (AddReductionVL src1 src2));
 4867   match(Set dst (MulReductionVL src1 src2));
 4868   match(Set dst (AndReductionV  src1 src2));
 4869   match(Set dst ( OrReductionV  src1 src2));
 4870   match(Set dst (XorReductionV  src1 src2));
 4871   match(Set dst (MinReductionV  src1 src2));
 4872   match(Set dst (MaxReductionV  src1 src2));
 4873   effect(TEMP vtmp1, TEMP vtmp2);
 4874   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4875   ins_encode %{
 4876     int opcode = this->ideal_Opcode();
 4877     int vlen = Matcher::vector_length(this, $src2);
 4878     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4879   %}
 4880   ins_pipe( pipe_slow );
 4881 %}
 4882 
 4883 // =======================Float Reduction==========================================
 4884 
 4885 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4886   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
 4887   match(Set dst (AddReductionVF dst src));
 4888   match(Set dst (MulReductionVF dst src));
 4889   effect(TEMP dst, TEMP vtmp);
 4890   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4891   ins_encode %{
 4892     int opcode = this->ideal_Opcode();
 4893     int vlen = Matcher::vector_length(this, $src);
 4894     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4895   %}
 4896   ins_pipe( pipe_slow );
 4897 %}
 4898 
 4899 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4900   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 4901   match(Set dst (AddReductionVF dst src));
 4902   match(Set dst (MulReductionVF dst src));
 4903   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4904   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4905   ins_encode %{
 4906     int opcode = this->ideal_Opcode();
 4907     int vlen = Matcher::vector_length(this, $src);
 4908     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4909   %}
 4910   ins_pipe( pipe_slow );
 4911 %}
 4912 
 4913 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4914   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
 4915   match(Set dst (AddReductionVF dst src));
 4916   match(Set dst (MulReductionVF dst src));
 4917   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4918   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4919   ins_encode %{
 4920     int opcode = this->ideal_Opcode();
 4921     int vlen = Matcher::vector_length(this, $src);
 4922     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4923   %}
 4924   ins_pipe( pipe_slow );
 4925 %}
 4926 
 4927 
 4928 instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
 4929   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4930   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4931   // src1 contains reduction identity
 4932   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 4933   match(Set dst (AddReductionVF src1 src2));
 4934   match(Set dst (MulReductionVF src1 src2));
 4935   effect(TEMP dst);
 4936   format %{ "vector_reduction_float  $dst,$src1,$src2 ;" %}
 4937   ins_encode %{
 4938     int opcode = this->ideal_Opcode();
 4939     int vlen = Matcher::vector_length(this, $src2);
 4940     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 4941   %}
 4942   ins_pipe( pipe_slow );
 4943 %}
 4944 
 4945 instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
 4946   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4947   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4948   // src1 contains reduction identity
 4949   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 4950   match(Set dst (AddReductionVF src1 src2));
 4951   match(Set dst (MulReductionVF src1 src2));
 4952   effect(TEMP dst, TEMP vtmp);
 4953   format %{ "vector_reduction_float  $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 4954   ins_encode %{
 4955     int opcode = this->ideal_Opcode();
 4956     int vlen = Matcher::vector_length(this, $src2);
 4957     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 4958   %}
 4959   ins_pipe( pipe_slow );
 4960 %}
 4961 
 4962 instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
 4963   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4964   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4965   // src1 contains reduction identity
 4966   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 4967   match(Set dst (AddReductionVF src1 src2));
 4968   match(Set dst (MulReductionVF src1 src2));
 4969   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4970   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4971   ins_encode %{
 4972     int opcode = this->ideal_Opcode();
 4973     int vlen = Matcher::vector_length(this, $src2);
 4974     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4975   %}
 4976   ins_pipe( pipe_slow );
 4977 %}
 4978 
 4979 instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4980   // Non-strictly ordered floating-point add/mul reduction for floats. This rule is
 4981   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 4982   // src1 contains reduction identity
 4983   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
 4984   match(Set dst (AddReductionVF src1 src2));
 4985   match(Set dst (MulReductionVF src1 src2));
 4986   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4987   format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4988   ins_encode %{
 4989     int opcode = this->ideal_Opcode();
 4990     int vlen = Matcher::vector_length(this, $src2);
 4991     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4992   %}
 4993   ins_pipe( pipe_slow );
 4994 %}
 4995 
 4996 // =======================Double Reduction==========================================
 4997 
 4998 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4999   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
 5000   match(Set dst (AddReductionVD dst src));
 5001   match(Set dst (MulReductionVD dst src));
 5002   effect(TEMP dst, TEMP vtmp);
 5003   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5004   ins_encode %{
 5005     int opcode = this->ideal_Opcode();
 5006     int vlen = Matcher::vector_length(this, $src);
 5007     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5008   %}
 5009   ins_pipe( pipe_slow );
 5010 %}
 5011 
 5012 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5013   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
 5014   match(Set dst (AddReductionVD dst src));
 5015   match(Set dst (MulReductionVD dst src));
 5016   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5017   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5018   ins_encode %{
 5019     int opcode = this->ideal_Opcode();
 5020     int vlen = Matcher::vector_length(this, $src);
 5021     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5022   %}
 5023   ins_pipe( pipe_slow );
 5024 %}
 5025 
 5026 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5027   predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
 5028   match(Set dst (AddReductionVD dst src));
 5029   match(Set dst (MulReductionVD dst src));
 5030   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5031   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5032   ins_encode %{
 5033     int opcode = this->ideal_Opcode();
 5034     int vlen = Matcher::vector_length(this, $src);
 5035     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5036   %}
 5037   ins_pipe( pipe_slow );
 5038 %}
 5039 
 5040 instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
 5041   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5042   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5043   // src1 contains reduction identity
 5044   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
 5045   match(Set dst (AddReductionVD src1 src2));
 5046   match(Set dst (MulReductionVD src1 src2));
 5047   effect(TEMP dst);
 5048   format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
 5049   ins_encode %{
 5050     int opcode = this->ideal_Opcode();
 5051     int vlen = Matcher::vector_length(this, $src2);
 5052     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
 5053   %}
 5054   ins_pipe( pipe_slow );
 5055 %}
 5056 
 5057 instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
 5058   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5059   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5060   // src1 contains reduction identity
 5061   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
 5062   match(Set dst (AddReductionVD src1 src2));
 5063   match(Set dst (MulReductionVD src1 src2));
 5064   effect(TEMP dst, TEMP vtmp);
 5065   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
 5066   ins_encode %{
 5067     int opcode = this->ideal_Opcode();
 5068     int vlen = Matcher::vector_length(this, $src2);
 5069     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 5070   %}
 5071   ins_pipe( pipe_slow );
 5072 %}
 5073 
 5074 instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5075   // Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
 5076   // intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
 5077   // src1 contains reduction identity
 5078   predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
 5079   match(Set dst (AddReductionVD src1 src2));
 5080   match(Set dst (MulReductionVD src1 src2));
 5081   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5082   format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5083   ins_encode %{
 5084     int opcode = this->ideal_Opcode();
 5085     int vlen = Matcher::vector_length(this, $src2);
 5086     __ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5087   %}
 5088   ins_pipe( pipe_slow );
 5089 %}
 5090 
 5091 // =======================Byte Reduction==========================================
 5092 
 5093 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5094   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5095   match(Set dst (AddReductionVI src1 src2));
 5096   match(Set dst (AndReductionV  src1 src2));
 5097   match(Set dst ( OrReductionV  src1 src2));
 5098   match(Set dst (XorReductionV  src1 src2));
 5099   match(Set dst (MinReductionV  src1 src2));
 5100   match(Set dst (MaxReductionV  src1 src2));
 5101   effect(TEMP vtmp1, TEMP vtmp2);
 5102   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5103   ins_encode %{
 5104     int opcode = this->ideal_Opcode();
 5105     int vlen = Matcher::vector_length(this, $src2);
 5106     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5107   %}
 5108   ins_pipe( pipe_slow );
 5109 %}
 5110 
 5111 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5112   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5113   match(Set dst (AddReductionVI src1 src2));
 5114   match(Set dst (AndReductionV  src1 src2));
 5115   match(Set dst ( OrReductionV  src1 src2));
 5116   match(Set dst (XorReductionV  src1 src2));
 5117   match(Set dst (MinReductionV  src1 src2));
 5118   match(Set dst (MaxReductionV  src1 src2));
 5119   effect(TEMP vtmp1, TEMP vtmp2);
 5120   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5121   ins_encode %{
 5122     int opcode = this->ideal_Opcode();
 5123     int vlen = Matcher::vector_length(this, $src2);
 5124     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5125   %}
 5126   ins_pipe( pipe_slow );
 5127 %}
 5128 
 5129 // =======================Short Reduction==========================================
 5130 
 5131 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5132   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5133   match(Set dst (AddReductionVI src1 src2));
 5134   match(Set dst (MulReductionVI src1 src2));
 5135   match(Set dst (AndReductionV  src1 src2));
 5136   match(Set dst ( OrReductionV  src1 src2));
 5137   match(Set dst (XorReductionV  src1 src2));
 5138   match(Set dst (MinReductionV  src1 src2));
 5139   match(Set dst (MaxReductionV  src1 src2));
 5140   effect(TEMP vtmp1, TEMP vtmp2);
 5141   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5142   ins_encode %{
 5143     int opcode = this->ideal_Opcode();
 5144     int vlen = Matcher::vector_length(this, $src2);
 5145     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5146   %}
 5147   ins_pipe( pipe_slow );
 5148 %}
 5149 
 5150 // =======================Mul Reduction==========================================
 5151 
 5152 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5153   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5154             Matcher::vector_length(n->in(2)) <= 32); // src2
 5155   match(Set dst (MulReductionVI src1 src2));
 5156   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5157   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5158   ins_encode %{
 5159     int opcode = this->ideal_Opcode();
 5160     int vlen = Matcher::vector_length(this, $src2);
 5161     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5162   %}
 5163   ins_pipe( pipe_slow );
 5164 %}
 5165 
 5166 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5167   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5168             Matcher::vector_length(n->in(2)) == 64); // src2
 5169   match(Set dst (MulReductionVI src1 src2));
 5170   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5171   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5172   ins_encode %{
 5173     int opcode = this->ideal_Opcode();
 5174     int vlen = Matcher::vector_length(this, $src2);
 5175     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5176   %}
 5177   ins_pipe( pipe_slow );
 5178 %}
 5179 
 5180 //--------------------Min/Max Float Reduction --------------------
 5181 // Float Min/Max Reduction
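      // The *_reduction2F / *_reductionF rules below require src1 to be the
      // operation's identity (+Inf for Min, -Inf for Max, as their predicates
      // check), so the encoder only has to reduce src2. The *_av rules cover the
      // accumulating form (dst = Min/Max(dst, reduce(src))) and therefore pass
      // 'true' as the third argument to reduceFloatMinMax. The Double reduction
      // rules further down follow the same split.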
 5182 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5183                             legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5184   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5185             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5186              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5187             Matcher::vector_length(n->in(2)) == 2);
 5188   match(Set dst (MinReductionV src1 src2));
 5189   match(Set dst (MaxReductionV src1 src2));
 5190   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5191   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5192   ins_encode %{
 5193     assert(UseAVX > 0, "sanity");
 5194 
 5195     int opcode = this->ideal_Opcode();
 5196     int vlen = Matcher::vector_length(this, $src2);
 5197     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5198                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5199   %}
 5200   ins_pipe( pipe_slow );
 5201 %}
 5202 
 5203 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5204                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5205   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5206             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5207              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5208             Matcher::vector_length(n->in(2)) >= 4);
 5209   match(Set dst (MinReductionV src1 src2));
 5210   match(Set dst (MaxReductionV src1 src2));
 5211   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5212   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5213   ins_encode %{
 5214     assert(UseAVX > 0, "sanity");
 5215 
 5216     int opcode = this->ideal_Opcode();
 5217     int vlen = Matcher::vector_length(this, $src2);
 5218     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5219                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5220   %}
 5221   ins_pipe( pipe_slow );
 5222 %}
 5223 
 5224 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, legVec atmp,
 5225                                legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5226   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5227             Matcher::vector_length(n->in(2)) == 2);
 5228   match(Set dst (MinReductionV dst src));
 5229   match(Set dst (MaxReductionV dst src));
 5230   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5231   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5232   ins_encode %{
 5233     assert(UseAVX > 0, "sanity");
 5234 
 5235     int opcode = this->ideal_Opcode();
 5236     int vlen = Matcher::vector_length(this, $src);
 5237     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5238                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 
 5244 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, legVec atmp, legVec btmp,
 5245                               legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5246   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5247             Matcher::vector_length(n->in(2)) >= 4);
 5248   match(Set dst (MinReductionV dst src));
 5249   match(Set dst (MaxReductionV dst src));
 5250   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5251   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5252   ins_encode %{
 5253     assert(UseAVX > 0, "sanity");
 5254 
 5255     int opcode = this->ideal_Opcode();
 5256     int vlen = Matcher::vector_length(this, $src);
 5257     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5258                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5259   %}
 5260   ins_pipe( pipe_slow );
 5261 %}
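
      // The AVX10.2 rules below get by with far fewer temporaries, presumably
      // because reduceFloatMinMax/reduceDoubleMinMax can use the direct
      // IEEE-754-2019 minimum/maximum instructions introduced with AVX10.2 rather
      // than the compare-and-blend sequences needed on older hardware; the unused
      // temporary slots are simply passed as xnoreg.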
 5262 
 5263 instruct minmax_reduction2F_avx10(regF dst, immF src1, vec src2, vec xtmp1) %{
 5264   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5265             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5266              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5267             Matcher::vector_length(n->in(2)) == 2);
 5268   match(Set dst (MinReductionV src1 src2));
 5269   match(Set dst (MaxReductionV src1 src2));
 5270   effect(TEMP dst, TEMP xtmp1);
 5271   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 as TEMP" %}
 5272   ins_encode %{
 5273     int opcode = this->ideal_Opcode();
 5274     int vlen = Matcher::vector_length(this, $src2);
 5275     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5276                          xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5277   %}
 5278   ins_pipe( pipe_slow );
 5279 %}
 5280 
 5281 instruct minmax_reductionF_avx10(regF dst, immF src1, vec src2, vec xtmp1, vec xtmp2) %{
 5282   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5283             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5284              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5285             Matcher::vector_length(n->in(2)) >= 4);
 5286   match(Set dst (MinReductionV src1 src2));
 5287   match(Set dst (MaxReductionV src1 src2));
 5288   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5289   format %{ "vector_minmax_reduction $dst, $src1, $src2 \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5290   ins_encode %{
 5291     int opcode = this->ideal_Opcode();
 5292     int vlen = Matcher::vector_length(this, $src2);
 5293     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5294                          xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5295   %}
 5296   ins_pipe( pipe_slow );
 5297 %}
 5298 
 5299 instruct minmax_reduction2F_avx10_av(regF dst, vec src, vec xtmp1) %{
 5300   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5301             Matcher::vector_length(n->in(2)) == 2);
 5302   match(Set dst (MinReductionV dst src));
 5303   match(Set dst (MaxReductionV dst src));
 5304   effect(TEMP dst, TEMP xtmp1);
 5305   format %{ "vector_minmax2F_reduction $dst, $src \t; using $xtmp1 as TEMP" %}
 5306   ins_encode %{
 5307     int opcode = this->ideal_Opcode();
 5308     int vlen = Matcher::vector_length(this, $src);
 5309     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5310                          $xtmp1$$XMMRegister);
 5311   %}
 5312   ins_pipe( pipe_slow );
 5313 %}
 5314 
 5315 instruct minmax_reductionF_avx10_av(regF dst, vec src, vec xtmp1, vec xtmp2) %{
 5316   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5317             Matcher::vector_length(n->in(2)) >= 4);
 5318   match(Set dst (MinReductionV dst src));
 5319   match(Set dst (MaxReductionV dst src));
 5320   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5321   format %{ "vector_minmaxF_reduction $dst, $src \t; using $xtmp1 and $xtmp2 as TEMP" %}
 5322   ins_encode %{
 5323     int opcode = this->ideal_Opcode();
 5324     int vlen = Matcher::vector_length(this, $src);
 5325     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg,
 5326                          $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5327   %}
 5328   ins_pipe( pipe_slow );
 5329 %}
 5330 
 5331 //--------------------Min/Max Double Reduction --------------------
 5332 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5333                             legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5334   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5335             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5336              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5337             Matcher::vector_length(n->in(2)) == 2);
 5338   match(Set dst (MinReductionV src1 src2));
 5339   match(Set dst (MaxReductionV src1 src2));
 5340   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5341   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5342   ins_encode %{
 5343     assert(UseAVX > 0, "sanity");
 5344 
 5345     int opcode = this->ideal_Opcode();
 5346     int vlen = Matcher::vector_length(this, $src2);
 5347     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5348                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5349   %}
 5350   ins_pipe( pipe_slow );
 5351 %}
 5352 
 5353 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, legVec tmp1, legVec tmp2,
 5354                            legVec tmp3, legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5355   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5356             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5357              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5358             Matcher::vector_length(n->in(2)) >= 4);
 5359   match(Set dst (MinReductionV src1 src2));
 5360   match(Set dst (MaxReductionV src1 src2));
 5361   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5362   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5363   ins_encode %{
 5364     assert(UseAVX > 0, "sanity");
 5365 
 5366     int opcode = this->ideal_Opcode();
 5367     int vlen = Matcher::vector_length(this, $src2);
 5368     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5369                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5370   %}
 5371   ins_pipe( pipe_slow );
 5372 %}
 5373 
 5374 
 5375 instruct minmax_reduction2D_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2,
 5376                                legVec tmp3, legVec tmp4, rFlagsReg cr) %{
 5377   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5378             Matcher::vector_length(n->in(2)) == 2);
 5379   match(Set dst (MinReductionV dst src));
 5380   match(Set dst (MaxReductionV dst src));
 5381   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5382   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5383   ins_encode %{
 5384     assert(UseAVX > 0, "sanity");
 5385 
 5386     int opcode = this->ideal_Opcode();
 5387     int vlen = Matcher::vector_length(this, $src);
 5388     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5389                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5390   %}
 5391   ins_pipe( pipe_slow );
 5392 %}
 5393 
 5394 instruct minmax_reductionD_av(legRegD dst, legVec src, legVec tmp1, legVec tmp2, legVec tmp3,
 5395                               legVec tmp4, legVec tmp5, rFlagsReg cr) %{
 5396   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5397             Matcher::vector_length(n->in(2)) >= 4);
 5398   match(Set dst (MinReductionV dst src));
 5399   match(Set dst (MaxReductionV dst src));
 5400   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5401   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5402   ins_encode %{
 5403     assert(UseAVX > 0, "sanity");
 5404 
 5405     int opcode = this->ideal_Opcode();
 5406     int vlen = Matcher::vector_length(this, $src);
 5407     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5408                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5409   %}
 5410   ins_pipe( pipe_slow );
 5411 %}
 5412 
 5413 instruct minmax_reduction2D_avx10(regD dst, immD src1, vec src2, vec xtmp1) %{
 5414   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5415             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5416              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5417             Matcher::vector_length(n->in(2)) == 2);
 5418   match(Set dst (MinReductionV src1 src2));
 5419   match(Set dst (MaxReductionV src1 src2));
 5420   effect(TEMP dst, TEMP xtmp1);
 5421   format %{ "vector_minmax2D_reduction $dst, $src1, $src2 ; using $xtmp1 as TEMP" %}
 5422   ins_encode %{
 5423     int opcode = this->ideal_Opcode();
 5424     int vlen = Matcher::vector_length(this, $src2);
 5425     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg,
 5426                           xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5427   %}
 5428   ins_pipe( pipe_slow );
 5429 %}
 5430 
 5431 instruct minmax_reductionD_avx10(regD dst, immD src1, vec src2, vec xtmp1, vec xtmp2) %{
 5432   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5433             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5434              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5435             Matcher::vector_length(n->in(2)) >= 4);
 5436   match(Set dst (MinReductionV src1 src2));
 5437   match(Set dst (MaxReductionV src1 src2));
 5438   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5439   format %{ "vector_minmaxD_reduction $dst, $src1, $src2 ; using $xtmp1 and $xtmp2 as TEMP" %}
 5440   ins_encode %{
 5441     int opcode = this->ideal_Opcode();
 5442     int vlen = Matcher::vector_length(this, $src2);
 5443     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg,
 5444                           xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 
 5450 instruct minmax_reduction2D_av_avx10(regD dst, vec src, vec xtmp1) %{
 5451   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5452             Matcher::vector_length(n->in(2)) == 2);
 5453   match(Set dst (MinReductionV dst src));
 5454   match(Set dst (MaxReductionV dst src));
 5455   effect(TEMP dst, TEMP xtmp1);
 5456   format %{ "vector_minmax2D_reduction $dst, $src ; using $xtmp1 as TEMP" %}
 5457   ins_encode %{
 5458     int opcode = this->ideal_Opcode();
 5459     int vlen = Matcher::vector_length(this, $src);
 5460     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5461                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister);
 5462   %}
 5463   ins_pipe( pipe_slow );
 5464 %}
 5465 
 5466 instruct minmax_reductionD_av_avx10(regD dst, vec src, vec xtmp1, vec xtmp2) %{
 5467   predicate(VM_Version::supports_avx10_2() && Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5468             Matcher::vector_length(n->in(2)) >= 4);
 5469   match(Set dst (MinReductionV dst src));
 5470   match(Set dst (MaxReductionV dst src));
 5471   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 5472   format %{ "vector_minmaxD_reduction $dst, $src ; using $xtmp1 and $xtmp2 as TEMP" %}
 5473   ins_encode %{
 5474     int opcode = this->ideal_Opcode();
 5475     int vlen = Matcher::vector_length(this, $src);
 5476     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5477                           xnoreg, xnoreg, xnoreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
 5478   %}
 5479   ins_pipe( pipe_slow );
 5480 %}
 5481 
 5482 // ====================VECTOR ARITHMETIC=======================================
 5483 
 5484 // --------------------------------- ADD --------------------------------------
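      // The element-wise rules in this block follow a common three-rule pattern:
      // a destructive two-operand SSE form (UseAVX == 0, dst is also a source),
      // a non-destructive three-operand AVX form (UseAVX > 0), and an AVX form
      // with a memory operand that is only used for vectors wider than 8 bytes.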
 5485 
 5486 // Bytes vector add
 5487 instruct vaddB(vec dst, vec src) %{
 5488   predicate(UseAVX == 0);
 5489   match(Set dst (AddVB dst src));
 5490   format %{ "paddb   $dst,$src\t! add packedB" %}
 5491   ins_encode %{
 5492     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5493   %}
 5494   ins_pipe( pipe_slow );
 5495 %}
 5496 
 5497 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5498   predicate(UseAVX > 0);
 5499   match(Set dst (AddVB src1 src2));
 5500   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5501   ins_encode %{
 5502     int vlen_enc = vector_length_encoding(this);
 5503     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5509   predicate((UseAVX > 0) &&
 5510             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5511   match(Set dst (AddVB src (LoadVector mem)));
 5512   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5513   ins_encode %{
 5514     int vlen_enc = vector_length_encoding(this);
 5515     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5516   %}
 5517   ins_pipe( pipe_slow );
 5518 %}
 5519 
 5520 // Shorts/Chars vector add
 5521 instruct vaddS(vec dst, vec src) %{
 5522   predicate(UseAVX == 0);
 5523   match(Set dst (AddVS dst src));
 5524   format %{ "paddw   $dst,$src\t! add packedS" %}
 5525   ins_encode %{
 5526     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5527   %}
 5528   ins_pipe( pipe_slow );
 5529 %}
 5530 
 5531 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5532   predicate(UseAVX > 0);
 5533   match(Set dst (AddVS src1 src2));
 5534   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5535   ins_encode %{
 5536     int vlen_enc = vector_length_encoding(this);
 5537     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5543   predicate((UseAVX > 0) &&
 5544             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5545   match(Set dst (AddVS src (LoadVector mem)));
 5546   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5547   ins_encode %{
 5548     int vlen_enc = vector_length_encoding(this);
 5549     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5550   %}
 5551   ins_pipe( pipe_slow );
 5552 %}
 5553 
 5554 // Integers vector add
 5555 instruct vaddI(vec dst, vec src) %{
 5556   predicate(UseAVX == 0);
 5557   match(Set dst (AddVI dst src));
 5558   format %{ "paddd   $dst,$src\t! add packedI" %}
 5559   ins_encode %{
 5560     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5561   %}
 5562   ins_pipe( pipe_slow );
 5563 %}
 5564 
 5565 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5566   predicate(UseAVX > 0);
 5567   match(Set dst (AddVI src1 src2));
 5568   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5569   ins_encode %{
 5570     int vlen_enc = vector_length_encoding(this);
 5571     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5572   %}
 5573   ins_pipe( pipe_slow );
 5574 %}
 5575 
 5576 
 5577 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5578   predicate((UseAVX > 0) &&
 5579             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5580   match(Set dst (AddVI src (LoadVector mem)));
 5581   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5582   ins_encode %{
 5583     int vlen_enc = vector_length_encoding(this);
 5584     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5585   %}
 5586   ins_pipe( pipe_slow );
 5587 %}
 5588 
 5589 // Longs vector add
 5590 instruct vaddL(vec dst, vec src) %{
 5591   predicate(UseAVX == 0);
 5592   match(Set dst (AddVL dst src));
 5593   format %{ "paddq   $dst,$src\t! add packedL" %}
 5594   ins_encode %{
 5595     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5596   %}
 5597   ins_pipe( pipe_slow );
 5598 %}
 5599 
 5600 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5601   predicate(UseAVX > 0);
 5602   match(Set dst (AddVL src1 src2));
 5603   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5604   ins_encode %{
 5605     int vlen_enc = vector_length_encoding(this);
 5606     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5612   predicate((UseAVX > 0) &&
 5613             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5614   match(Set dst (AddVL src (LoadVector mem)));
 5615   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5616   ins_encode %{
 5617     int vlen_enc = vector_length_encoding(this);
 5618     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5619   %}
 5620   ins_pipe( pipe_slow );
 5621 %}
 5622 
 5623 // Floats vector add
 5624 instruct vaddF(vec dst, vec src) %{
 5625   predicate(UseAVX == 0);
 5626   match(Set dst (AddVF dst src));
 5627   format %{ "addps   $dst,$src\t! add packedF" %}
 5628   ins_encode %{
 5629     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5630   %}
 5631   ins_pipe( pipe_slow );
 5632 %}
 5633 
 5634 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5635   predicate(UseAVX > 0);
 5636   match(Set dst (AddVF src1 src2));
 5637   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5638   ins_encode %{
 5639     int vlen_enc = vector_length_encoding(this);
 5640     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5646   predicate((UseAVX > 0) &&
 5647             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5648   match(Set dst (AddVF src (LoadVector mem)));
 5649   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5650   ins_encode %{
 5651     int vlen_enc = vector_length_encoding(this);
 5652     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5653   %}
 5654   ins_pipe( pipe_slow );
 5655 %}
 5656 
 5657 // Doubles vector add
 5658 instruct vaddD(vec dst, vec src) %{
 5659   predicate(UseAVX == 0);
 5660   match(Set dst (AddVD dst src));
 5661   format %{ "addpd   $dst,$src\t! add packedD" %}
 5662   ins_encode %{
 5663     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5664   %}
 5665   ins_pipe( pipe_slow );
 5666 %}
 5667 
 5668 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5669   predicate(UseAVX > 0);
 5670   match(Set dst (AddVD src1 src2));
 5671   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5672   ins_encode %{
 5673     int vlen_enc = vector_length_encoding(this);
 5674     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5675   %}
 5676   ins_pipe( pipe_slow );
 5677 %}
 5678 
 5679 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5680   predicate((UseAVX > 0) &&
 5681             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5682   match(Set dst (AddVD src (LoadVector mem)));
 5683   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5684   ins_encode %{
 5685     int vlen_enc = vector_length_encoding(this);
 5686     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5687   %}
 5688   ins_pipe( pipe_slow );
 5689 %}
 5690 
 5691 // --------------------------------- SUB --------------------------------------
 5692 
 5693 // Bytes vector sub
 5694 instruct vsubB(vec dst, vec src) %{
 5695   predicate(UseAVX == 0);
 5696   match(Set dst (SubVB dst src));
 5697   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5698   ins_encode %{
 5699     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5700   %}
 5701   ins_pipe( pipe_slow );
 5702 %}
 5703 
 5704 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5705   predicate(UseAVX > 0);
 5706   match(Set dst (SubVB src1 src2));
 5707   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5708   ins_encode %{
 5709     int vlen_enc = vector_length_encoding(this);
 5710     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5711   %}
 5712   ins_pipe( pipe_slow );
 5713 %}
 5714 
 5715 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5716   predicate((UseAVX > 0) &&
 5717             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5718   match(Set dst (SubVB src (LoadVector mem)));
 5719   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5720   ins_encode %{
 5721     int vlen_enc = vector_length_encoding(this);
 5722     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5723   %}
 5724   ins_pipe( pipe_slow );
 5725 %}
 5726 
 5727 // Shorts/Chars vector sub
 5728 instruct vsubS(vec dst, vec src) %{
 5729   predicate(UseAVX == 0);
 5730   match(Set dst (SubVS dst src));
 5731   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5732   ins_encode %{
 5733     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5734   %}
 5735   ins_pipe( pipe_slow );
 5736 %}
 5737 
 5738 
 5739 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5740   predicate(UseAVX > 0);
 5741   match(Set dst (SubVS src1 src2));
 5742   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5743   ins_encode %{
 5744     int vlen_enc = vector_length_encoding(this);
 5745     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5746   %}
 5747   ins_pipe( pipe_slow );
 5748 %}
 5749 
 5750 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5751   predicate((UseAVX > 0) &&
 5752             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5753   match(Set dst (SubVS src (LoadVector mem)));
 5754   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5755   ins_encode %{
 5756     int vlen_enc = vector_length_encoding(this);
 5757     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5758   %}
 5759   ins_pipe( pipe_slow );
 5760 %}
 5761 
 5762 // Integers vector sub
 5763 instruct vsubI(vec dst, vec src) %{
 5764   predicate(UseAVX == 0);
 5765   match(Set dst (SubVI dst src));
 5766   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5767   ins_encode %{
 5768     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5769   %}
 5770   ins_pipe( pipe_slow );
 5771 %}
 5772 
 5773 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5774   predicate(UseAVX > 0);
 5775   match(Set dst (SubVI src1 src2));
 5776   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5777   ins_encode %{
 5778     int vlen_enc = vector_length_encoding(this);
 5779     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5780   %}
 5781   ins_pipe( pipe_slow );
 5782 %}
 5783 
 5784 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5785   predicate((UseAVX > 0) &&
 5786             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5787   match(Set dst (SubVI src (LoadVector mem)));
 5788   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5789   ins_encode %{
 5790     int vlen_enc = vector_length_encoding(this);
 5791     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5792   %}
 5793   ins_pipe( pipe_slow );
 5794 %}
 5795 
 5796 // Longs vector sub
 5797 instruct vsubL(vec dst, vec src) %{
 5798   predicate(UseAVX == 0);
 5799   match(Set dst (SubVL dst src));
 5800   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5801   ins_encode %{
 5802     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5803   %}
 5804   ins_pipe( pipe_slow );
 5805 %}
 5806 
 5807 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5808   predicate(UseAVX > 0);
 5809   match(Set dst (SubVL src1 src2));
 5810   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5811   ins_encode %{
 5812     int vlen_enc = vector_length_encoding(this);
 5813     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5814   %}
 5815   ins_pipe( pipe_slow );
 5816 %}
 5817 
 5818 
 5819 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5820   predicate((UseAVX > 0) &&
 5821             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5822   match(Set dst (SubVL src (LoadVector mem)));
 5823   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5824   ins_encode %{
 5825     int vlen_enc = vector_length_encoding(this);
 5826     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5827   %}
 5828   ins_pipe( pipe_slow );
 5829 %}
 5830 
 5831 // Floats vector sub
 5832 instruct vsubF(vec dst, vec src) %{
 5833   predicate(UseAVX == 0);
 5834   match(Set dst (SubVF dst src));
 5835   format %{ "subps   $dst,$src\t! sub packedF" %}
 5836   ins_encode %{
 5837     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5843   predicate(UseAVX > 0);
 5844   match(Set dst (SubVF src1 src2));
 5845   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5846   ins_encode %{
 5847     int vlen_enc = vector_length_encoding(this);
 5848     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5849   %}
 5850   ins_pipe( pipe_slow );
 5851 %}
 5852 
 5853 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5854   predicate((UseAVX > 0) &&
 5855             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5856   match(Set dst (SubVF src (LoadVector mem)));
 5857   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5858   ins_encode %{
 5859     int vlen_enc = vector_length_encoding(this);
 5860     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5861   %}
 5862   ins_pipe( pipe_slow );
 5863 %}
 5864 
 5865 // Doubles vector sub
 5866 instruct vsubD(vec dst, vec src) %{
 5867   predicate(UseAVX == 0);
 5868   match(Set dst (SubVD dst src));
 5869   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5870   ins_encode %{
 5871     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5872   %}
 5873   ins_pipe( pipe_slow );
 5874 %}
 5875 
 5876 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5877   predicate(UseAVX > 0);
 5878   match(Set dst (SubVD src1 src2));
 5879   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5880   ins_encode %{
 5881     int vlen_enc = vector_length_encoding(this);
 5882     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5883   %}
 5884   ins_pipe( pipe_slow );
 5885 %}
 5886 
 5887 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5888   predicate((UseAVX > 0) &&
 5889             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5890   match(Set dst (SubVD src (LoadVector mem)));
 5891   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5892   ins_encode %{
 5893     int vlen_enc = vector_length_encoding(this);
 5894     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5895   %}
 5896   ins_pipe( pipe_slow );
 5897 %}
 5898 
 5899 // --------------------------------- MUL --------------------------------------
 5900 
 5901 // Byte vector mul
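      // There is no packed byte multiply instruction, so MulVB is emulated with
      // 16-bit multiplies: vmul8B widens the bytes to words, multiplies and packs
      // the low bytes back, while vmulB/vmulB_reg multiply the odd- and
      // even-indexed byte lanes separately inside word lanes and merge the low
      // byte of each product.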
 5902 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5903   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5904   match(Set dst (MulVB src1 src2));
 5905   effect(TEMP dst, TEMP xtmp);
 5906   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5907   ins_encode %{
 5908     assert(UseSSE > 3, "required");
 5909     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5910     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5911     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5912     __ psllw($dst$$XMMRegister, 8);
 5913     __ psrlw($dst$$XMMRegister, 8);
 5914     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5915   %}
 5916   ins_pipe( pipe_slow );
 5917 %}
 5918 
 5919 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5920   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5921   match(Set dst (MulVB src1 src2));
 5922   effect(TEMP dst, TEMP xtmp);
 5923   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5924   ins_encode %{
 5925     assert(UseSSE > 3, "required");
 5926     // Odd-index elements
 5927     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5928     __ psrlw($dst$$XMMRegister, 8);
 5929     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5930     __ psrlw($xtmp$$XMMRegister, 8);
 5931     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5932     __ psllw($dst$$XMMRegister, 8);
 5933     // Even-index elements
 5934     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5935     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5936     __ psllw($xtmp$$XMMRegister, 8);
 5937     __ psrlw($xtmp$$XMMRegister, 8);
 5938     // Combine
 5939     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5940   %}
 5941   ins_pipe( pipe_slow );
 5942 %}
 5943 
 5944 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5945   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5946   match(Set dst (MulVB src1 src2));
 5947   effect(TEMP xtmp1, TEMP xtmp2);
 5948   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5949   ins_encode %{
 5950     int vlen_enc = vector_length_encoding(this);
 5951     // Odd-index elements
 5952     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5953     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5954     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5955     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5956     // Even-index elements
 5957     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5958     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5959     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5960     // Combine
 5961     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5962   %}
 5963   ins_pipe( pipe_slow );
 5964 %}
 5965 
 5966 // Shorts/Chars vector mul
 5967 instruct vmulS(vec dst, vec src) %{
 5968   predicate(UseAVX == 0);
 5969   match(Set dst (MulVS dst src));
 5970   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5971   ins_encode %{
 5972     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5973   %}
 5974   ins_pipe( pipe_slow );
 5975 %}
 5976 
 5977 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5978   predicate(UseAVX > 0);
 5979   match(Set dst (MulVS src1 src2));
 5980   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5981   ins_encode %{
 5982     int vlen_enc = vector_length_encoding(this);
 5983     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5984   %}
 5985   ins_pipe( pipe_slow );
 5986 %}
 5987 
 5988 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5989   predicate((UseAVX > 0) &&
 5990             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5991   match(Set dst (MulVS src (LoadVector mem)));
 5992   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5993   ins_encode %{
 5994     int vlen_enc = vector_length_encoding(this);
 5995     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5996   %}
 5997   ins_pipe( pipe_slow );
 5998 %}
 5999 
 6000 // Integers vector mul
 6001 instruct vmulI(vec dst, vec src) %{
 6002   predicate(UseAVX == 0);
 6003   match(Set dst (MulVI dst src));
 6004   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 6005   ins_encode %{
 6006     assert(UseSSE > 3, "required");
 6007     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 6008   %}
 6009   ins_pipe( pipe_slow );
 6010 %}
 6011 
 6012 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 6013   predicate(UseAVX > 0);
 6014   match(Set dst (MulVI src1 src2));
 6015   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 6016   ins_encode %{
 6017     int vlen_enc = vector_length_encoding(this);
 6018     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6019   %}
 6020   ins_pipe( pipe_slow );
 6021 %}
 6022 
 6023 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 6024   predicate((UseAVX > 0) &&
 6025             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6026   match(Set dst (MulVI src (LoadVector mem)));
 6027   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 6028   ins_encode %{
 6029     int vlen_enc = vector_length_encoding(this);
 6030     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6031   %}
 6032   ins_pipe( pipe_slow );
 6033 %}
 6034 
 6035 // Longs vector mul
 6036 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 6037   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6038              VM_Version::supports_avx512dq()) ||
 6039             VM_Version::supports_avx512vldq());
 6040   match(Set dst (MulVL src1 src2));
 6041   ins_cost(500);
 6042   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 6043   ins_encode %{
 6044     assert(UseAVX > 2, "required");
 6045     int vlen_enc = vector_length_encoding(this);
 6046     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6047   %}
 6048   ins_pipe( pipe_slow );
 6049 %}
 6050 
 6051 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 6052   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 6053              VM_Version::supports_avx512dq()) ||
 6054             (Matcher::vector_length_in_bytes(n) > 8 &&
 6055              VM_Version::supports_avx512vldq()));
 6056   match(Set dst (MulVL src (LoadVector mem)));
 6057   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 6058   ins_cost(500);
 6059   ins_encode %{
 6060     assert(UseAVX > 2, "required");
 6061     int vlen_enc = vector_length_encoding(this);
 6062     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6063   %}
 6064   ins_pipe( pipe_slow );
 6065 %}
 6066 
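      // Without a native 64-bit lane multiply (AVX512DQ/VL), the vmulL rules below
      // assemble the product from 32-bit multiplies. Writing
      //   a = a_hi * 2^32 + a_lo  and  b = b_hi * 2^32 + b_lo,
      // the low 64 bits of a * b are
      //   ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo,
      // since the a_hi * b_hi term is shifted entirely out of the 64-bit result.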
 6067 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 6068   predicate(UseAVX == 0);
 6069   match(Set dst (MulVL src1 src2));
 6070   ins_cost(500);
 6071   effect(TEMP dst, TEMP xtmp);
 6072   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 6073   ins_encode %{
 6074     assert(VM_Version::supports_sse4_1(), "required");
 6075     // Get the lo-hi products; only the lower 32 bits are of concern
 6076     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 6077     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 6078     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 6079     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 6080     __ psllq($dst$$XMMRegister, 32);
 6081     // Get the lo-lo products
 6082     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 6083     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 6084     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 6085   %}
 6086   ins_pipe( pipe_slow );
 6087 %}
 6088 
 6089 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 6090   predicate(UseAVX > 0 &&
 6091             ((Matcher::vector_length_in_bytes(n) == 64 &&
 6092               !VM_Version::supports_avx512dq()) ||
 6093              (Matcher::vector_length_in_bytes(n) < 64 &&
 6094               !VM_Version::supports_avx512vldq())));
 6095   match(Set dst (MulVL src1 src2));
 6096   effect(TEMP xtmp1, TEMP xtmp2);
 6097   ins_cost(500);
 6098   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 6099   ins_encode %{
 6100     int vlen_enc = vector_length_encoding(this);
 6101     // Get the lo-hi products; only the lower 32 bits are of concern
 6102     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 6103     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6104     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 6105     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 6106     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 6107     // Get the lo-lo products
 6108     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6109     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6110   %}
 6111   ins_pipe( pipe_slow );
 6112 %}
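
      // When the ideal graph knows that both MulVL operands are 32-bit values held
      // in 64-bit lanes (has_uint_inputs() presumably meaning zero-extended and
      // has_int_inputs() sign-extended inputs), a single vpmuludq / vpmuldq already
      // produces the full 64-bit product, which is why the two rules below are
      // much cheaper (ins_cost 100) than the emulation above (ins_cost 500).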
 6113 
 6114 instruct vmuludq_reg(vec dst, vec src1, vec src2) %{
 6115   predicate(UseAVX > 0 && n->as_MulVL()->has_uint_inputs());
 6116   match(Set dst (MulVL src1 src2));
 6117   ins_cost(100);
 6118   format %{ "vpmuludq $dst,$src1,$src2\t! muludq packedL" %}
 6119   ins_encode %{
 6120     int vlen_enc = vector_length_encoding(this);
 6121     __ vpmuludq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6122   %}
 6123   ins_pipe( pipe_slow );
 6124 %}
 6125 
 6126 instruct vmuldq_reg(vec dst, vec src1, vec src2) %{
 6127   predicate(UseAVX > 0 && n->as_MulVL()->has_int_inputs());
 6128   match(Set dst (MulVL src1 src2));
 6129   ins_cost(100);
 6130   format %{ "vpmuldq $dst,$src1,$src2\t! muldq packedL" %}
 6131   ins_encode %{
 6132     int vlen_enc = vector_length_encoding(this);
 6133     __ vpmuldq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6134   %}
 6135   ins_pipe( pipe_slow );
 6136 %}
 6137 
 6138 // Floats vector mul
 6139 instruct vmulF(vec dst, vec src) %{
 6140   predicate(UseAVX == 0);
 6141   match(Set dst (MulVF dst src));
 6142   format %{ "mulps   $dst,$src\t! mul packedF" %}
 6143   ins_encode %{
 6144     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 6145   %}
 6146   ins_pipe( pipe_slow );
 6147 %}
 6148 
 6149 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 6150   predicate(UseAVX > 0);
 6151   match(Set dst (MulVF src1 src2));
 6152   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 6153   ins_encode %{
 6154     int vlen_enc = vector_length_encoding(this);
 6155     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6156   %}
 6157   ins_pipe( pipe_slow );
 6158 %}
 6159 
 6160 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 6161   predicate((UseAVX > 0) &&
 6162             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6163   match(Set dst (MulVF src (LoadVector mem)));
 6164   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 6165   ins_encode %{
 6166     int vlen_enc = vector_length_encoding(this);
 6167     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6168   %}
 6169   ins_pipe( pipe_slow );
 6170 %}
 6171 
 6172 // Doubles vector mul
 6173 instruct vmulD(vec dst, vec src) %{
 6174   predicate(UseAVX == 0);
 6175   match(Set dst (MulVD dst src));
 6176   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 6177   ins_encode %{
 6178     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 6179   %}
 6180   ins_pipe( pipe_slow );
 6181 %}
 6182 
 6183 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 6184   predicate(UseAVX > 0);
 6185   match(Set dst (MulVD src1 src2));
 6186   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 6187   ins_encode %{
 6188     int vlen_enc = vector_length_encoding(this);
 6189     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6190   %}
 6191   ins_pipe( pipe_slow );
 6192 %}
 6193 
 6194 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 6195   predicate((UseAVX > 0) &&
 6196             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6197   match(Set dst (MulVD src (LoadVector mem)));
 6198   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6199   ins_encode %{
 6200     int vlen_enc = vector_length_encoding(this);
 6201     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6202   %}
 6203   ins_pipe( pipe_slow );
 6204 %}
 6205 
 6206 // --------------------------------- DIV --------------------------------------
 6207 
 6208 // Floats vector div
 6209 instruct vdivF(vec dst, vec src) %{
 6210   predicate(UseAVX == 0);
 6211   match(Set dst (DivVF dst src));
 6212   format %{ "divps   $dst,$src\t! div packedF" %}
 6213   ins_encode %{
 6214     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6215   %}
 6216   ins_pipe( pipe_slow );
 6217 %}
 6218 
 6219 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6220   predicate(UseAVX > 0);
 6221   match(Set dst (DivVF src1 src2));
 6222   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6223   ins_encode %{
 6224     int vlen_enc = vector_length_encoding(this);
 6225     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6226   %}
 6227   ins_pipe( pipe_slow );
 6228 %}
 6229 
 6230 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6231   predicate((UseAVX > 0) &&
 6232             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6233   match(Set dst (DivVF src (LoadVector mem)));
 6234   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6235   ins_encode %{
 6236     int vlen_enc = vector_length_encoding(this);
 6237     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 // Doubles vector div
 6243 instruct vdivD(vec dst, vec src) %{
 6244   predicate(UseAVX == 0);
 6245   match(Set dst (DivVD dst src));
 6246   format %{ "divpd   $dst,$src\t! div packedD" %}
 6247   ins_encode %{
 6248     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6249   %}
 6250   ins_pipe( pipe_slow );
 6251 %}
 6252 
 6253 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6254   predicate(UseAVX > 0);
 6255   match(Set dst (DivVD src1 src2));
 6256   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6257   ins_encode %{
 6258     int vlen_enc = vector_length_encoding(this);
 6259     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6260   %}
 6261   ins_pipe( pipe_slow );
 6262 %}
 6263 
 6264 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6265   predicate((UseAVX > 0) &&
 6266             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6267   match(Set dst (DivVD src (LoadVector mem)));
 6268   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6269   ins_encode %{
 6270     int vlen_enc = vector_length_encoding(this);
 6271     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6272   %}
 6273   ins_pipe( pipe_slow );
 6274 %}
 6275 
 6276 // ------------------------------ MinMax ---------------------------------------
 6277 
 6278 // Byte, Short, Int vector Min/Max
 6279 instruct minmax_reg_sse(vec dst, vec src) %{
 6280   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6281             UseAVX == 0);
 6282   match(Set dst (MinV dst src));
 6283   match(Set dst (MaxV dst src));
 6284   format %{ "vector_minmax  $dst,$src\t!  " %}
 6285   ins_encode %{
 6286     assert(UseSSE >= 4, "required");
 6287 
 6288     int opcode = this->ideal_Opcode();
 6289     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6290     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6291   %}
 6292   ins_pipe( pipe_slow );
 6293 %}
 6294 
 6295 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6296   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6297             UseAVX > 0);
 6298   match(Set dst (MinV src1 src2));
 6299   match(Set dst (MaxV src1 src2));
 6300   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6301   ins_encode %{
 6302     int opcode = this->ideal_Opcode();
 6303     int vlen_enc = vector_length_encoding(this);
 6304     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6305 
 6306     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6307   %}
 6308   ins_pipe( pipe_slow );
 6309 %}
 6310 
 6311 // Long vector Min/Max
 6312 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6313   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6314             UseAVX == 0);
 6315   match(Set dst (MinV dst src));
 6316   match(Set dst (MaxV src dst));
 6317   effect(TEMP dst, TEMP tmp);
 6318   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6319   ins_encode %{
 6320     assert(UseSSE >= 4, "required");
 6321 
 6322     int opcode = this->ideal_Opcode();
 6323     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6324     assert(elem_bt == T_LONG, "sanity");
 6325 
 6326     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6327   %}
 6328   ins_pipe( pipe_slow );
 6329 %}
 6330 
 6331 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6332   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6333             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6334   match(Set dst (MinV src1 src2));
 6335   match(Set dst (MaxV src1 src2));
 6336   effect(TEMP dst);
 6337   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6338   ins_encode %{
 6339     int vlen_enc = vector_length_encoding(this);
 6340     int opcode = this->ideal_Opcode();
 6341     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6342     assert(elem_bt == T_LONG, "sanity");
 6343 
 6344     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6345   %}
 6346   ins_pipe( pipe_slow );
 6347 %}
 6348 
 6349 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6350   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6351             Matcher::vector_element_basic_type(n) == T_LONG);
 6352   match(Set dst (MinV src1 src2));
 6353   match(Set dst (MaxV src1 src2));
 6354   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6355   ins_encode %{
 6356     assert(UseAVX > 2, "required");
 6357 
 6358     int vlen_enc = vector_length_encoding(this);
 6359     int opcode = this->ideal_Opcode();
 6360     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6361     assert(elem_bt == T_LONG, "sanity");
 6362 
 6363     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6364   %}
 6365   ins_pipe( pipe_slow );
 6366 %}
 6367 
 6368 // Float/Double vector Min/Max
 6369 instruct minmaxFP_avx10_reg(vec dst, vec a, vec b) %{
 6370   predicate(VM_Version::supports_avx10_2() &&
 6371             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6372   match(Set dst (MinV a b));
 6373   match(Set dst (MaxV a b));
 6374   format %{ "vector_minmaxFP  $dst, $a, $b" %}
 6375   ins_encode %{
 6376     int vlen_enc = vector_length_encoding(this);
 6377     int opcode = this->ideal_Opcode();
 6378     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6379     __ vminmax_fp(opcode, elem_bt, $dst$$XMMRegister, k0, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6380   %}
 6381   ins_pipe( pipe_slow );
 6382 %}
 6383 
 6384 // Float/Double vector Min/Max
 6385 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6386   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) <= 32 &&
 6387             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6388             UseAVX > 0);
 6389   match(Set dst (MinV a b));
 6390   match(Set dst (MaxV a b));
 6391   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6392   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6393   ins_encode %{
 6394     assert(UseAVX > 0, "required");
 6395 
 6396     int opcode = this->ideal_Opcode();
 6397     int vlen_enc = vector_length_encoding(this);
 6398     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6399 
 6400     __ vminmax_fp(opcode, elem_bt,
 6401                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6402                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6403   %}
 6404   ins_pipe( pipe_slow );
 6405 %}
 6406 
 6407 instruct evminmaxFP_reg_evex(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6408   predicate(!VM_Version::supports_avx10_2() && Matcher::vector_length_in_bytes(n) == 64 &&
 6409             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6410   match(Set dst (MinV a b));
 6411   match(Set dst (MaxV a b));
 6412   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6413   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6414   ins_encode %{
 6415     assert(UseAVX > 2, "required");
 6416 
 6417     int opcode = this->ideal_Opcode();
 6418     int vlen_enc = vector_length_encoding(this);
 6419     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6420 
 6421     __ evminmax_fp(opcode, elem_bt,
 6422                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6423                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6424   %}
 6425   ins_pipe( pipe_slow );
 6426 %}
 6427 
 6428 // ------------------------------ Unsigned vector Min/Max ----------------------
 6429 
 6430 instruct vector_uminmax_reg(vec dst, vec a, vec b) %{
 6431   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6432   match(Set dst (UMinV a b));
 6433   match(Set dst (UMaxV a b));
 6434   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6435   ins_encode %{
 6436     int opcode = this->ideal_Opcode();
 6437     int vlen_enc = vector_length_encoding(this);
 6438     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6439     assert(is_integral_type(elem_bt), "");
 6440     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vlen_enc);
 6441   %}
 6442   ins_pipe( pipe_slow );
 6443 %}
 6444 
 6445 instruct vector_uminmax_mem(vec dst, vec a, memory b) %{
 6446   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_LONG);
 6447   match(Set dst (UMinV a (LoadVector b)));
 6448   match(Set dst (UMaxV a (LoadVector b)));
 6449   format %{ "vector_uminmax $dst,$a,$b\t!" %}
 6450   ins_encode %{
 6451     int opcode = this->ideal_Opcode();
 6452     int vlen_enc = vector_length_encoding(this);
 6453     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6454     assert(is_integral_type(elem_bt), "");
 6455     __ vpuminmax(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$Address, vlen_enc);
 6456   %}
 6457   ins_pipe( pipe_slow );
 6458 %}
 6459 
 6460 instruct vector_uminmaxq_reg(vec dst, vec a, vec b, vec xtmp1, vec xtmp2) %{
 6461   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_LONG);
 6462   match(Set dst (UMinV a b));
 6463   match(Set dst (UMaxV a b));
 6464   effect(TEMP xtmp1, TEMP xtmp2);
 6465   format %{ "vector_uminmaxq $dst,$a,$b\t! using xtmp1 and xtmp2 as TEMP" %}
 6466   ins_encode %{
 6467     int opcode = this->ideal_Opcode();
 6468     int vlen_enc = vector_length_encoding(this);
 6469     __ vpuminmaxq(opcode, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 6470   %}
 6471   ins_pipe( pipe_slow );
 6472 %}
 6473 
 6474 instruct vector_uminmax_reg_masked(vec dst, vec src2, kReg mask) %{
 6475   match(Set dst (UMinV (Binary dst src2) mask));
 6476   match(Set dst (UMaxV (Binary dst src2) mask));
 6477   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6478   ins_encode %{
 6479     int vlen_enc = vector_length_encoding(this);
 6480     BasicType bt = Matcher::vector_element_basic_type(this);
 6481     int opc = this->ideal_Opcode();
 6482     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6483                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 6484   %}
 6485   ins_pipe( pipe_slow );
 6486 %}
 6487 
 6488 instruct vector_uminmax_mem_masked(vec dst, memory src2, kReg mask) %{
 6489   match(Set dst (UMinV (Binary dst (LoadVector src2)) mask));
 6490   match(Set dst (UMaxV (Binary dst (LoadVector src2)) mask));
 6491   format %{ "vector_uminmax_masked $dst, $dst, $src2, $mask\t! umin/max masked operation" %}
 6492   ins_encode %{
 6493     int vlen_enc = vector_length_encoding(this);
 6494     BasicType bt = Matcher::vector_element_basic_type(this);
 6495     int opc = this->ideal_Opcode();
 6496     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 6497                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 6498   %}
 6499   ins_pipe( pipe_slow );
 6500 %}
 6501 
 6502 // --------------------------------- Signum/CopySign ---------------------------
 6503 
 6504 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6505   match(Set dst (SignumF dst (Binary zero one)));
 6506   effect(KILL cr);
 6507   format %{ "signumF $dst, $dst" %}
 6508   ins_encode %{
 6509     int opcode = this->ideal_Opcode();
 6510     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6511   %}
 6512   ins_pipe( pipe_slow );
 6513 %}
 6514 
 6515 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6516   match(Set dst (SignumD dst (Binary zero one)));
 6517   effect(KILL cr);
 6518   format %{ "signumD $dst, $dst" %}
 6519   ins_encode %{
 6520     int opcode = this->ideal_Opcode();
 6521     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6522   %}
 6523   ins_pipe( pipe_slow );
 6524 %}
 6525 
 6526 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6527   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6528   match(Set dst (SignumVF src (Binary zero one)));
 6529   match(Set dst (SignumVD src (Binary zero one)));
 6530   effect(TEMP dst, TEMP xtmp1);
 6531   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6532   ins_encode %{
 6533     int opcode = this->ideal_Opcode();
 6534     int vec_enc = vector_length_encoding(this);
 6535     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6536                          $xtmp1$$XMMRegister, vec_enc);
 6537   %}
 6538   ins_pipe( pipe_slow );
 6539 %}
 6540 
 6541 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6542   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6543   match(Set dst (SignumVF src (Binary zero one)));
 6544   match(Set dst (SignumVD src (Binary zero one)));
 6545   effect(TEMP dst, TEMP ktmp1);
 6546   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6547   ins_encode %{
 6548     int opcode = this->ideal_Opcode();
 6549     int vec_enc = vector_length_encoding(this);
 6550     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6551                           $ktmp1$$KRegister, vec_enc);
 6552   %}
 6553   ins_pipe( pipe_slow );
 6554 %}
 6555 
 6556 // ---------------------------------------
 6557 // For copySign use 0xE4 as the truth-table immediate for vpternlog
 6558 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6559 // C (xmm2) is set to 0x7FFFFFFF
 6560 // Wherever xmm2 is 0, we want to pick from B (sign)
 6561 // Wherever xmm2 is 1, we want to pick from A (src)
 6562 //
 6563 // A B C Result
 6564 // 0 0 0 0
 6565 // 0 0 1 0
 6566 // 0 1 0 1
 6567 // 0 1 1 0
 6568 // 1 0 0 0
 6569 // 1 0 1 1
 6570 // 1 1 0 1
 6571 // 1 1 1 1
 6572 //
 6573 // Result going from high bit to low bit is 0b11100100 = 0xE4
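//
// A minimal sketch (plain C++, illustrative only, not part of this file) that
// recomputes the immediate from the selection rule above; A/B/C are the
// vpternlog operand bits exactly as labeled in the table:
//
//   uint8_t imm = 0;
//   for (int i = 7; i >= 0; i--) {
//     int A = (i >> 2) & 1, B = (i >> 1) & 1, C = i & 1;
//     int bit = C ? A : B;                 // C==1 -> take A (src), C==0 -> take B (sign)
//     imm = (uint8_t)((imm << 1) | bit);   // collect rows from A=B=C=1 down to 0
//   }
//   // imm == 0xE4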
 6574 // ---------------------------------------
 6575 
 6576 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6577   match(Set dst (CopySignF dst src));
 6578   effect(TEMP tmp1, TEMP tmp2);
 6579   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6580   ins_encode %{
 6581     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6582     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6583     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6584   %}
 6585   ins_pipe( pipe_slow );
 6586 %}
 6587 
 6588 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6589   match(Set dst (CopySignD dst (Binary src zero)));
 6590   ins_cost(100);
 6591   effect(TEMP tmp1, TEMP tmp2);
 6592   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6593   ins_encode %{
 6594     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6595     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6596     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6597   %}
 6598   ins_pipe( pipe_slow );
 6599 %}
 6600 
 6601 //----------------------------- CompressBits/ExpandBits ------------------------
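// CompressBits maps to the BMI2 pext (parallel bit extract) instruction and
// ExpandBits to pdep (parallel bit deposit), as matched below. A tiny worked
// example using the BMI2 intrinsics (illustrative only, not used by the VM):
//   _pext_u32(0xABCD, 0x0F0F) == 0xBD     // gather the nibbles selected by the mask
//   _pdep_u32(0xBD,   0x0F0F) == 0x0B0D   // scatter them back under the same mask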
 6602 
 6603 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6604   predicate(n->bottom_type()->isa_int());
 6605   match(Set dst (CompressBits src mask));
 6606   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6607   ins_encode %{
 6608     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6609   %}
 6610   ins_pipe( pipe_slow );
 6611 %}
 6612 
 6613 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6614   predicate(n->bottom_type()->isa_int());
 6615   match(Set dst (ExpandBits src mask));
 6616   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6617   ins_encode %{
 6618     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6619   %}
 6620   ins_pipe( pipe_slow );
 6621 %}
 6622 
 6623 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6624   predicate(n->bottom_type()->isa_int());
 6625   match(Set dst (CompressBits src (LoadI mask)));
 6626   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6627   ins_encode %{
 6628     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6629   %}
 6630   ins_pipe( pipe_slow );
 6631 %}
 6632 
 6633 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6634   predicate(n->bottom_type()->isa_int());
 6635   match(Set dst (ExpandBits src (LoadI mask)));
 6636   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6637   ins_encode %{
 6638     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6639   %}
 6640   ins_pipe( pipe_slow );
 6641 %}
 6642 
 6643 // --------------------------------- Sqrt --------------------------------------
 6644 
 6645 instruct vsqrtF_reg(vec dst, vec src) %{
 6646   match(Set dst (SqrtVF src));
 6647   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6648   ins_encode %{
 6649     assert(UseAVX > 0, "required");
 6650     int vlen_enc = vector_length_encoding(this);
 6651     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6652   %}
 6653   ins_pipe( pipe_slow );
 6654 %}
 6655 
 6656 instruct vsqrtF_mem(vec dst, memory mem) %{
 6657   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6658   match(Set dst (SqrtVF (LoadVector mem)));
 6659   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6660   ins_encode %{
 6661     assert(UseAVX > 0, "required");
 6662     int vlen_enc = vector_length_encoding(this);
 6663     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6664   %}
 6665   ins_pipe( pipe_slow );
 6666 %}
 6667 
 6668 // Doubles vector sqrt
 6669 instruct vsqrtD_reg(vec dst, vec src) %{
 6670   match(Set dst (SqrtVD src));
 6671   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6672   ins_encode %{
 6673     assert(UseAVX > 0, "required");
 6674     int vlen_enc = vector_length_encoding(this);
 6675     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6676   %}
 6677   ins_pipe( pipe_slow );
 6678 %}
 6679 
 6680 instruct vsqrtD_mem(vec dst, memory mem) %{
 6681   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6682   match(Set dst (SqrtVD (LoadVector mem)));
 6683   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6684   ins_encode %{
 6685     assert(UseAVX > 0, "required");
 6686     int vlen_enc = vector_length_encoding(this);
 6687     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6688   %}
 6689   ins_pipe( pipe_slow );
 6690 %}
 6691 
 6692 // ------------------------------ Shift ---------------------------------------
 6693 
 6694 // Left and right shift count vectors are the same on x86
 6695 // (only lowest bits of xmm reg are used for count).
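// A minimal sketch with SSE2 intrinsics (illustrative only, not used by the
// VM) showing why one count encoding serves both directions; 'v' stands for
// any __m128i of packed words:
//   __m128i cnt = _mm_cvtsi32_si128(3);    // same idea as the movdl below
//   __m128i l   = _mm_sll_epi16(v, cnt);   // psllw: left shift each word by 3
//   __m128i r   = _mm_srl_epi16(v, cnt);   // psrlw: logical right shift by 3
// Both instructions take the count from the low 64 bits of 'cnt', so the same
// register value can back both LShiftCntV and RShiftCntV.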
 6696 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6697   match(Set dst (LShiftCntV cnt));
 6698   match(Set dst (RShiftCntV cnt));
 6699   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6700   ins_encode %{
 6701     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6702   %}
 6703   ins_pipe( pipe_slow );
 6704 %}
 6705 
 6706 // Byte vector shift
 6707 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6708   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6709   match(Set dst ( LShiftVB src shift));
 6710   match(Set dst ( RShiftVB src shift));
 6711   match(Set dst (URShiftVB src shift));
 6712   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6713   format %{"vector_byte_shift $dst,$src,$shift" %}
 6714   ins_encode %{
 6715     assert(UseSSE > 3, "required");
 6716     int opcode = this->ideal_Opcode();
 6717     bool sign = (opcode != Op_URShiftVB);
 6718     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6719     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6720     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6721     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6722     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6723   %}
 6724   ins_pipe( pipe_slow );
 6725 %}
 6726 
 6727 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6728   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6729             UseAVX <= 1);
 6730   match(Set dst ( LShiftVB src shift));
 6731   match(Set dst ( RShiftVB src shift));
 6732   match(Set dst (URShiftVB src shift));
 6733   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6734   format %{"vector_byte_shift $dst,$src,$shift" %}
 6735   ins_encode %{
 6736     assert(UseSSE > 3, "required");
 6737     int opcode = this->ideal_Opcode();
 6738     bool sign = (opcode != Op_URShiftVB);
 6739     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6740     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6741     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6742     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6743     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6744     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6745     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6746     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6747     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6748   %}
 6749   ins_pipe( pipe_slow );
 6750 %}
 6751 
 6752 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6753   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6754             UseAVX > 1);
 6755   match(Set dst ( LShiftVB src shift));
 6756   match(Set dst ( RShiftVB src shift));
 6757   match(Set dst (URShiftVB src shift));
 6758   effect(TEMP dst, TEMP tmp);
 6759   format %{"vector_byte_shift $dst,$src,$shift" %}
 6760   ins_encode %{
 6761     int opcode = this->ideal_Opcode();
 6762     bool sign = (opcode != Op_URShiftVB);
 6763     int vlen_enc = Assembler::AVX_256bit;
 6764     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6765     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6766     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6767     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6768     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6769   %}
 6770   ins_pipe( pipe_slow );
 6771 %}
 6772 
 6773 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6774   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6775   match(Set dst ( LShiftVB src shift));
 6776   match(Set dst ( RShiftVB src shift));
 6777   match(Set dst (URShiftVB src shift));
 6778   effect(TEMP dst, TEMP tmp);
 6779   format %{"vector_byte_shift $dst,$src,$shift" %}
 6780   ins_encode %{
 6781     assert(UseAVX > 1, "required");
 6782     int opcode = this->ideal_Opcode();
 6783     bool sign = (opcode != Op_URShiftVB);
 6784     int vlen_enc = Assembler::AVX_256bit;
 6785     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6786     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6787     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6788     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6789     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6790     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6791     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6792     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6793     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6794   %}
 6795   ins_pipe( pipe_slow );
 6796 %}
 6797 
 6798 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6799   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6800   match(Set dst ( LShiftVB src shift));
 6801   match(Set dst ( RShiftVB src shift));
 6802   match(Set dst (URShiftVB src shift));
 6803   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6804   format %{"vector_byte_shift $dst,$src,$shift" %}
 6805   ins_encode %{
 6806     assert(UseAVX > 2, "required");
 6807     int opcode = this->ideal_Opcode();
 6808     bool sign = (opcode != Op_URShiftVB);
 6809     int vlen_enc = Assembler::AVX_512bit;
 6810     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6811     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6812     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6813     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6814     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6815     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6816     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6817     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6818     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6819     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6820     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6821     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6822   %}
 6823   ins_pipe( pipe_slow );
 6824 %}
 6825 
 6826 // Shorts vector logical right shift produces an incorrect Java result
 6827 // for negative data, because Java converts the short value to an int with
 6828 // sign extension before the shift. Char vectors are fine, though, since
 6829 // chars are unsigned values (see the scalar illustration below).
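// A scalar C++ sketch of the mismatch (illustrative only, mirroring the Java
// promotion rules described above):
//   int16_t  s    = -4;                     // bit pattern 0xFFFC
//   uint32_t wide = (uint32_t)(int32_t)s;   // Java sign-extends first: 0xFFFFFFFC
//   uint32_t java = wide >> 1;              // 0x7FFFFFFE  (Java's s >>> 1)
//   uint16_t lane = (uint16_t)0xFFFC >> 1;  // 0x7FFE      (a 16-bit psrlw lane)
// For chars the value is zero-extended instead, so the 16-bit lane result is
// already the Java result.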
 6830 // Shorts/Chars vector left shift
 6831 instruct vshiftS(vec dst, vec src, vec shift) %{
 6832   predicate(!n->as_ShiftV()->is_var_shift());
 6833   match(Set dst ( LShiftVS src shift));
 6834   match(Set dst ( RShiftVS src shift));
 6835   match(Set dst (URShiftVS src shift));
 6836   effect(TEMP dst, USE src, USE shift);
 6837   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6838   ins_encode %{
 6839     int opcode = this->ideal_Opcode();
 6840     if (UseAVX > 0) {
 6841       int vlen_enc = vector_length_encoding(this);
 6842       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6843     } else {
 6844       int vlen = Matcher::vector_length(this);
 6845       if (vlen == 2) {
 6846         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6847         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6848       } else if (vlen == 4) {
 6849         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6850         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6851       } else {
 6852         assert (vlen == 8, "sanity");
 6853         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6854         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6855       }
 6856     }
 6857   %}
 6858   ins_pipe( pipe_slow );
 6859 %}
 6860 
 6861 // Integers vector left shift
 6862 instruct vshiftI(vec dst, vec src, vec shift) %{
 6863   predicate(!n->as_ShiftV()->is_var_shift());
 6864   match(Set dst ( LShiftVI src shift));
 6865   match(Set dst ( RShiftVI src shift));
 6866   match(Set dst (URShiftVI src shift));
 6867   effect(TEMP dst, USE src, USE shift);
 6868   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6869   ins_encode %{
 6870     int opcode = this->ideal_Opcode();
 6871     if (UseAVX > 0) {
 6872       int vlen_enc = vector_length_encoding(this);
 6873       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6874     } else {
 6875       int vlen = Matcher::vector_length(this);
 6876       if (vlen == 2) {
 6877         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6878         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6879       } else {
 6880         assert(vlen == 4, "sanity");
 6881         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6882         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6883       }
 6884     }
 6885   %}
 6886   ins_pipe( pipe_slow );
 6887 %}
 6888 
 6889 // Integers vector left constant shift
 6890 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6891   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6892   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6893   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6894   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6895   ins_encode %{
 6896     int opcode = this->ideal_Opcode();
 6897     if (UseAVX > 0) {
 6898       int vector_len = vector_length_encoding(this);
 6899       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6900     } else {
 6901       int vlen = Matcher::vector_length(this);
 6902       if (vlen == 2) {
 6903         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6904         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6905       } else {
 6906         assert(vlen == 4, "sanity");
 6907         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6908         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6909       }
 6910     }
 6911   %}
 6912   ins_pipe( pipe_slow );
 6913 %}
 6914 
 6915 // Longs vector shift
 6916 instruct vshiftL(vec dst, vec src, vec shift) %{
 6917   predicate(!n->as_ShiftV()->is_var_shift());
 6918   match(Set dst ( LShiftVL src shift));
 6919   match(Set dst (URShiftVL src shift));
 6920   effect(TEMP dst, USE src, USE shift);
 6921   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6922   ins_encode %{
 6923     int opcode = this->ideal_Opcode();
 6924     if (UseAVX > 0) {
 6925       int vlen_enc = vector_length_encoding(this);
 6926       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6927     } else {
 6928       assert(Matcher::vector_length(this) == 2, "");
 6929       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6930       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6931     }
 6932   %}
 6933   ins_pipe( pipe_slow );
 6934 %}
 6935 
 6936 // Longs vector constant shift
 6937 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6938   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6939   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6940   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6941   ins_encode %{
 6942     int opcode = this->ideal_Opcode();
 6943     if (UseAVX > 0) {
 6944       int vector_len = vector_length_encoding(this);
 6945       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6946     } else {
 6947       assert(Matcher::vector_length(this) == 2, "");
 6948       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6949       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6950     }
 6951   %}
 6952   ins_pipe( pipe_slow );
 6953 %}
 6954 
 6955 // -------------------ArithmeticRightShift -----------------------------------
 6956 // Long vector arithmetic right shift
 6957 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6958   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6959   match(Set dst (RShiftVL src shift));
 6960   effect(TEMP dst, TEMP tmp);
 6961   format %{ "vshiftq $dst,$src,$shift" %}
 6962   ins_encode %{
 6963     uint vlen = Matcher::vector_length(this);
 6964     if (vlen == 2) {
 6965       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6966       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6967       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6968       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6969       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6970       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6971     } else {
 6972       assert(vlen == 4, "sanity");
 6973       assert(UseAVX > 1, "required");
 6974       int vlen_enc = Assembler::AVX_256bit;
 6975       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6976       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6977       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6978       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6979       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6980     }
 6981   %}
 6982   ins_pipe( pipe_slow );
 6983 %}
 6984 
 6985 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6986   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6987   match(Set dst (RShiftVL src shift));
 6988   format %{ "vshiftq $dst,$src,$shift" %}
 6989   ins_encode %{
 6990     int vlen_enc = vector_length_encoding(this);
 6991     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6992   %}
 6993   ins_pipe( pipe_slow );
 6994 %}
 6995 
 6996 // ------------------- Variable Shift -----------------------------
 6997 // Byte variable shift
 6998 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6999   predicate(Matcher::vector_length(n) <= 8 &&
 7000             n->as_ShiftV()->is_var_shift() &&
 7001             !VM_Version::supports_avx512bw());
 7002   match(Set dst ( LShiftVB src shift));
 7003   match(Set dst ( RShiftVB src shift));
 7004   match(Set dst (URShiftVB src shift));
 7005   effect(TEMP dst, TEMP vtmp);
 7006   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7007   ins_encode %{
 7008     assert(UseAVX >= 2, "required");
 7009 
 7010     int opcode = this->ideal_Opcode();
 7011     int vlen_enc = Assembler::AVX_128bit;
 7012     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7013     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7014   %}
 7015   ins_pipe( pipe_slow );
 7016 %}
 7017 
 7018 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7019   predicate(Matcher::vector_length(n) == 16 &&
 7020             n->as_ShiftV()->is_var_shift() &&
 7021             !VM_Version::supports_avx512bw());
 7022   match(Set dst ( LShiftVB src shift));
 7023   match(Set dst ( RShiftVB src shift));
 7024   match(Set dst (URShiftVB src shift));
 7025   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7026   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7027   ins_encode %{
 7028     assert(UseAVX >= 2, "required");
 7029 
 7030     int opcode = this->ideal_Opcode();
 7031     int vlen_enc = Assembler::AVX_128bit;
 7032     // Shift lower half and get word result in dst
 7033     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7034 
 7035     // Shift upper half and get word result in vtmp1
 7036     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7037     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7038     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7039 
 7040     // Merge and down convert the two word results to byte in dst
 7041     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7042   %}
 7043   ins_pipe( pipe_slow );
 7044 %}
 7045 
 7046 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 7047   predicate(Matcher::vector_length(n) == 32 &&
 7048             n->as_ShiftV()->is_var_shift() &&
 7049             !VM_Version::supports_avx512bw());
 7050   match(Set dst ( LShiftVB src shift));
 7051   match(Set dst ( RShiftVB src shift));
 7052   match(Set dst (URShiftVB src shift));
 7053   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 7054   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 7055   ins_encode %{
 7056     assert(UseAVX >= 2, "required");
 7057 
 7058     int opcode = this->ideal_Opcode();
 7059     int vlen_enc = Assembler::AVX_128bit;
 7060     // Process lower 128 bits and get result in dst
 7061     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7062     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 7063     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 7064     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7065     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7066 
 7067     // Process higher 128 bits and get result in vtmp3
 7068     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7069     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7070     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 7071     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 7072     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 7073     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7074     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 7075 
 7076     // Merge the two results in dst
 7077     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7078   %}
 7079   ins_pipe( pipe_slow );
 7080 %}
 7081 
 7082 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 7083   predicate(Matcher::vector_length(n) <= 32 &&
 7084             n->as_ShiftV()->is_var_shift() &&
 7085             VM_Version::supports_avx512bw());
 7086   match(Set dst ( LShiftVB src shift));
 7087   match(Set dst ( RShiftVB src shift));
 7088   match(Set dst (URShiftVB src shift));
 7089   effect(TEMP dst, TEMP vtmp);
 7090   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 7091   ins_encode %{
 7092     assert(UseAVX > 2, "required");
 7093 
 7094     int opcode = this->ideal_Opcode();
 7095     int vlen_enc = vector_length_encoding(this);
 7096     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 7097   %}
 7098   ins_pipe( pipe_slow );
 7099 %}
 7100 
 7101 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7102   predicate(Matcher::vector_length(n) == 64 &&
 7103             n->as_ShiftV()->is_var_shift() &&
 7104             VM_Version::supports_avx512bw());
 7105   match(Set dst ( LShiftVB src shift));
 7106   match(Set dst ( RShiftVB src shift));
 7107   match(Set dst (URShiftVB src shift));
 7108   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7109   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 7110   ins_encode %{
 7111     assert(UseAVX > 2, "required");
 7112 
 7113     int opcode = this->ideal_Opcode();
 7114     int vlen_enc = Assembler::AVX_256bit;
 7115     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 7116     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 7117     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 7118     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 7119     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 7120   %}
 7121   ins_pipe( pipe_slow );
 7122 %}
 7123 
 7124 // Short variable shift
 7125 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 7126   predicate(Matcher::vector_length(n) <= 8 &&
 7127             n->as_ShiftV()->is_var_shift() &&
 7128             !VM_Version::supports_avx512bw());
 7129   match(Set dst ( LShiftVS src shift));
 7130   match(Set dst ( RShiftVS src shift));
 7131   match(Set dst (URShiftVS src shift));
 7132   effect(TEMP dst, TEMP vtmp);
 7133   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7134   ins_encode %{
 7135     assert(UseAVX >= 2, "required");
 7136 
 7137     int opcode = this->ideal_Opcode();
 7138     bool sign = (opcode != Op_URShiftVS);
 7139     int vlen_enc = Assembler::AVX_256bit;
 7140     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
 7141     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
 7142     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 7143     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7144     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 7145     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7146   %}
 7147   ins_pipe( pipe_slow );
 7148 %}
 7149 
 7150 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 7151   predicate(Matcher::vector_length(n) == 16 &&
 7152             n->as_ShiftV()->is_var_shift() &&
 7153             !VM_Version::supports_avx512bw());
 7154   match(Set dst ( LShiftVS src shift));
 7155   match(Set dst ( RShiftVS src shift));
 7156   match(Set dst (URShiftVS src shift));
 7157   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 7158   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 7159   ins_encode %{
 7160     assert(UseAVX >= 2, "required");
 7161 
 7162     int opcode = this->ideal_Opcode();
 7163     bool sign = (opcode != Op_URShiftVS);
 7164     int vlen_enc = Assembler::AVX_256bit;
 7165     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 7166     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7167     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7168     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7169     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7170 
 7171     // Shift upper half, with result in dst using vtmp1 as TEMP
 7172     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 7173     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 7174     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7175     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7176     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 7177     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7178 
 7179     // Merge lower and upper half result into dst
 7180     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7181     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 7182   %}
 7183   ins_pipe( pipe_slow );
 7184 %}
 7185 
 7186 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 7187   predicate(n->as_ShiftV()->is_var_shift() &&
 7188             VM_Version::supports_avx512bw());
 7189   match(Set dst ( LShiftVS src shift));
 7190   match(Set dst ( RShiftVS src shift));
 7191   match(Set dst (URShiftVS src shift));
 7192   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 7193   ins_encode %{
 7194     assert(UseAVX > 2, "required");
 7195 
 7196     int opcode = this->ideal_Opcode();
 7197     int vlen_enc = vector_length_encoding(this);
 7198     if (!VM_Version::supports_avx512vl()) {
 7199       vlen_enc = Assembler::AVX_512bit;
 7200     }
 7201     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7202   %}
 7203   ins_pipe( pipe_slow );
 7204 %}
 7205 
 7206 // Integer variable shift
 7207 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 7208   predicate(n->as_ShiftV()->is_var_shift());
 7209   match(Set dst ( LShiftVI src shift));
 7210   match(Set dst ( RShiftVI src shift));
 7211   match(Set dst (URShiftVI src shift));
 7212   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 7213   ins_encode %{
 7214     assert(UseAVX >= 2, "required");
 7215 
 7216     int opcode = this->ideal_Opcode();
 7217     int vlen_enc = vector_length_encoding(this);
 7218     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7219   %}
 7220   ins_pipe( pipe_slow );
 7221 %}
 7222 
 7223 // Long variable shift
 7224 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 7225   predicate(n->as_ShiftV()->is_var_shift());
 7226   match(Set dst ( LShiftVL src shift));
 7227   match(Set dst (URShiftVL src shift));
 7228   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7229   ins_encode %{
 7230     assert(UseAVX >= 2, "required");
 7231 
 7232     int opcode = this->ideal_Opcode();
 7233     int vlen_enc = vector_length_encoding(this);
 7234     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7235   %}
 7236   ins_pipe( pipe_slow );
 7237 %}
 7238 
 7239 // Long variable arithmetic right shift
 7240 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 7241   predicate(Matcher::vector_length(n) <= 4 &&
 7242             n->as_ShiftV()->is_var_shift() &&
 7243             UseAVX == 2);
 7244   match(Set dst (RShiftVL src shift));
 7245   effect(TEMP dst, TEMP vtmp);
 7246   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 7247   ins_encode %{
 7248     int opcode = this->ideal_Opcode();
 7249     int vlen_enc = vector_length_encoding(this);
 7250     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7251                  $vtmp$$XMMRegister);
 7252   %}
 7253   ins_pipe( pipe_slow );
 7254 %}
 7255 
 7256 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7257   predicate(n->as_ShiftV()->is_var_shift() &&
 7258             UseAVX > 2);
 7259   match(Set dst (RShiftVL src shift));
 7260   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 7261   ins_encode %{
 7262     int opcode = this->ideal_Opcode();
 7263     int vlen_enc = vector_length_encoding(this);
 7264     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7265   %}
 7266   ins_pipe( pipe_slow );
 7267 %}
 7268 
 7269 // --------------------------------- AND --------------------------------------
 7270 
 7271 instruct vand(vec dst, vec src) %{
 7272   predicate(UseAVX == 0);
 7273   match(Set dst (AndV dst src));
 7274   format %{ "pand    $dst,$src\t! and vectors" %}
 7275   ins_encode %{
 7276     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7277   %}
 7278   ins_pipe( pipe_slow );
 7279 %}
 7280 
 7281 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7282   predicate(UseAVX > 0);
 7283   match(Set dst (AndV src1 src2));
 7284   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7285   ins_encode %{
 7286     int vlen_enc = vector_length_encoding(this);
 7287     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7288   %}
 7289   ins_pipe( pipe_slow );
 7290 %}
 7291 
 7292 instruct vand_mem(vec dst, vec src, memory mem) %{
 7293   predicate((UseAVX > 0) &&
 7294             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7295   match(Set dst (AndV src (LoadVector mem)));
 7296   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7297   ins_encode %{
 7298     int vlen_enc = vector_length_encoding(this);
 7299     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7300   %}
 7301   ins_pipe( pipe_slow );
 7302 %}
 7303 
 7304 // --------------------------------- OR ---------------------------------------
 7305 
 7306 instruct vor(vec dst, vec src) %{
 7307   predicate(UseAVX == 0);
 7308   match(Set dst (OrV dst src));
 7309   format %{ "por     $dst,$src\t! or vectors" %}
 7310   ins_encode %{
 7311     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7312   %}
 7313   ins_pipe( pipe_slow );
 7314 %}
 7315 
 7316 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7317   predicate(UseAVX > 0);
 7318   match(Set dst (OrV src1 src2));
 7319   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7320   ins_encode %{
 7321     int vlen_enc = vector_length_encoding(this);
 7322     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7323   %}
 7324   ins_pipe( pipe_slow );
 7325 %}
 7326 
 7327 instruct vor_mem(vec dst, vec src, memory mem) %{
 7328   predicate((UseAVX > 0) &&
 7329             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7330   match(Set dst (OrV src (LoadVector mem)));
 7331   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7332   ins_encode %{
 7333     int vlen_enc = vector_length_encoding(this);
 7334     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7335   %}
 7336   ins_pipe( pipe_slow );
 7337 %}
 7338 
 7339 // --------------------------------- XOR --------------------------------------
 7340 
 7341 instruct vxor(vec dst, vec src) %{
 7342   predicate(UseAVX == 0);
 7343   match(Set dst (XorV dst src));
 7344   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7345   ins_encode %{
 7346     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7347   %}
 7348   ins_pipe( pipe_slow );
 7349 %}
 7350 
 7351 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7352   predicate(UseAVX > 0);
 7353   match(Set dst (XorV src1 src2));
 7354   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7355   ins_encode %{
 7356     int vlen_enc = vector_length_encoding(this);
 7357     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7358   %}
 7359   ins_pipe( pipe_slow );
 7360 %}
 7361 
 7362 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7363   predicate((UseAVX > 0) &&
 7364             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7365   match(Set dst (XorV src (LoadVector mem)));
 7366   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7367   ins_encode %{
 7368     int vlen_enc = vector_length_encoding(this);
 7369     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7370   %}
 7371   ins_pipe( pipe_slow );
 7372 %}
 7373 
 7374 // --------------------------------- VectorCast --------------------------------------
 7375 
 7376 instruct vcastBtoX(vec dst, vec src) %{
 7377   predicate(VM_Version::supports_avx512vl() || Matcher::vector_element_basic_type(n) != T_DOUBLE);
 7378   match(Set dst (VectorCastB2X src));
 7379   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7380   ins_encode %{
 7381     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7382     int vlen_enc = vector_length_encoding(this);
 7383     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7384   %}
 7385   ins_pipe( pipe_slow );
 7386 %}
 7387 
 7388 instruct vcastBtoD(legVec dst, legVec src) %{
 7389   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7390   match(Set dst (VectorCastB2X src));
 7391   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7392   ins_encode %{
 7393     int vlen_enc = vector_length_encoding(this);
 7394     __ vconvert_b2x(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7395   %}
 7396   ins_pipe( pipe_slow );
 7397 %}
 7398 
 7399 instruct castStoX(vec dst, vec src) %{
 7400   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7401             Matcher::vector_length(n->in(1)) <= 8 && // src
 7402             Matcher::vector_element_basic_type(n) == T_BYTE);
 7403   match(Set dst (VectorCastS2X src));
 7404   format %{ "vector_cast_s2x $dst,$src" %}
 7405   ins_encode %{
 7406     assert(UseAVX > 0, "required");
 7407 
 7408     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7409     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7410   %}
 7411   ins_pipe( pipe_slow );
 7412 %}
 7413 
 7414 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7415   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7416             Matcher::vector_length(n->in(1)) == 16 && // src
 7417             Matcher::vector_element_basic_type(n) == T_BYTE);
 7418   effect(TEMP dst, TEMP vtmp);
 7419   match(Set dst (VectorCastS2X src));
 7420   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7421   ins_encode %{
 7422     assert(UseAVX > 0, "required");
 7423 
 7424     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7425     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7426     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7427     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7428   %}
 7429   ins_pipe( pipe_slow );
 7430 %}
 7431 
 7432 instruct vcastStoX_evex(vec dst, vec src) %{
 7433   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7434             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7435   match(Set dst (VectorCastS2X src));
 7436   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7437   ins_encode %{
 7438     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7439     int src_vlen_enc = vector_length_encoding(this, $src);
 7440     int vlen_enc = vector_length_encoding(this);
 7441     switch (to_elem_bt) {
 7442       case T_BYTE:
 7443         if (!VM_Version::supports_avx512vl()) {
 7444           vlen_enc = Assembler::AVX_512bit;
 7445         }
 7446         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7447         break;
 7448       case T_INT:
 7449         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7450         break;
 7451       case T_FLOAT:
 7452         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7453         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7454         break;
 7455       case T_LONG:
 7456         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7457         break;
 7458       case T_DOUBLE: {
 7459         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7460         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7461         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7462         break;
 7463       }
 7464       default:
 7465         ShouldNotReachHere();
 7466     }
 7467   %}
 7468   ins_pipe( pipe_slow );
 7469 %}
 7470 
 7471 instruct castItoX(vec dst, vec src) %{
 7472   predicate(UseAVX <= 2 &&
 7473             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7474             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7475   match(Set dst (VectorCastI2X src));
 7476   format %{ "vector_cast_i2x $dst,$src" %}
 7477   ins_encode %{
 7478     assert(UseAVX > 0, "required");
 7479 
 7480     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7481     int vlen_enc = vector_length_encoding(this, $src);
 7482 
 7483     if (to_elem_bt == T_BYTE) {
 7484       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7485       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7486       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7487     } else {
 7488       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7489       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7490       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7491     }
 7492   %}
 7493   ins_pipe( pipe_slow );
 7494 %}
 7495 
 7496 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7497   predicate(UseAVX <= 2 &&
 7498             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7499             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7500   match(Set dst (VectorCastI2X src));
 7501   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7502   effect(TEMP dst, TEMP vtmp);
 7503   ins_encode %{
 7504     assert(UseAVX > 0, "required");
 7505 
 7506     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7507     int vlen_enc = vector_length_encoding(this, $src);
 7508 
 7509     if (to_elem_bt == T_BYTE) {
 7510       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7511       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7512       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7513       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7514     } else {
 7515       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7516       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7517       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7518       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7519     }
 7520   %}
 7521   ins_pipe( pipe_slow );
 7522 %}
 7523 
 7524 instruct vcastItoX_evex(vec dst, vec src) %{
 7525   predicate(UseAVX > 2 ||
 7526             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7527   match(Set dst (VectorCastI2X src));
 7528   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7529   ins_encode %{
 7530     assert(UseAVX > 0, "required");
 7531 
 7532     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7533     int src_vlen_enc = vector_length_encoding(this, $src);
 7534     int dst_vlen_enc = vector_length_encoding(this);
 7535     switch (dst_elem_bt) {
 7536       case T_BYTE:
 7537         if (!VM_Version::supports_avx512vl()) {
 7538           src_vlen_enc = Assembler::AVX_512bit;
 7539         }
 7540         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7541         break;
 7542       case T_SHORT:
 7543         if (!VM_Version::supports_avx512vl()) {
 7544           src_vlen_enc = Assembler::AVX_512bit;
 7545         }
 7546         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7547         break;
 7548       case T_FLOAT:
 7549         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7550         break;
 7551       case T_LONG:
 7552         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7553         break;
 7554       case T_DOUBLE:
 7555         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7556         break;
 7557       default:
 7558         ShouldNotReachHere();
 7559     }
 7560   %}
 7561   ins_pipe( pipe_slow );
 7562 %}
 7563 
 7564 instruct vcastLtoBS(vec dst, vec src) %{
 7565   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7566             UseAVX <= 2);
 7567   match(Set dst (VectorCastL2X src));
 7568   format %{ "vector_cast_l2x  $dst,$src" %}
 7569   ins_encode %{
 7570     assert(UseAVX > 0, "required");
 7571 
 7572     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7573     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7574     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7575                                                       : ExternalAddress(vector_int_to_short_mask());
 7576     if (vlen <= 16) {
 7577       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7578       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7579       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7580     } else {
 7581       assert(vlen <= 32, "required");
 7582       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7583       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7584       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7585       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7586     }
 7587     if (to_elem_bt == T_BYTE) {
 7588       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7589     }
 7590   %}
 7591   ins_pipe( pipe_slow );
 7592 %}
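// Note: AVX <= 2 has no direct long-to-byte/short narrowing, so the rule above first gathers the
// low 32 bits of each 64-bit lane (vpshufd for 128-bit vectors, vpermilps + vpermpd for 256-bit
// ones) and then reuses the int-to-byte/short mask-and-pack sequence. Rough scalar sketch only:
//
//   for (int i = 0; i < n; i++) {
//     dst[i] = (to_elem_bt == T_BYTE) ? (jbyte)src[i] : (jshort)src[i];   // truncating cast
//   }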
 7593 
 7594 instruct vcastLtoX_evex(vec dst, vec src) %{
 7595   predicate(UseAVX > 2 ||
 7596             (Matcher::vector_element_basic_type(n) == T_INT ||
 7597              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7598              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7599   match(Set dst (VectorCastL2X src));
 7600   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7601   ins_encode %{
 7602     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7603     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7604     int vlen_enc = vector_length_encoding(this, $src);
 7605     switch (to_elem_bt) {
 7606       case T_BYTE:
 7607         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7608           vlen_enc = Assembler::AVX_512bit;
 7609         }
 7610         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7611         break;
 7612       case T_SHORT:
 7613         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7614           vlen_enc = Assembler::AVX_512bit;
 7615         }
 7616         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7617         break;
 7618       case T_INT:
 7619         if (vlen == 8) {
 7620           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7621             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7622           }
 7623         } else if (vlen == 16) {
 7624           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7625         } else if (vlen == 32) {
 7626           if (UseAVX > 2) {
 7627             if (!VM_Version::supports_avx512vl()) {
 7628               vlen_enc = Assembler::AVX_512bit;
 7629             }
 7630             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7631           } else {
 7632             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7633             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7634           }
 7635         } else { // vlen == 64
 7636           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7637         }
 7638         break;
 7639       case T_FLOAT:
 7640         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7641         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7642         break;
 7643       case T_DOUBLE:
 7644         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7645         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7646         break;
 7647 
 7648       default: assert(false, "%s", type2name(to_elem_bt));
 7649     }
 7650   %}
 7651   ins_pipe( pipe_slow );
 7652 %}
 7653 
 7654 instruct vcastFtoD_reg(vec dst, vec src) %{
 7655   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7656   match(Set dst (VectorCastF2X src));
 7657   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7658   ins_encode %{
 7659     int vlen_enc = vector_length_encoding(this);
 7660     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7661   %}
 7662   ins_pipe( pipe_slow );
 7663 %}
 7664 
 7665 
 7666 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7667   predicate(!VM_Version::supports_avx10_2() &&
 7668             !VM_Version::supports_avx512vl() &&
 7669             Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7670             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4 &&
 7671             is_integral_type(Matcher::vector_element_basic_type(n)));
 7672   match(Set dst (VectorCastF2X src));
 7673   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7674   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7675   ins_encode %{
 7676     int vlen_enc = vector_length_encoding(this, $src);
 7677     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register when loading addresses wider
    // than 32 bits for register-indirect addressing: stub constants live in the code cache, and
    // ReservedCodeCacheSize is currently capped at 2G. Targets are free to raise that limit, but a
    // code cache larger than 2G is unrealistic in practice; on the flip side, staying within the
    // cap saves a temporary register allocation, which in the limiting case can prevent spilling
    // in blocks with high register pressure.
 7685     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7686                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7687                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7688   %}
 7689   ins_pipe( pipe_slow );
 7690 %}
 7691 
 7692 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7693   predicate(!VM_Version::supports_avx10_2() &&
 7694             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7695             is_integral_type(Matcher::vector_element_basic_type(n)));
 7696   match(Set dst (VectorCastF2X src));
 7697   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7698   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7699   ins_encode %{
 7700     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7701     if (to_elem_bt == T_LONG) {
 7702       int vlen_enc = vector_length_encoding(this);
 7703       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7704                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7705                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7706     } else {
 7707       int vlen_enc = vector_length_encoding(this, $src);
 7708       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7709                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7710                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7711     }
 7712   %}
 7713   ins_pipe( pipe_slow );
 7714 %}
 7715 
 7716 instruct castFtoX_reg_avx10(vec dst, vec src) %{
 7717   predicate(VM_Version::supports_avx10_2() &&
 7718             is_integral_type(Matcher::vector_element_basic_type(n)));
 7719   match(Set dst (VectorCastF2X src));
 7720   format %{ "vector_cast_f2x_avx10 $dst, $src\t!" %}
 7721   ins_encode %{
 7722     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7723     int vlen_enc = (to_elem_bt == T_LONG) ? vector_length_encoding(this) : vector_length_encoding(this, $src);
 7724     __ vector_castF2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7725   %}
 7726   ins_pipe( pipe_slow );
 7727 %}
 7728 
 7729 instruct castFtoX_mem_avx10(vec dst, memory src) %{
 7730   predicate(VM_Version::supports_avx10_2() &&
 7731             is_integral_type(Matcher::vector_element_basic_type(n)));
 7732   match(Set dst (VectorCastF2X (LoadVector src)));
 7733   format %{ "vector_cast_f2x_avx10 $dst, $src\t!" %}
 7734   ins_encode %{
 7735     int vlen = Matcher::vector_length(this);
 7736     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7737     int vlen_enc = (to_elem_bt == T_LONG) ? vector_length_encoding(this) : vector_length_encoding(vlen * sizeof(jfloat));
 7738     __ vector_castF2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$Address, vlen_enc);
 7739   %}
 7740   ins_pipe( pipe_slow );
 7741 %}
 7742 
 7743 instruct vcastDtoF_reg(vec dst, vec src) %{
 7744   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7745   match(Set dst (VectorCastD2X src));
 7746   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7747   ins_encode %{
 7748     int vlen_enc = vector_length_encoding(this, $src);
 7749     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7750   %}
 7751   ins_pipe( pipe_slow );
 7752 %}
 7753 
 7754 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7755   predicate(!VM_Version::supports_avx10_2() &&
 7756             !VM_Version::supports_avx512vl() &&
 7757             Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7758             is_integral_type(Matcher::vector_element_basic_type(n)));
 7759   match(Set dst (VectorCastD2X src));
 7760   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7761   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7762   ins_encode %{
 7763     int vlen_enc = vector_length_encoding(this, $src);
 7764     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7765     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7766                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7767                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7768   %}
 7769   ins_pipe( pipe_slow );
 7770 %}
 7771 
 7772 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7773   predicate(!VM_Version::supports_avx10_2() &&
 7774             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7775             is_integral_type(Matcher::vector_element_basic_type(n)));
 7776   match(Set dst (VectorCastD2X src));
 7777   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7778   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7779   ins_encode %{
 7780     int vlen_enc = vector_length_encoding(this, $src);
 7781     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7782     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7783                               ExternalAddress(vector_float_signflip());
 7784     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7785                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7786   %}
 7787   ins_pipe( pipe_slow );
 7788 %}
 7789 
 7790 instruct castDtoX_reg_avx10(vec dst, vec src) %{
 7791   predicate(VM_Version::supports_avx10_2() &&
 7792             is_integral_type(Matcher::vector_element_basic_type(n)));
 7793   match(Set dst (VectorCastD2X src));
 7794   format %{ "vector_cast_d2x_avx10 $dst, $src\t!" %}
 7795   ins_encode %{
 7796     int vlen_enc = vector_length_encoding(this, $src);
 7797     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7798     __ vector_castD2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7799   %}
 7800   ins_pipe( pipe_slow );
 7801 %}
 7802 
 7803 instruct castDtoX_mem_avx10(vec dst, memory src) %{
 7804   predicate(VM_Version::supports_avx10_2() &&
 7805             is_integral_type(Matcher::vector_element_basic_type(n)));
 7806   match(Set dst (VectorCastD2X (LoadVector src)));
 7807   format %{ "vector_cast_d2x_avx10 $dst, $src\t!" %}
 7808   ins_encode %{
 7809     int vlen = Matcher::vector_length(this);
 7810     int vlen_enc = vector_length_encoding(vlen * sizeof(jdouble));
 7811     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7812     __ vector_castD2X_avx10(to_elem_bt, $dst$$XMMRegister, $src$$Address, vlen_enc);
 7813   %}
 7814   ins_pipe( pipe_slow );
 7815 %}
 7816 
 7817 instruct vucast(vec dst, vec src) %{
 7818   match(Set dst (VectorUCastB2X src));
 7819   match(Set dst (VectorUCastS2X src));
 7820   match(Set dst (VectorUCastI2X src));
 7821   format %{ "vector_ucast $dst,$src\t!" %}
 7822   ins_encode %{
 7823     assert(UseAVX > 0, "required");
 7824 
 7825     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7826     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7827     int vlen_enc = vector_length_encoding(this);
 7828     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7829   %}
 7830   ins_pipe( pipe_slow );
 7831 %}
 7832 
 7833 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7834   predicate(!VM_Version::supports_avx512vl() &&
 7835             Matcher::vector_length_in_bytes(n) < 64 &&
 7836             Matcher::vector_element_basic_type(n) == T_INT);
 7837   match(Set dst (RoundVF src));
 7838   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7839   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7840   ins_encode %{
 7841     int vlen_enc = vector_length_encoding(this);
 7842     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7843     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7844                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7845                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7846   %}
 7847   ins_pipe( pipe_slow );
 7848 %}
 7849 
 7850 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7851   predicate((VM_Version::supports_avx512vl() ||
 7852              Matcher::vector_length_in_bytes(n) == 64) &&
 7853              Matcher::vector_element_basic_type(n) == T_INT);
 7854   match(Set dst (RoundVF src));
 7855   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7856   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7857   ins_encode %{
 7858     int vlen_enc = vector_length_encoding(this);
 7859     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7860     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7861                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7862                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7863   %}
 7864   ins_pipe( pipe_slow );
 7865 %}
 7866 
 7867 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7868   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7869   match(Set dst (RoundVD src));
 7870   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7871   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7872   ins_encode %{
 7873     int vlen_enc = vector_length_encoding(this);
 7874     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7875     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7876                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7877                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7878   %}
 7879   ins_pipe( pipe_slow );
 7880 %}
 7881 
 7882 // --------------------------------- VectorMaskCmp --------------------------------------
 7883 
 7884 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7885   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7886             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7887             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7888             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7889   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7890   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7891   ins_encode %{
 7892     int vlen_enc = vector_length_encoding(this, $src1);
 7893     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7894     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7895       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7896     } else {
 7897       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7898     }
 7899   %}
 7900   ins_pipe( pipe_slow );
 7901 %}
 7902 
 7903 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7904   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7905             n->bottom_type()->isa_vectmask() == nullptr &&
 7906             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7907   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7908   effect(TEMP ktmp);
 7909   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7910   ins_encode %{
 7911     int vlen_enc = Assembler::AVX_512bit;
 7912     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7913     KRegister mask = k0; // The comparison itself is not being masked.
 7914     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7915       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7916       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7917     } else {
 7918       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7919       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7920     }
 7921   %}
 7922   ins_pipe( pipe_slow );
 7923 %}
 7924 
 7925 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7926   predicate(n->bottom_type()->isa_vectmask() &&
 7927             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7928   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7929   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7930   ins_encode %{
 7931     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7932     int vlen_enc = vector_length_encoding(this, $src1);
 7933     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7934     KRegister mask = k0; // The comparison itself is not being masked.
 7935     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7936       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7937     } else {
 7938       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7939     }
 7940   %}
 7941   ins_pipe( pipe_slow );
 7942 %}
 7943 
 7944 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7945   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7946             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7947             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7948             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7949             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7950             (n->in(2)->get_int() == BoolTest::eq ||
 7951              n->in(2)->get_int() == BoolTest::lt ||
 7952              n->in(2)->get_int() == BoolTest::gt)); // cond
 7953   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7954   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7955   ins_encode %{
 7956     int vlen_enc = vector_length_encoding(this, $src1);
 7957     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7958     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7959     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7960   %}
 7961   ins_pipe( pipe_slow );
 7962 %}
 7963 
 7964 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7965   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7966             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7967             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7968             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7969             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7970             (n->in(2)->get_int() == BoolTest::ne ||
 7971              n->in(2)->get_int() == BoolTest::le ||
 7972              n->in(2)->get_int() == BoolTest::ge)); // cond
 7973   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7974   effect(TEMP dst, TEMP xtmp);
 7975   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7976   ins_encode %{
 7977     int vlen_enc = vector_length_encoding(this, $src1);
 7978     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7979     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7980     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7981   %}
 7982   ins_pipe( pipe_slow );
 7983 %}
 7984 
 7985 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7986   predicate(n->bottom_type()->isa_vectmask() == nullptr &&
 7987             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7988             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7989             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7990             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7991   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7992   effect(TEMP dst, TEMP xtmp);
 7993   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7994   ins_encode %{
 7995     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7996     int vlen_enc = vector_length_encoding(this, $src1);
 7997     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7998     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7999 
 8000     if (vlen_enc == Assembler::AVX_128bit) {
 8001       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8002     } else {
 8003       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 8004     }
 8005     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8006     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8007     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 8008   %}
 8009   ins_pipe( pipe_slow );
 8010 %}
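// Note: before AVX-512 there are no unsigned packed-integer compares, so vcmpu above flips the
// sign bit of both operands (vpxor with a broadcast high-bit constant) and then uses the signed
// compare: x <u y  iff  (x ^ MIN_VALUE) <s (y ^ MIN_VALUE). Rough scalar sketch only:
//
//   bool unsigned_lt(int32_t x, int32_t y) {
//     return (int32_t)(x ^ 0x80000000) < (int32_t)(y ^ 0x80000000);   // signed compare after flip
//   }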
 8011 
 8012 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 8013   predicate((n->bottom_type()->isa_vectmask() == nullptr &&
 8014              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 8015              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8016   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 8017   effect(TEMP ktmp);
 8018   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 8019   ins_encode %{
 8020     assert(UseAVX > 2, "required");
 8021 
 8022     int vlen_enc = vector_length_encoding(this, $src1);
 8023     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8024     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8025     KRegister mask = k0; // The comparison itself is not being masked.
 8026     bool merge = false;
 8027     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8028 
 8029     switch (src1_elem_bt) {
 8030       case T_INT: {
 8031         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8032         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8033         break;
 8034       }
 8035       case T_LONG: {
 8036         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8037         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 8038         break;
 8039       }
 8040       default: assert(false, "%s", type2name(src1_elem_bt));
 8041     }
 8042   %}
 8043   ins_pipe( pipe_slow );
 8044 %}
 8045 
 8046 
 8047 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 8048   predicate(n->bottom_type()->isa_vectmask() &&
 8049             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 8050   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 8052   ins_encode %{
 8053     assert(UseAVX > 2, "required");
 8054     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 8055 
 8056     int vlen_enc = vector_length_encoding(this, $src1);
 8057     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 8058     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 8059     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8060 
    // The comparison itself is not being masked (k0).
 8062     switch (src1_elem_bt) {
 8063       case T_BYTE: {
 8064         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8065         break;
 8066       }
 8067       case T_SHORT: {
 8068         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8069         break;
 8070       }
 8071       case T_INT: {
 8072         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8073         break;
 8074       }
 8075       case T_LONG: {
 8076         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 8077         break;
 8078       }
 8079       default: assert(false, "%s", type2name(src1_elem_bt));
 8080     }
 8081   %}
 8082   ins_pipe( pipe_slow );
 8083 %}
 8084 
 8085 // Extract
 8086 
 8087 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 8088   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 8089   match(Set dst (ExtractI src idx));
 8090   match(Set dst (ExtractS src idx));
 8091   match(Set dst (ExtractB src idx));
 8092   format %{ "extractI $dst,$src,$idx\t!" %}
 8093   ins_encode %{
 8094     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8095 
 8096     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8097     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8098   %}
 8099   ins_pipe( pipe_slow );
 8100 %}
 8101 
 8102 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 8103   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 8104             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 8105   match(Set dst (ExtractI src idx));
 8106   match(Set dst (ExtractS src idx));
 8107   match(Set dst (ExtractB src idx));
 8108   effect(TEMP vtmp);
 8109   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8110   ins_encode %{
 8111     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8112 
 8113     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 8114     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8115     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 8116   %}
 8117   ins_pipe( pipe_slow );
 8118 %}
 8119 
 8120 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 8121   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 8122   match(Set dst (ExtractL src idx));
 8123   format %{ "extractL $dst,$src,$idx\t!" %}
 8124   ins_encode %{
 8125     assert(UseSSE >= 4, "required");
 8126     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8127 
 8128     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 8129   %}
 8130   ins_pipe( pipe_slow );
 8131 %}
 8132 
 8133 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 8134   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8135             Matcher::vector_length(n->in(1)) == 8);  // src
 8136   match(Set dst (ExtractL src idx));
 8137   effect(TEMP vtmp);
 8138   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8139   ins_encode %{
 8140     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8141 
 8142     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8143     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 8144   %}
 8145   ins_pipe( pipe_slow );
 8146 %}
 8147 
 8148 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8149   predicate(Matcher::vector_length(n->in(1)) <= 4);
 8150   match(Set dst (ExtractF src idx));
 8151   effect(TEMP dst, TEMP vtmp);
 8152   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8153   ins_encode %{
 8154     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8155 
 8156     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 8157   %}
 8158   ins_pipe( pipe_slow );
 8159 %}
 8160 
 8161 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 8162   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 8163             Matcher::vector_length(n->in(1)/*src*/) == 16);
 8164   match(Set dst (ExtractF src idx));
 8165   effect(TEMP vtmp);
 8166   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8167   ins_encode %{
 8168     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8169 
 8170     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8171     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8172   %}
 8173   ins_pipe( pipe_slow );
 8174 %}
 8175 
 8176 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 8177   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 8178   match(Set dst (ExtractD src idx));
 8179   format %{ "extractD $dst,$src,$idx\t!" %}
 8180   ins_encode %{
 8181     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8182 
 8183     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8184   %}
 8185   ins_pipe( pipe_slow );
 8186 %}
 8187 
 8188 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 8189   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 8190             Matcher::vector_length(n->in(1)) == 8);  // src
 8191   match(Set dst (ExtractD src idx));
 8192   effect(TEMP vtmp);
 8193   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 8194   ins_encode %{
 8195     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 8196 
 8197     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 8198     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 8199   %}
 8200   ins_pipe( pipe_slow );
 8201 %}
 8202 
 8203 // --------------------------------- Vector Blend --------------------------------------
 8204 
 8205 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 8206   predicate(UseAVX == 0);
 8207   match(Set dst (VectorBlend (Binary dst src) mask));
 8208   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 8209   effect(TEMP tmp);
 8210   ins_encode %{
 8211     assert(UseSSE >= 4, "required");
 8212 
 8213     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 8214       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 8215     }
 8216     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 8217   %}
 8218   ins_pipe( pipe_slow );
 8219 %}
 8220 
 8221 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8222   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8223             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8224             Matcher::vector_length_in_bytes(n) <= 32 &&
 8225             is_integral_type(Matcher::vector_element_basic_type(n)));
 8226   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8227   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8228   ins_encode %{
 8229     int vlen_enc = vector_length_encoding(this);
 8230     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8231   %}
 8232   ins_pipe( pipe_slow );
 8233 %}
 8234 
 8235 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 8236   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 8237             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8238             Matcher::vector_length_in_bytes(n) <= 32 &&
 8239             !is_integral_type(Matcher::vector_element_basic_type(n)));
 8240   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8241   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8242   ins_encode %{
 8243     int vlen_enc = vector_length_encoding(this);
 8244     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8245   %}
 8246   ins_pipe( pipe_slow );
 8247 %}
 8248 
 8249 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 8250   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 8251             n->in(2)->bottom_type()->isa_vectmask() == nullptr &&
 8252             Matcher::vector_length_in_bytes(n) <= 32);
 8253   match(Set dst (VectorBlend (Binary src1 src2) mask));
 8254   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 8255   effect(TEMP vtmp, TEMP dst);
 8256   ins_encode %{
 8257     int vlen_enc = vector_length_encoding(this);
 8258     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 8259     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8260     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 8261   %}
 8262   ins_pipe( pipe_slow );
 8263 %}
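// Note: the E-core-friendly variant above avoids the variable-blend instructions and instead
// computes dst = (mask & src2) | (~mask & src1) with vpandn/vpand/vpor, relying on mask lanes
// being all-ones or all-zeros (as produced by VectorMaskCmp / VectorLoadMask). Rough sketch only:
//
//   dst = (mask & src2) | (~mask & src1);   // bitwise select, per lane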
 8264 
 8265 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 8266   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 8267             n->in(2)->bottom_type()->isa_vectmask() == nullptr);
 8268   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 8270   effect(TEMP ktmp);
 8271   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8274     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 8275     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8276   %}
 8277   ins_pipe( pipe_slow );
 8278 %}
 8279 
 8280 
 8281 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 8282   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 8283             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 8284              VM_Version::supports_avx512bw()));
 8285   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 8287   ins_encode %{
 8288     int vlen_enc = vector_length_encoding(this);
 8289     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8290     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 8291   %}
 8292   ins_pipe( pipe_slow );
 8293 %}
 8294 
 8295 // --------------------------------- ABS --------------------------------------
 8296 // a = |a|
 8297 instruct vabsB_reg(vec dst, vec src) %{
 8298   match(Set dst (AbsVB  src));
 8299   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 8300   ins_encode %{
 8301     uint vlen = Matcher::vector_length(this);
 8302     if (vlen <= 16) {
 8303       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8304     } else {
 8305       int vlen_enc = vector_length_encoding(this);
 8306       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8307     }
 8308   %}
 8309   ins_pipe( pipe_slow );
 8310 %}
 8311 
 8312 instruct vabsS_reg(vec dst, vec src) %{
 8313   match(Set dst (AbsVS  src));
 8314   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 8315   ins_encode %{
 8316     uint vlen = Matcher::vector_length(this);
 8317     if (vlen <= 8) {
 8318       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8319     } else {
 8320       int vlen_enc = vector_length_encoding(this);
 8321       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8322     }
 8323   %}
 8324   ins_pipe( pipe_slow );
 8325 %}
 8326 
 8327 instruct vabsI_reg(vec dst, vec src) %{
 8328   match(Set dst (AbsVI  src));
 8329   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8330   ins_encode %{
 8331     uint vlen = Matcher::vector_length(this);
 8332     if (vlen <= 4) {
 8333       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8334     } else {
 8335       int vlen_enc = vector_length_encoding(this);
 8336       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8337     }
 8338   %}
 8339   ins_pipe( pipe_slow );
 8340 %}
 8341 
 8342 instruct vabsL_reg(vec dst, vec src) %{
 8343   match(Set dst (AbsVL  src));
 8344   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8345   ins_encode %{
 8346     assert(UseAVX > 2, "required");
 8347     int vlen_enc = vector_length_encoding(this);
 8348     if (!VM_Version::supports_avx512vl()) {
 8349       vlen_enc = Assembler::AVX_512bit;
 8350     }
 8351     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8352   %}
 8353   ins_pipe( pipe_slow );
 8354 %}
 8355 
 8356 // --------------------------------- ABSNEG --------------------------------------
 8357 
 8358 instruct vabsnegF(vec dst, vec src) %{
 8359   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8360   match(Set dst (AbsVF src));
 8361   match(Set dst (NegVF src));
 8362   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8363   ins_cost(150);
 8364   ins_encode %{
 8365     int opcode = this->ideal_Opcode();
 8366     int vlen = Matcher::vector_length(this);
 8367     if (vlen == 2) {
 8368       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8369     } else {
 8370       assert(vlen == 8 || vlen == 16, "required");
 8371       int vlen_enc = vector_length_encoding(this);
 8372       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8373     }
 8374   %}
 8375   ins_pipe( pipe_slow );
 8376 %}
 8377 
 8378 instruct vabsneg4F(vec dst) %{
 8379   predicate(Matcher::vector_length(n) == 4);
 8380   match(Set dst (AbsVF dst));
 8381   match(Set dst (NegVF dst));
 8382   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8383   ins_cost(150);
 8384   ins_encode %{
 8385     int opcode = this->ideal_Opcode();
 8386     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8387   %}
 8388   ins_pipe( pipe_slow );
 8389 %}
 8390 
 8391 instruct vabsnegD(vec dst, vec src) %{
 8392   match(Set dst (AbsVD  src));
 8393   match(Set dst (NegVD  src));
 8394   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8395   ins_encode %{
 8396     int opcode = this->ideal_Opcode();
 8397     uint vlen = Matcher::vector_length(this);
 8398     if (vlen == 2) {
 8399       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8400     } else {
 8401       int vlen_enc = vector_length_encoding(this);
 8402       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8403     }
 8404   %}
 8405   ins_pipe( pipe_slow );
 8406 %}
 8407 
 8408 //------------------------------------- VectorTest --------------------------------------------
 8409 
 8410 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8411   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8412   match(Set cr (VectorTest src1 src2));
 8413   effect(TEMP vtmp);
 8414   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8415   ins_encode %{
 8416     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8417     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8418     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8423 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8424   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8425   match(Set cr (VectorTest src1 src2));
 8426   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8427   ins_encode %{
 8428     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8429     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8430     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8431   %}
 8432   ins_pipe( pipe_slow );
 8433 %}
 8434 
 8435 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8436   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8437              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8438             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8439   match(Set cr (VectorTest src1 src2));
 8440   effect(TEMP tmp);
 8441   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8442   ins_encode %{
 8443     uint masklen = Matcher::vector_length(this, $src1);
 8444     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8445     __ andl($tmp$$Register, (1 << masklen) - 1);
 8446     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8447   %}
 8448   ins_pipe( pipe_slow );
 8449 %}
 8450 
 8451 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8452   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8453              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8454             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8455   match(Set cr (VectorTest src1 src2));
 8456   effect(TEMP tmp);
 8457   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8458   ins_encode %{
 8459     uint masklen = Matcher::vector_length(this, $src1);
 8460     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8461     __ andl($tmp$$Register, (1 << masklen) - 1);
 8462   %}
 8463   ins_pipe( pipe_slow );
 8464 %}
 8465 
 8466 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8467   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8468             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8469   match(Set cr (VectorTest src1 src2));
 8470   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8471   ins_encode %{
 8472     uint masklen = Matcher::vector_length(this, $src1);
 8473     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8474   %}
 8475   ins_pipe( pipe_slow );
 8476 %}
 8477 
 8478 //------------------------------------- LoadMask --------------------------------------------
 8479 
 8480 instruct loadMask(legVec dst, legVec src) %{
 8481   predicate(n->bottom_type()->isa_vectmask() == nullptr && !VM_Version::supports_avx512vlbw());
 8482   match(Set dst (VectorLoadMask src));
 8483   effect(TEMP dst);
 8484   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8485   ins_encode %{
 8486     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8487     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8488     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8489   %}
 8490   ins_pipe( pipe_slow );
 8491 %}
 8492 
 8493 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8494   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8495   match(Set dst (VectorLoadMask src));
 8496   effect(TEMP xtmp);
 8497   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8498   ins_encode %{
 8499     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8500                         true, Assembler::AVX_512bit);
 8501   %}
 8502   ins_pipe( pipe_slow );
 8503 %}
 8504 
 8505 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8506   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8507   match(Set dst (VectorLoadMask src));
 8508   effect(TEMP xtmp);
 8509   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8510   ins_encode %{
 8511     int vlen_enc = vector_length_encoding(in(1));
 8512     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8513                         false, vlen_enc);
 8514   %}
 8515   ins_pipe( pipe_slow );
 8516 %}
 8517 
 8518 //------------------------------------- StoreMask --------------------------------------------
 8519 
 8520 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8521   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8522   match(Set dst (VectorStoreMask src size));
 8523   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8524   ins_encode %{
 8525     int vlen = Matcher::vector_length(this);
 8526     if (vlen <= 16 && UseAVX <= 2) {
 8527       assert(UseSSE >= 3, "required");
 8528       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8529     } else {
 8530       assert(UseAVX > 0, "required");
 8531       int src_vlen_enc = vector_length_encoding(this, $src);
 8532       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8533     }
 8534   %}
 8535   ins_pipe( pipe_slow );
 8536 %}
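// Note: VectorStoreMask turns mask lanes (0 or -1) into boolean bytes (0 or 1). For byte-sized
// elements the rule above only needs a single pabsb/vpabsb, since |-1| == 1; the wider-element
// rules below additionally narrow each lane to a byte around the abs step. Rough sketch only:
//
//   dst[i] = (src[i] != 0) ? 1 : 0;   // equivalent to abs() on a 0/-1 mask lane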
 8537 
 8538 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8539   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8540   match(Set dst (VectorStoreMask src size));
 8541   effect(TEMP_DEF dst, TEMP xtmp);
 8542   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8543   ins_encode %{
 8544     int vlen_enc = Assembler::AVX_128bit;
 8545     int vlen = Matcher::vector_length(this);
 8546     if (vlen <= 8) {
 8547       assert(UseSSE >= 3, "required");
 8548       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8549       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8550       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8551     } else {
 8552       assert(UseAVX > 0, "required");
 8553       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8554       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8555       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8556     }
 8557   %}
 8558   ins_pipe( pipe_slow );
 8559 %}
 8560 
 8561 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8562   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8563   match(Set dst (VectorStoreMask src size));
 8564   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8565   effect(TEMP_DEF dst, TEMP xtmp);
 8566   ins_encode %{
 8567     int vlen_enc = Assembler::AVX_128bit;
 8568     int vlen = Matcher::vector_length(this);
 8569     if (vlen <= 4) {
 8570       assert(UseSSE >= 3, "required");
 8571       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8572       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8573       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8574       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8575     } else {
 8576       assert(UseAVX > 0, "required");
 8577       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8578       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8579       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8580       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8581       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8582     }
 8583   %}
 8584   ins_pipe( pipe_slow );
 8585 %}
 8586 
 8587 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8588   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8589   match(Set dst (VectorStoreMask src size));
 8590   effect(TEMP_DEF dst, TEMP xtmp);
 8591   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8592   ins_encode %{
 8593     assert(UseSSE >= 3, "required");
 8594     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8595     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8596     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8597     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8598     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8599   %}
 8600   ins_pipe( pipe_slow );
 8601 %}
 8602 
 8603 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8604   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8605   match(Set dst (VectorStoreMask src size));
 8606   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8607   effect(TEMP_DEF dst, TEMP vtmp);
 8608   ins_encode %{
 8609     int vlen_enc = Assembler::AVX_128bit;
 8610     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8611     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8612     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8613     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8614     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8615     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8616     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8617   %}
 8618   ins_pipe( pipe_slow );
 8619 %}
 8620 
 8621 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8622   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8623   match(Set dst (VectorStoreMask src size));
 8624   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8625   ins_encode %{
 8626     int src_vlen_enc = vector_length_encoding(this, $src);
 8627     int dst_vlen_enc = vector_length_encoding(this);
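          // Without AVX512VL the EVEX down-convert must use the 512-bit encoding.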
 8628     if (!VM_Version::supports_avx512vl()) {
 8629       src_vlen_enc = Assembler::AVX_512bit;
 8630     }
 8631     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8632     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8633   %}
 8634   ins_pipe( pipe_slow );
 8635 %}
 8636 
 8637 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8638   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 8639   match(Set dst (VectorStoreMask src size));
 8640   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8641   ins_encode %{
 8642     int src_vlen_enc = vector_length_encoding(this, $src);
 8643     int dst_vlen_enc = vector_length_encoding(this);
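          // Without AVX512VL the EVEX down-convert must use the 512-bit encoding.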
 8644     if (!VM_Version::supports_avx512vl()) {
 8645       src_vlen_enc = Assembler::AVX_512bit;
 8646     }
 8647     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8648     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8649   %}
 8650   ins_pipe( pipe_slow );
 8651 %}
 8652 
 8653 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8654   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8655   match(Set dst (VectorStoreMask mask size));
 8656   effect(TEMP_DEF dst);
 8657   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8658   ins_encode %{
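          // Zero-masked load of the int mask constant materializes the opmask as int
          // lanes, which are then narrowed to bytes.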
 8659     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8660     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8661                  false, Assembler::AVX_512bit, noreg);
 8662     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8663   %}
 8664   ins_pipe( pipe_slow );
 8665 %}
 8666 
 8667 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8668   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8669   match(Set dst (VectorStoreMask mask size));
 8670   effect(TEMP_DEF dst);
 8671   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8672   ins_encode %{
 8673     int dst_vlen_enc = vector_length_encoding(this);
 8674     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8675     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8676   %}
 8677   ins_pipe( pipe_slow );
 8678 %}
 8679 
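      // Opmask-to-opmask casts and same-size vector mask casts reuse the source
      // register unchanged, so they emit no code.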
 8680 instruct vmaskcast_evex(kReg dst) %{
 8681   match(Set dst (VectorMaskCast dst));
 8682   ins_cost(0);
 8683   format %{ "vector_mask_cast $dst" %}
 8684   ins_encode %{
 8685     // empty
 8686   %}
 8687   ins_pipe(empty);
 8688 %}
 8689 
 8690 instruct vmaskcast(vec dst) %{
 8691   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8692   match(Set dst (VectorMaskCast dst));
 8693   ins_cost(0);
 8694   format %{ "vector_mask_cast $dst" %}
 8695   ins_encode %{
 8696     // empty
 8697   %}
 8698   ins_pipe(empty);
 8699 %}
 8700 
 8701 instruct vmaskcast_avx(vec dst, vec src) %{
 8702   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8703   match(Set dst (VectorMaskCast src));
 8704   format %{ "vector_mask_cast $dst, $src" %}
 8705   ins_encode %{
 8706     int vlen = Matcher::vector_length(this);
 8707     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8708     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8709     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8710   %}
 8711   ins_pipe(pipe_slow);
 8712 %}
 8713 
 8714 //-------------------------------- Load Iota Indices ----------------------------------
 8715 
 8716 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8717   match(Set dst (VectorLoadConst src));
 8718   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8719   ins_encode %{
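           // Loads the iota constant (0, 1, 2, ...) matching this vector's element type and length.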
 8720      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8721      BasicType bt = Matcher::vector_element_basic_type(this);
 8722      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8723   %}
 8724   ins_pipe( pipe_slow );
 8725 %}
 8726 
 8727 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8728   match(Set dst (PopulateIndex src1 src2));
 8729   effect(TEMP dst, TEMP vtmp);
 8730   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8731   ins_encode %{
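           // dst[i] = src1 + i: broadcast the scalar start value, then add the iota sequence.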
 8732      assert($src2$$constant == 1, "required");
 8733      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8734      int vlen_enc = vector_length_encoding(this);
 8735      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8736      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8737      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8738      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8739   %}
 8740   ins_pipe( pipe_slow );
 8741 %}
 8742 
 8743 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8744   match(Set dst (PopulateIndex src1 src2));
 8745   effect(TEMP dst, TEMP vtmp);
 8746   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8747   ins_encode %{
 8748      assert($src2$$constant == 1, "required");
 8749      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8750      int vlen_enc = vector_length_encoding(this);
 8751      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8752      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8753      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8754      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8755   %}
 8756   ins_pipe( pipe_slow );
 8757 %}
 8758 
 8759 //-------------------------------- Rearrange ----------------------------------
 8760 
 8761 // LoadShuffle/Rearrange for Byte
 8762 instruct rearrangeB(vec dst, vec shuffle) %{
 8763   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8764             Matcher::vector_length(n) < 32);
 8765   match(Set dst (VectorRearrange dst shuffle));
 8766   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8767   ins_encode %{
 8768     assert(UseSSE >= 4, "required");
 8769     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8770   %}
 8771   ins_pipe( pipe_slow );
 8772 %}
 8773 
 8774 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8775   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8776             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8777   match(Set dst (VectorRearrange src shuffle));
 8778   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8779   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8780   ins_encode %{
 8781     assert(UseAVX >= 2, "required");
 8782     // Swap src into vtmp1
 8783     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8784     // Shuffle swapped src to get entries from the other 128-bit lane
 8785     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8786     // Shuffle original src to get entries from its own 128-bit lane
 8787     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8788     // Create a blend mask by setting the high bit for entries coming from the other lane
 8789     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8790     // Perform the blend
 8791     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 
 8797 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8798   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8799             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8800   match(Set dst (VectorRearrange src shuffle));
 8801   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8802   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8803   ins_encode %{
 8804     int vlen_enc = vector_length_encoding(this);
 8805     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8806                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8807                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8808   %}
 8809   ins_pipe( pipe_slow );
 8810 %}
 8811 
 8812 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8813   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8814             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8815   match(Set dst (VectorRearrange src shuffle));
 8816   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8817   ins_encode %{
 8818     int vlen_enc = vector_length_encoding(this);
 8819     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8820   %}
 8821   ins_pipe( pipe_slow );
 8822 %}
 8823 
 8824 // LoadShuffle/Rearrange for Short
 8825 
 8826 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8827   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8828             !VM_Version::supports_avx512bw());
 8829   match(Set dst (VectorLoadShuffle src));
 8830   effect(TEMP dst, TEMP vtmp);
 8831   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8832   ins_encode %{
 8833     // Create a byte shuffle mask from the short shuffle mask,
 8834     // since only a byte shuffle instruction is available on these platforms.
 8835     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8836     if (UseAVX == 0) {
 8837       assert(vlen_in_bytes <= 16, "required");
 8838       // Multiply each shuffle by two to get byte index
 8839       __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8840       __ psllw($vtmp$$XMMRegister, 1);
 8841 
 8842       // Duplicate to create 2 copies of byte index
 8843       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8844       __ psllw($dst$$XMMRegister, 8);
 8845       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8846 
 8847       // Add one to get alternate byte index
 8848       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8849       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8850     } else {
 8851       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8852       int vlen_enc = vector_length_encoding(this);
 8853       // Multiply each shuffle by two to get byte index
 8854       __ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8855 
 8856       // Duplicate to create 2 copies of byte index
 8857       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8858       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8859 
 8860       // Add one to get alternate byte index
 8861       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8862     }
 8863   %}
 8864   ins_pipe( pipe_slow );
 8865 %}
 8866 
 8867 instruct rearrangeS(vec dst, vec shuffle) %{
 8868   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8869             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8870   match(Set dst (VectorRearrange dst shuffle));
 8871   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8872   ins_encode %{
 8873     assert(UseSSE >= 4, "required");
 8874     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8875   %}
 8876   ins_pipe( pipe_slow );
 8877 %}
 8878 
 8879 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8880   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8881             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8882   match(Set dst (VectorRearrange src shuffle));
 8883   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8884   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8885   ins_encode %{
 8886     assert(UseAVX >= 2, "required");
 8887     // Swap src into vtmp1
 8888     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8889     // Shuffle swapped src to get entries from the other 128-bit lane
 8890     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8891     // Shuffle original src to get entries from its own 128-bit lane
 8892     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8893     // Create a blend mask by setting the high bit for entries coming from the other lane
 8894     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8895     // Perform the blend
 8896     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8897   %}
 8898   ins_pipe( pipe_slow );
 8899 %}
 8900 
 8901 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8902   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8903             VM_Version::supports_avx512bw());
 8904   match(Set dst (VectorRearrange src shuffle));
 8905   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8906   ins_encode %{
 8907     int vlen_enc = vector_length_encoding(this);
 8908     if (!VM_Version::supports_avx512vl()) {
 8909       vlen_enc = Assembler::AVX_512bit;
 8910     }
 8911     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8912   %}
 8913   ins_pipe( pipe_slow );
 8914 %}
 8915 
 8916 // LoadShuffle/Rearrange for Integer and Float
 8917 
 8918 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8919   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8920             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8921   match(Set dst (VectorLoadShuffle src));
 8922   effect(TEMP dst, TEMP vtmp);
 8923   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8924   ins_encode %{
 8925     assert(UseSSE >= 4, "required");
 8926 
 8927     // Create a byte shuffle mask from the int shuffle mask,
 8928     // since only a byte shuffle instruction is available on these platforms.
 8929 
 8930     // Duplicate and multiply each shuffle by 4
 8931     __ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
 8932     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8933     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8934     __ psllw($vtmp$$XMMRegister, 2);
 8935 
 8936     // Duplicate again to create 4 copies of byte index
 8937     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8938     __ psllw($dst$$XMMRegister, 8);
 8939     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8940 
 8941     // Add 3,2,1,0 to get alternate byte index
 8942     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8943     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8944   %}
 8945   ins_pipe( pipe_slow );
 8946 %}
 8947 
 8948 instruct rearrangeI(vec dst, vec shuffle) %{
 8949   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8950             UseAVX == 0);
 8951   match(Set dst (VectorRearrange dst shuffle));
 8952   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8953   ins_encode %{
 8954     assert(UseSSE >= 4, "required");
 8955     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8956   %}
 8957   ins_pipe( pipe_slow );
 8958 %}
 8959 
 8960 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8961   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8962             UseAVX > 0);
 8963   match(Set dst (VectorRearrange src shuffle));
 8964   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8965   ins_encode %{
 8966     int vlen_enc = vector_length_encoding(this);
 8967     BasicType bt = Matcher::vector_element_basic_type(this);
 8968     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8969   %}
 8970   ins_pipe( pipe_slow );
 8971 %}
 8972 
 8973 // LoadShuffle/Rearrange for Long and Double
 8974 
 8975 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8976   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8977             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8978   match(Set dst (VectorLoadShuffle src));
 8979   effect(TEMP dst, TEMP vtmp);
 8980   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8981   ins_encode %{
 8982     assert(UseAVX >= 2, "required");
 8983 
 8984     int vlen_enc = vector_length_encoding(this);
 8985     // Create a double word shuffle mask from the long shuffle mask,
 8986     // since only a double word shuffle instruction is available on these platforms.
 8987 
 8988     // Multiply each shuffle by two to get double word index
 8989     __ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
 8990 
 8991     // Duplicate each double word shuffle
 8992     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8993     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8994 
 8995     // Add one to get alternate double word index
 8996     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8997   %}
 8998   ins_pipe( pipe_slow );
 8999 %}
 9000 
 9001 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 9002   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9003             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 9004   match(Set dst (VectorRearrange src shuffle));
 9005   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9006   ins_encode %{
 9007     assert(UseAVX >= 2, "required");
 9008 
 9009     int vlen_enc = vector_length_encoding(this);
 9010     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9011   %}
 9012   ins_pipe( pipe_slow );
 9013 %}
 9014 
 9015 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 9016   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 9017             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 9018   match(Set dst (VectorRearrange src shuffle));
 9019   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 9020   ins_encode %{
 9021     assert(UseAVX > 2, "required");
 9022 
 9023     int vlen_enc = vector_length_encoding(this);
 9024     if (vlen_enc == Assembler::AVX_128bit) {
 9025       vlen_enc = Assembler::AVX_256bit;
 9026     }
 9027     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9028   %}
 9029   ins_pipe( pipe_slow );
 9030 %}
 9031 
 9032 // --------------------------------- FMA --------------------------------------
 9033 // a * b + c
 9034 
 9035 instruct vfmaF_reg(vec a, vec b, vec c) %{
 9036   match(Set c (FmaVF  c (Binary a b)));
 9037   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9038   ins_cost(150);
 9039   ins_encode %{
 9040     assert(UseFMA, "not enabled");
 9041     int vlen_enc = vector_length_encoding(this);
 9042     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9043   %}
 9044   ins_pipe( pipe_slow );
 9045 %}
 9046 
 9047 instruct vfmaF_mem(vec a, memory b, vec c) %{
 9048   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9049   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 9050   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 9051   ins_cost(150);
 9052   ins_encode %{
 9053     assert(UseFMA, "not enabled");
 9054     int vlen_enc = vector_length_encoding(this);
 9055     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9056   %}
 9057   ins_pipe( pipe_slow );
 9058 %}
 9059 
 9060 instruct vfmaD_reg(vec a, vec b, vec c) %{
 9061   match(Set c (FmaVD  c (Binary a b)));
 9062   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9063   ins_cost(150);
 9064   ins_encode %{
 9065     assert(UseFMA, "not enabled");
 9066     int vlen_enc = vector_length_encoding(this);
 9067     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 9068   %}
 9069   ins_pipe( pipe_slow );
 9070 %}
 9071 
 9072 instruct vfmaD_mem(vec a, memory b, vec c) %{
 9073   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 9074   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 9075   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 9076   ins_cost(150);
 9077   ins_encode %{
 9078     assert(UseFMA, "not enabled");
 9079     int vlen_enc = vector_length_encoding(this);
 9080     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 9081   %}
 9082   ins_pipe( pipe_slow );
 9083 %}
 9084 
 9085 // --------------------------------- Vector Multiply Add --------------------------------------
 9086 
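      // MulAddVS2VI multiplies adjacent pairs of shorts and adds each pair into an int lane
      // (pmaddwd / vpmaddwd).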
 9087 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 9088   predicate(UseAVX == 0);
 9089   match(Set dst (MulAddVS2VI dst src1));
 9090   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 9091   ins_encode %{
 9092     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 9093   %}
 9094   ins_pipe( pipe_slow );
 9095 %}
 9096 
 9097 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 9098   predicate(UseAVX > 0);
 9099   match(Set dst (MulAddVS2VI src1 src2));
 9100   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 9101   ins_encode %{
 9102     int vlen_enc = vector_length_encoding(this);
 9103     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9104   %}
 9105   ins_pipe( pipe_slow );
 9106 %}
 9107 
 9108 // --------------------------------- Vector Multiply Add Add ----------------------------------
 9109 
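      // With AVX512_VNNI the multiply-add and the accumulate into dst fuse into a single evpdpwssd.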
 9110 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 9111   predicate(VM_Version::supports_avx512_vnni());
 9112   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 9113   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 9114   ins_encode %{
 9115     assert(UseAVX > 2, "required");
 9116     int vlen_enc = vector_length_encoding(this);
 9117     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 9118   %}
 9119   ins_pipe( pipe_slow );
 9120   ins_cost(10);
 9121 %}
 9122 
 9123 // --------------------------------- PopCount --------------------------------------
 9124 
 9125 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 9126   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9127   match(Set dst (PopCountVI src));
 9128   match(Set dst (PopCountVL src));
 9129   format %{ "vector_popcount_integral $dst, $src" %}
 9130   ins_encode %{
 9131     int opcode = this->ideal_Opcode();
 9132     int vlen_enc = vector_length_encoding(this, $src);
 9133     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9134     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 9135   %}
 9136   ins_pipe( pipe_slow );
 9137 %}
 9138 
 9139 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9140   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9141   match(Set dst (PopCountVI src mask));
 9142   match(Set dst (PopCountVL src mask));
 9143   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 9144   ins_encode %{
 9145     int vlen_enc = vector_length_encoding(this, $src);
 9146     BasicType bt = Matcher::vector_element_basic_type(this, $src);
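          // Merge-masked popcount: copy src into dst first so unselected lanes keep the source value.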
 9147     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9148     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 9149   %}
 9150   ins_pipe( pipe_slow );
 9151 %}
 9152 
 9153 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 9154   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 9155   match(Set dst (PopCountVI src));
 9156   match(Set dst (PopCountVL src));
 9157   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9158   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 9159   ins_encode %{
 9160     int opcode = this->ideal_Opcode();
 9161     int vlen_enc = vector_length_encoding(this, $src);
 9162     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9163     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9164                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 9165   %}
 9166   ins_pipe( pipe_slow );
 9167 %}
 9168 
 9169 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 9170 
 9171 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 9172   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9173                                               Matcher::vector_length_in_bytes(n->in(1))));
 9174   match(Set dst (CountTrailingZerosV src));
 9175   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 9176   ins_cost(400);
 9177   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 9178   ins_encode %{
 9179     int vlen_enc = vector_length_encoding(this, $src);
 9180     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9181     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9182                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9183   %}
 9184   ins_pipe( pipe_slow );
 9185 %}
 9186 
 9187 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9188   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9189             VM_Version::supports_avx512cd() &&
 9190             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9191   match(Set dst (CountTrailingZerosV src));
 9192   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9193   ins_cost(400);
 9194   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9195   ins_encode %{
 9196     int vlen_enc = vector_length_encoding(this, $src);
 9197     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9198     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9199                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9200   %}
 9201   ins_pipe( pipe_slow );
 9202 %}
 9203 
 9204 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9205   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9206   match(Set dst (CountTrailingZerosV src));
 9207   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9208   ins_cost(400);
 9209   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9210   ins_encode %{
 9211     int vlen_enc = vector_length_encoding(this, $src);
 9212     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9213     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9214                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9215                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9216   %}
 9217   ins_pipe( pipe_slow );
 9218 %}
 9219 
 9220 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9221   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9222   match(Set dst (CountTrailingZerosV src));
 9223   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9224   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9225   ins_encode %{
 9226     int vlen_enc = vector_length_encoding(this, $src);
 9227     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9228     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9229                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9230   %}
 9231   ins_pipe( pipe_slow );
 9232 %}
 9233 
 9234 
 9235 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 9236 
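      // $func is the 8-bit truth table applied bitwise to the three inputs (dst, src2, src3).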
 9237 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9238   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9239   effect(TEMP dst);
 9240   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9241   ins_encode %{
 9242     int vector_len = vector_length_encoding(this);
 9243     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9244   %}
 9245   ins_pipe( pipe_slow );
 9246 %}
 9247 
 9248 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9249   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9250   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9251   effect(TEMP dst);
 9252   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9253   ins_encode %{
 9254     int vector_len = vector_length_encoding(this);
 9255     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9256   %}
 9257   ins_pipe( pipe_slow );
 9258 %}
 9259 
 9260 // --------------------------------- Rotation Operations ----------------------------------
 9261 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9262   match(Set dst (RotateLeftV src shift));
 9263   match(Set dst (RotateRightV src shift));
 9264   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9265   ins_encode %{
 9266     int opcode      = this->ideal_Opcode();
 9267     int vector_len  = vector_length_encoding(this);
 9268     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9269     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9270   %}
 9271   ins_pipe( pipe_slow );
 9272 %}
 9273 
 9274 instruct vprorate(vec dst, vec src, vec shift) %{
 9275   match(Set dst (RotateLeftV src shift));
 9276   match(Set dst (RotateRightV src shift));
 9277   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9278   ins_encode %{
 9279     int opcode      = this->ideal_Opcode();
 9280     int vector_len  = vector_length_encoding(this);
 9281     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9282     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9283   %}
 9284   ins_pipe( pipe_slow );
 9285 %}
 9286 
 9287 // ---------------------------------- Masked Operations ------------------------------------
 9288 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9289   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9290   match(Set dst (LoadVectorMasked mem mask));
 9291   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9292   ins_encode %{
 9293     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9294     int vlen_enc = vector_length_encoding(this);
 9295     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9296   %}
 9297   ins_pipe( pipe_slow );
 9298 %}
 9299 
 9300 
 9301 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9302   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9303   match(Set dst (LoadVectorMasked mem mask));
 9304   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9305   ins_encode %{
 9306     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9307     int vector_len = vector_length_encoding(this);
 9308     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9309   %}
 9310   ins_pipe( pipe_slow );
 9311 %}
 9312 
 9313 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9314   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9315   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9316   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9317   ins_encode %{
 9318     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9319     int vlen_enc = vector_length_encoding(src_node);
 9320     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9321     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9322   %}
 9323   ins_pipe( pipe_slow );
 9324 %}
 9325 
 9326 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9327   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9328   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9329   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9330   ins_encode %{
 9331     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9332     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9333     int vlen_enc = vector_length_encoding(src_node);
 9334     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9335   %}
 9336   ins_pipe( pipe_slow );
 9337 %}
 9338 
 9339 instruct verify_vector_alignment(rRegP addr, immL32 mask, rFlagsReg cr) %{
 9340   match(Set addr (VerifyVectorAlignment addr mask));
 9341   effect(KILL cr);
 9342   format %{ "verify_vector_alignment $addr $mask \t! verify alignment" %}
 9343   ins_encode %{
 9344     Label Lskip;
 9345     // check if masked bits of addr are zero
 9346     __ testq($addr$$Register, $mask$$constant);
 9347     __ jccb(Assembler::equal, Lskip);
 9348     __ stop("verify_vector_alignment found a misaligned vector memory access");
 9349     __ bind(Lskip);
 9350   %}
 9351   ins_pipe(pipe_slow);
 9352 %}
 9353 
 9354 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9355   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9356   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9357   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9358   ins_encode %{
 9359     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9360     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9361 
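          // Result is -1 when every lane selected by $mask compares equal; otherwise it is the
          // position of the first clear bit in the masked compare (the first mismatch for a prefix mask).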
 9362     Label DONE;
 9363     int vlen_enc = vector_length_encoding(this, $src1);
 9364     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9365 
 9366     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9367     __ mov64($dst$$Register, -1L);
 9368     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9369     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9370     __ jccb(Assembler::carrySet, DONE);
 9371     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9372     __ notq($dst$$Register);
 9373     __ tzcntq($dst$$Register, $dst$$Register);
 9374     __ bind(DONE);
 9375   %}
 9376   ins_pipe( pipe_slow );
 9377 %}
 9378 
 9379 
 9380 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9381   match(Set dst (VectorMaskGen len));
 9382   effect(TEMP temp, KILL cr);
 9383   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9384   ins_encode %{
 9385     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9386   %}
 9387   ins_pipe( pipe_slow );
 9388 %}
 9389 
 9390 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9391   match(Set dst (VectorMaskGen len));
 9392   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9393   effect(TEMP temp);
 9394   ins_encode %{
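          // Build a mask with the low $len bits set.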
 9395     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9396     __ kmovql($dst$$KRegister, $temp$$Register);
 9397   %}
 9398   ins_pipe( pipe_slow );
 9399 %}
 9400 
 9401 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9402   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9403   match(Set dst (VectorMaskToLong mask));
 9404   effect(TEMP dst, KILL cr);
 9405   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9406   ins_encode %{
 9407     int opcode = this->ideal_Opcode();
 9408     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9409     int mask_len = Matcher::vector_length(this, $mask);
 9410     int mask_size = mask_len * type2aelembytes(mbt);
 9411     int vlen_enc = vector_length_encoding(this, $mask);
 9412     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9413                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9414   %}
 9415   ins_pipe( pipe_slow );
 9416 %}
 9417 
 9418 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9419   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9420   match(Set dst (VectorMaskToLong mask));
 9421   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9422   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9423   ins_encode %{
 9424     int opcode = this->ideal_Opcode();
 9425     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9426     int mask_len = Matcher::vector_length(this, $mask);
 9427     int vlen_enc = vector_length_encoding(this, $mask);
 9428     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9429                              $dst$$Register, mask_len, mbt, vlen_enc);
 9430   %}
 9431   ins_pipe( pipe_slow );
 9432 %}
 9433 
 9434 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9435   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9436   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9437   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9438   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9439   ins_encode %{
 9440     int opcode = this->ideal_Opcode();
 9441     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9442     int mask_len = Matcher::vector_length(this, $mask);
 9443     int vlen_enc = vector_length_encoding(this, $mask);
 9444     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9445                              $dst$$Register, mask_len, mbt, vlen_enc);
 9446   %}
 9447   ins_pipe( pipe_slow );
 9448 %}
 9449 
 9450 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9451   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9452   match(Set dst (VectorMaskTrueCount mask));
 9453   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9454   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9455   ins_encode %{
 9456     int opcode = this->ideal_Opcode();
 9457     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9458     int mask_len = Matcher::vector_length(this, $mask);
 9459     int mask_size = mask_len * type2aelembytes(mbt);
 9460     int vlen_enc = vector_length_encoding(this, $mask);
 9461     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9462                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9463   %}
 9464   ins_pipe( pipe_slow );
 9465 %}
 9466 
 9467 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9468   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9469   match(Set dst (VectorMaskTrueCount mask));
 9470   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9471   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9472   ins_encode %{
 9473     int opcode = this->ideal_Opcode();
 9474     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9475     int mask_len = Matcher::vector_length(this, $mask);
 9476     int vlen_enc = vector_length_encoding(this, $mask);
 9477     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9478                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9479   %}
 9480   ins_pipe( pipe_slow );
 9481 %}
 9482 
 9483 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9484   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9485   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9486   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9487   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9488   ins_encode %{
 9489     int opcode = this->ideal_Opcode();
 9490     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9491     int mask_len = Matcher::vector_length(this, $mask);
 9492     int vlen_enc = vector_length_encoding(this, $mask);
 9493     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9494                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9495   %}
 9496   ins_pipe( pipe_slow );
 9497 %}
 9498 
 9499 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9500   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9501   match(Set dst (VectorMaskFirstTrue mask));
 9502   match(Set dst (VectorMaskLastTrue mask));
 9503   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9504   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9505   ins_encode %{
 9506     int opcode = this->ideal_Opcode();
 9507     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9508     int mask_len = Matcher::vector_length(this, $mask);
 9509     int mask_size = mask_len * type2aelembytes(mbt);
 9510     int vlen_enc = vector_length_encoding(this, $mask);
 9511     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9512                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9513   %}
 9514   ins_pipe( pipe_slow );
 9515 %}
 9516 
 9517 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9518   predicate(n->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9519   match(Set dst (VectorMaskFirstTrue mask));
 9520   match(Set dst (VectorMaskLastTrue mask));
 9521   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9522   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9523   ins_encode %{
 9524     int opcode = this->ideal_Opcode();
 9525     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9526     int mask_len = Matcher::vector_length(this, $mask);
 9527     int vlen_enc = vector_length_encoding(this, $mask);
 9528     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9529                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9530   %}
 9531   ins_pipe( pipe_slow );
 9532 %}
 9533 
 9534 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9535   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == nullptr);
 9536   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9537   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9538   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9539   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9540   ins_encode %{
 9541     int opcode = this->ideal_Opcode();
 9542     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9543     int mask_len = Matcher::vector_length(this, $mask);
 9544     int vlen_enc = vector_length_encoding(this, $mask);
 9545     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9546                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9547   %}
 9548   ins_pipe( pipe_slow );
 9549 %}
 9550 
 9551 // --------------------------------- Compress/Expand Operations ---------------------------
 9552 instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
 9553   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 9554   match(Set dst (CompressV src mask));
 9555   match(Set dst (ExpandV src mask));
 9556   effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
 9557   format %{ "vector_compress $dst, $src, $mask \t! using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
 9558   ins_encode %{
 9559     int opcode = this->ideal_Opcode();
 9560     int vlen_enc = vector_length_encoding(this);
 9561     BasicType bt  = Matcher::vector_element_basic_type(this);
 9562     __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
 9563                                    $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
 9564   %}
 9565   ins_pipe( pipe_slow );
 9566 %}
 9567 
 9568 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9569   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 9570   match(Set dst (CompressV src mask));
 9571   match(Set dst (ExpandV src mask));
 9572   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9573   ins_encode %{
 9574     int opcode = this->ideal_Opcode();
 9575     int vector_len = vector_length_encoding(this);
 9576     BasicType bt  = Matcher::vector_element_basic_type(this);
 9577     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9578   %}
 9579   ins_pipe( pipe_slow );
 9580 %}
 9581 
 9582 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9583   match(Set dst (CompressM mask));
 9584   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9585   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9586   ins_encode %{
 9587     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9588     int mask_len = Matcher::vector_length(this);
 9589     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9590   %}
 9591   ins_pipe( pipe_slow );
 9592 %}
 9593 
 9594 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9595 
 9596 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9597   predicate(!VM_Version::supports_gfni());
 9598   match(Set dst (ReverseV src));
 9599   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9600   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9601   ins_encode %{
 9602     int vec_enc = vector_length_encoding(this);
 9603     BasicType bt = Matcher::vector_element_basic_type(this);
 9604     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9605                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9606   %}
 9607   ins_pipe( pipe_slow );
 9608 %}
 9609 
 9610 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9611   predicate(VM_Version::supports_gfni());
 9612   match(Set dst (ReverseV src));
 9613   effect(TEMP dst, TEMP xtmp);
 9614   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9615   ins_encode %{
 9616     int vec_enc = vector_length_encoding(this);
 9617     BasicType bt  = Matcher::vector_element_basic_type(this);
 9618     InternalAddress addr = $constantaddress(jlong(0x8040201008040201));
 9619     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9620                                $xtmp$$XMMRegister);
 9621   %}
 9622   ins_pipe( pipe_slow );
 9623 %}
 9624 
 9625 instruct vreverse_byte_reg(vec dst, vec src) %{
 9626   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9627   match(Set dst (ReverseBytesV src));
 9628   effect(TEMP dst);
 9629   format %{ "vector_reverse_byte $dst, $src" %}
 9630   ins_encode %{
 9631     int vec_enc = vector_length_encoding(this);
 9632     BasicType bt = Matcher::vector_element_basic_type(this);
 9633     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9634   %}
 9635   ins_pipe( pipe_slow );
 9636 %}
 9637 
 9638 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9639   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9640   match(Set dst (ReverseBytesV src));
 9641   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9642   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9643   ins_encode %{
 9644     int vec_enc = vector_length_encoding(this);
 9645     BasicType bt = Matcher::vector_element_basic_type(this);
 9646     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9647                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9648   %}
 9649   ins_pipe( pipe_slow );
 9650 %}
 9651 
 9652 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9653 
 9654 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9655   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9656                                               Matcher::vector_length_in_bytes(n->in(1))));
 9657   match(Set dst (CountLeadingZerosV src));
 9658   format %{ "vector_count_leading_zeros $dst, $src" %}
 9659   ins_encode %{
 9660      int vlen_enc = vector_length_encoding(this, $src);
 9661      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9662      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9663                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9664   %}
 9665   ins_pipe( pipe_slow );
 9666 %}
 9667 
 9668 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9669   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9670                                               Matcher::vector_length_in_bytes(n->in(1))));
 9671   match(Set dst (CountLeadingZerosV src mask));
 9672   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9673   ins_encode %{
 9674     int vlen_enc = vector_length_encoding(this, $src);
 9675     BasicType bt = Matcher::vector_element_basic_type(this, $src);
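          // Merge-masked lzcnt: copy src into dst first so unselected lanes keep the source value.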
 9676     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9677     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9678                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9679   %}
 9680   ins_pipe( pipe_slow );
 9681 %}
 9682 
 9683 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9684   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9685             VM_Version::supports_avx512cd() &&
 9686             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9687   match(Set dst (CountLeadingZerosV src));
 9688   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9689   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9690   ins_encode %{
 9691     int vlen_enc = vector_length_encoding(this, $src);
 9692     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9693     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9694                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9695   %}
 9696   ins_pipe( pipe_slow );
 9697 %}
 9698 
 9699 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9700   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9701   match(Set dst (CountLeadingZerosV src));
 9702   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9703   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9704   ins_encode %{
 9705     int vlen_enc = vector_length_encoding(this, $src);
 9706     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9707     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9708                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9709                                        $rtmp$$Register, true, vlen_enc);
 9710   %}
 9711   ins_pipe( pipe_slow );
 9712 %}
 9713 
 9714 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9715   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9716             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9717   match(Set dst (CountLeadingZerosV src));
 9718   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9719   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9720   ins_encode %{
 9721     int vlen_enc = vector_length_encoding(this, $src);
 9722     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9723     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9724                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9725   %}
 9726   ins_pipe( pipe_slow );
 9727 %}
 9728 
 9729 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9730   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9731             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9732   match(Set dst (CountLeadingZerosV src));
 9733   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9734   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9735   ins_encode %{
 9736     int vlen_enc = vector_length_encoding(this, $src);
 9737     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9738     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9739                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9740   %}
 9741   ins_pipe( pipe_slow );
 9742 %}
 9743 
 9744 // ---------------------------------- Vector Masked Operations ------------------------------------
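//
// The instructs below supply AVX-512 opmask (kReg) predicated variants of the
// vector arithmetic, logical, shift, min/max and rearrange operations. They all
// match the ideal pattern (Op (Binary dst src2) mask) and dispatch through
// C2_MacroAssembler::evmasked_op(), which selects the EVEX encoding from the
// ideal opcode and element type; the boolean argument is the merge flag, so
// passing true keeps the prior dst value in lanes whose mask bit is clear.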
 9745 
 9746 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9747   match(Set dst (AddVB (Binary dst src2) mask));
 9748   match(Set dst (AddVS (Binary dst src2) mask));
 9749   match(Set dst (AddVI (Binary dst src2) mask));
 9750   match(Set dst (AddVL (Binary dst src2) mask));
 9751   match(Set dst (AddVF (Binary dst src2) mask));
 9752   match(Set dst (AddVD (Binary dst src2) mask));
 9753   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9754   ins_encode %{
 9755     int vlen_enc = vector_length_encoding(this);
 9756     BasicType bt = Matcher::vector_element_basic_type(this);
 9757     int opc = this->ideal_Opcode();
 9758     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9759                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9760   %}
 9761   ins_pipe( pipe_slow );
 9762 %}
 9763 
 9764 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9765   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9766   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9767   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9768   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9769   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9770   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9771   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9772   ins_encode %{
 9773     int vlen_enc = vector_length_encoding(this);
 9774     BasicType bt = Matcher::vector_element_basic_type(this);
 9775     int opc = this->ideal_Opcode();
 9776     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9777                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9778   %}
 9779   ins_pipe( pipe_slow );
 9780 %}
 9781 
 9782 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9783   match(Set dst (XorV (Binary dst src2) mask));
 9784   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9785   ins_encode %{
 9786     int vlen_enc = vector_length_encoding(this);
 9787     BasicType bt = Matcher::vector_element_basic_type(this);
 9788     int opc = this->ideal_Opcode();
 9789     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9790                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9791   %}
 9792   ins_pipe( pipe_slow );
 9793 %}
 9794 
 9795 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9796   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9797   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9798   ins_encode %{
 9799     int vlen_enc = vector_length_encoding(this);
 9800     BasicType bt = Matcher::vector_element_basic_type(this);
 9801     int opc = this->ideal_Opcode();
 9802     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9803                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9804   %}
 9805   ins_pipe( pipe_slow );
 9806 %}
 9807 
 9808 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9809   match(Set dst (OrV (Binary dst src2) mask));
 9810   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9811   ins_encode %{
 9812     int vlen_enc = vector_length_encoding(this);
 9813     BasicType bt = Matcher::vector_element_basic_type(this);
 9814     int opc = this->ideal_Opcode();
 9815     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9816                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9817   %}
 9818   ins_pipe( pipe_slow );
 9819 %}
 9820 
 9821 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9822   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9823   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9824   ins_encode %{
 9825     int vlen_enc = vector_length_encoding(this);
 9826     BasicType bt = Matcher::vector_element_basic_type(this);
 9827     int opc = this->ideal_Opcode();
 9828     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9829                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9830   %}
 9831   ins_pipe( pipe_slow );
 9832 %}
 9833 
 9834 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9835   match(Set dst (AndV (Binary dst src2) mask));
 9836   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9837   ins_encode %{
 9838     int vlen_enc = vector_length_encoding(this);
 9839     BasicType bt = Matcher::vector_element_basic_type(this);
 9840     int opc = this->ideal_Opcode();
 9841     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9842                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9843   %}
 9844   ins_pipe( pipe_slow );
 9845 %}
 9846 
 9847 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9848   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9849   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9850   ins_encode %{
 9851     int vlen_enc = vector_length_encoding(this);
 9852     BasicType bt = Matcher::vector_element_basic_type(this);
 9853     int opc = this->ideal_Opcode();
 9854     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9855                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9856   %}
 9857   ins_pipe( pipe_slow );
 9858 %}
 9859 
 9860 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9861   match(Set dst (SubVB (Binary dst src2) mask));
 9862   match(Set dst (SubVS (Binary dst src2) mask));
 9863   match(Set dst (SubVI (Binary dst src2) mask));
 9864   match(Set dst (SubVL (Binary dst src2) mask));
 9865   match(Set dst (SubVF (Binary dst src2) mask));
 9866   match(Set dst (SubVD (Binary dst src2) mask));
 9867   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9868   ins_encode %{
 9869     int vlen_enc = vector_length_encoding(this);
 9870     BasicType bt = Matcher::vector_element_basic_type(this);
 9871     int opc = this->ideal_Opcode();
 9872     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9873                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9874   %}
 9875   ins_pipe( pipe_slow );
 9876 %}
 9877 
 9878 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9879   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9880   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9881   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9882   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9883   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9884   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9885   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9886   ins_encode %{
 9887     int vlen_enc = vector_length_encoding(this);
 9888     BasicType bt = Matcher::vector_element_basic_type(this);
 9889     int opc = this->ideal_Opcode();
 9890     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9891                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9892   %}
 9893   ins_pipe( pipe_slow );
 9894 %}
 9895 
 9896 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9897   match(Set dst (MulVS (Binary dst src2) mask));
 9898   match(Set dst (MulVI (Binary dst src2) mask));
 9899   match(Set dst (MulVL (Binary dst src2) mask));
 9900   match(Set dst (MulVF (Binary dst src2) mask));
 9901   match(Set dst (MulVD (Binary dst src2) mask));
 9902   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9903   ins_encode %{
 9904     int vlen_enc = vector_length_encoding(this);
 9905     BasicType bt = Matcher::vector_element_basic_type(this);
 9906     int opc = this->ideal_Opcode();
 9907     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9908                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9909   %}
 9910   ins_pipe( pipe_slow );
 9911 %}
 9912 
 9913 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9914   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9915   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9916   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9917   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9918   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9919   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9920   ins_encode %{
 9921     int vlen_enc = vector_length_encoding(this);
 9922     BasicType bt = Matcher::vector_element_basic_type(this);
 9923     int opc = this->ideal_Opcode();
 9924     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9925                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9926   %}
 9927   ins_pipe( pipe_slow );
 9928 %}
 9929 
 9930 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9931   match(Set dst (SqrtVF dst mask));
 9932   match(Set dst (SqrtVD dst mask));
 9933   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9934   ins_encode %{
 9935     int vlen_enc = vector_length_encoding(this);
 9936     BasicType bt = Matcher::vector_element_basic_type(this);
 9937     int opc = this->ideal_Opcode();
 9938     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9939                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9940   %}
 9941   ins_pipe( pipe_slow );
 9942 %}
 9943 
 9944 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9945   match(Set dst (DivVF (Binary dst src2) mask));
 9946   match(Set dst (DivVD (Binary dst src2) mask));
 9947   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9948   ins_encode %{
 9949     int vlen_enc = vector_length_encoding(this);
 9950     BasicType bt = Matcher::vector_element_basic_type(this);
 9951     int opc = this->ideal_Opcode();
 9952     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9953                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9954   %}
 9955   ins_pipe( pipe_slow );
 9956 %}
 9957 
 9958 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9959   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9960   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9961   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9962   ins_encode %{
 9963     int vlen_enc = vector_length_encoding(this);
 9964     BasicType bt = Matcher::vector_element_basic_type(this);
 9965     int opc = this->ideal_Opcode();
 9966     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9967                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9968   %}
 9969   ins_pipe( pipe_slow );
 9970 %}
 9971 
 9972 
 9973 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9974   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9975   match(Set dst (RotateRightV (Binary dst shift) mask));
 9976   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9977   ins_encode %{
 9978     int vlen_enc = vector_length_encoding(this);
 9979     BasicType bt = Matcher::vector_element_basic_type(this);
 9980     int opc = this->ideal_Opcode();
 9981     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9982                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9983   %}
 9984   ins_pipe( pipe_slow );
 9985 %}
 9986 
 9987 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9988   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9989   match(Set dst (RotateRightV (Binary dst src2) mask));
 9990   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9991   ins_encode %{
 9992     int vlen_enc = vector_length_encoding(this);
 9993     BasicType bt = Matcher::vector_element_basic_type(this);
 9994     int opc = this->ideal_Opcode();
 9995     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9996                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9997   %}
 9998   ins_pipe( pipe_slow );
 9999 %}
10000 
10001 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10002   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
10003   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
10004   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
10005   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
10006   ins_encode %{
10007     int vlen_enc = vector_length_encoding(this);
10008     BasicType bt = Matcher::vector_element_basic_type(this);
10009     int opc = this->ideal_Opcode();
10010     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10011                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10012   %}
10013   ins_pipe( pipe_slow );
10014 %}
10015 
10016 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
10017   predicate(!n->as_ShiftV()->is_var_shift());
10018   match(Set dst (LShiftVS (Binary dst src2) mask));
10019   match(Set dst (LShiftVI (Binary dst src2) mask));
10020   match(Set dst (LShiftVL (Binary dst src2) mask));
10021   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10022   ins_encode %{
10023     int vlen_enc = vector_length_encoding(this);
10024     BasicType bt = Matcher::vector_element_basic_type(this);
10025     int opc = this->ideal_Opcode();
10026     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10027                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10028   %}
10029   ins_pipe( pipe_slow );
10030 %}
10031 
10032 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10033   predicate(n->as_ShiftV()->is_var_shift());
10034   match(Set dst (LShiftVS (Binary dst src2) mask));
10035   match(Set dst (LShiftVI (Binary dst src2) mask));
10036   match(Set dst (LShiftVL (Binary dst src2) mask));
10037   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
10038   ins_encode %{
10039     int vlen_enc = vector_length_encoding(this);
10040     BasicType bt = Matcher::vector_element_basic_type(this);
10041     int opc = this->ideal_Opcode();
10042     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10043                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10049   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
10050   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
10051   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
10052   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
10053   ins_encode %{
10054     int vlen_enc = vector_length_encoding(this);
10055     BasicType bt = Matcher::vector_element_basic_type(this);
10056     int opc = this->ideal_Opcode();
10057     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10058                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10059   %}
10060   ins_pipe( pipe_slow );
10061 %}
10062 
10063 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
10064   predicate(!n->as_ShiftV()->is_var_shift());
10065   match(Set dst (RShiftVS (Binary dst src2) mask));
10066   match(Set dst (RShiftVI (Binary dst src2) mask));
10067   match(Set dst (RShiftVL (Binary dst src2) mask));
10068   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10069   ins_encode %{
10070     int vlen_enc = vector_length_encoding(this);
10071     BasicType bt = Matcher::vector_element_basic_type(this);
10072     int opc = this->ideal_Opcode();
10073     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10074                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10075   %}
10076   ins_pipe( pipe_slow );
10077 %}
10078 
10079 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10080   predicate(n->as_ShiftV()->is_var_shift());
10081   match(Set dst (RShiftVS (Binary dst src2) mask));
10082   match(Set dst (RShiftVI (Binary dst src2) mask));
10083   match(Set dst (RShiftVL (Binary dst src2) mask));
10084   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
10085   ins_encode %{
10086     int vlen_enc = vector_length_encoding(this);
10087     BasicType bt = Matcher::vector_element_basic_type(this);
10088     int opc = this->ideal_Opcode();
10089     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10090                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10091   %}
10092   ins_pipe( pipe_slow );
10093 %}
10094 
10095 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
10096   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
10097   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
10098   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
10099   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
10100   ins_encode %{
10101     int vlen_enc = vector_length_encoding(this);
10102     BasicType bt = Matcher::vector_element_basic_type(this);
10103     int opc = this->ideal_Opcode();
10104     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10105                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
10106   %}
10107   ins_pipe( pipe_slow );
10108 %}
10109 
10110 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
10111   predicate(!n->as_ShiftV()->is_var_shift());
10112   match(Set dst (URShiftVS (Binary dst src2) mask));
10113   match(Set dst (URShiftVI (Binary dst src2) mask));
10114   match(Set dst (URShiftVL (Binary dst src2) mask));
10115   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10116   ins_encode %{
10117     int vlen_enc = vector_length_encoding(this);
10118     BasicType bt = Matcher::vector_element_basic_type(this);
10119     int opc = this->ideal_Opcode();
10120     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10121                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
10122   %}
10123   ins_pipe( pipe_slow );
10124 %}
10125 
10126 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
10127   predicate(n->as_ShiftV()->is_var_shift());
10128   match(Set dst (URShiftVS (Binary dst src2) mask));
10129   match(Set dst (URShiftVI (Binary dst src2) mask));
10130   match(Set dst (URShiftVL (Binary dst src2) mask));
10131   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
10132   ins_encode %{
10133     int vlen_enc = vector_length_encoding(this);
10134     BasicType bt = Matcher::vector_element_basic_type(this);
10135     int opc = this->ideal_Opcode();
10136     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10137                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
10138   %}
10139   ins_pipe( pipe_slow );
10140 %}
10141 
10142 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10143   match(Set dst (MaxV (Binary dst src2) mask));
10144   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10145   ins_encode %{
10146     int vlen_enc = vector_length_encoding(this);
10147     BasicType bt = Matcher::vector_element_basic_type(this);
10148     int opc = this->ideal_Opcode();
10149     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10150                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10151   %}
10152   ins_pipe( pipe_slow );
10153 %}
10154 
10155 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10156   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10157   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10158   ins_encode %{
10159     int vlen_enc = vector_length_encoding(this);
10160     BasicType bt = Matcher::vector_element_basic_type(this);
10161     int opc = this->ideal_Opcode();
10162     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10163                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10164   %}
10165   ins_pipe( pipe_slow );
10166 %}
10167 
10168 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10169   match(Set dst (MinV (Binary dst src2) mask));
10170   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10171   ins_encode %{
10172     int vlen_enc = vector_length_encoding(this);
10173     BasicType bt = Matcher::vector_element_basic_type(this);
10174     int opc = this->ideal_Opcode();
10175     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10176                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10177   %}
10178   ins_pipe( pipe_slow );
10179 %}
10180 
10181 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10182   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10183   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10184   ins_encode %{
10185     int vlen_enc = vector_length_encoding(this);
10186     BasicType bt = Matcher::vector_element_basic_type(this);
10187     int opc = this->ideal_Opcode();
10188     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10189                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10190   %}
10191   ins_pipe( pipe_slow );
10192 %}
10193 
10194 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10195   match(Set dst (VectorRearrange (Binary dst src2) mask));
10196   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10197   ins_encode %{
10198     int vlen_enc = vector_length_encoding(this);
10199     BasicType bt = Matcher::vector_element_basic_type(this);
10200     int opc = this->ideal_Opcode();
10201     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10202                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10203   %}
10204   ins_pipe( pipe_slow );
10205 %}
10206 
10207 instruct vabs_masked(vec dst, kReg mask) %{
10208   match(Set dst (AbsVB dst mask));
10209   match(Set dst (AbsVS dst mask));
10210   match(Set dst (AbsVI dst mask));
10211   match(Set dst (AbsVL dst mask));
10212   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10213   ins_encode %{
10214     int vlen_enc = vector_length_encoding(this);
10215     BasicType bt = Matcher::vector_element_basic_type(this);
10216     int opc = this->ideal_Opcode();
10217     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10218                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10219   %}
10220   ins_pipe( pipe_slow );
10221 %}
10222 
10223 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10224   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10225   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10226   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10227   ins_encode %{
10228     assert(UseFMA, "Needs FMA instructions support.");
10229     int vlen_enc = vector_length_encoding(this);
10230     BasicType bt = Matcher::vector_element_basic_type(this);
10231     int opc = this->ideal_Opcode();
10232     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10233                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 
10238 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10239   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10240   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10241   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10242   ins_encode %{
10243     assert(UseFMA, "Needs FMA instructions support.");
10244     int vlen_enc = vector_length_encoding(this);
10245     BasicType bt = Matcher::vector_element_basic_type(this);
10246     int opc = this->ideal_Opcode();
10247     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10248                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10249   %}
10250   ins_pipe( pipe_slow );
10251 %}
10252 
10253 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10254   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10255   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10256   ins_encode %{
10257     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10258     int vlen_enc = vector_length_encoding(this, $src1);
10259     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10260 
    // Compare src1 and src2 under the incoming mask, selecting the comparison encoding by element type.
10262     switch (src1_elem_bt) {
10263       case T_BYTE: {
10264         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10265         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10266         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10267         break;
10268       }
10269       case T_SHORT: {
10270         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10271         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10272         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10273         break;
10274       }
10275       case T_INT: {
10276         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10277         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10278         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10279         break;
10280       }
10281       case T_LONG: {
10282         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
10283         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10284         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10285         break;
10286       }
10287       case T_FLOAT: {
10288         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10289         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10290         break;
10291       }
10292       case T_DOUBLE: {
10293         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10294         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10295         break;
10296       }
10297       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10298     }
10299   %}
10300   ins_pipe( pipe_slow );
10301 %}
10302 
10303 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10304   predicate(Matcher::vector_length(n) <= 32);
10305   match(Set dst (MaskAll src));
10306   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
10307   ins_encode %{
10308     int mask_len = Matcher::vector_length(this);
10309     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10310   %}
10311   ins_pipe( pipe_slow );
10312 %}
10313 
10314 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10315   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10316   match(Set dst (XorVMask src (MaskAll cnt)));
10317   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10318   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
10319   ins_encode %{
10320     uint masklen = Matcher::vector_length(this);
10321     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10322   %}
10323   ins_pipe( pipe_slow );
10324 %}
10325 
10326 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10327   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10328             (Matcher::vector_length(n) == 16) ||
10329             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10330   match(Set dst (XorVMask src (MaskAll cnt)));
10331   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10332   ins_encode %{
10333     uint masklen = Matcher::vector_length(this);
10334     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10335   %}
10336   ins_pipe( pipe_slow );
10337 %}
10338 
10339 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10340   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) <= 8);
10341   match(Set dst (VectorLongToMask src));
10342   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10343   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10344   ins_encode %{
10345     int mask_len = Matcher::vector_length(this);
10346     int vec_enc  = vector_length_encoding(mask_len);
10347     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10348                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10349   %}
10350   ins_pipe( pipe_slow );
10351 %}
10352 
10353 
10354 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10355   predicate(n->bottom_type()->isa_vectmask() == nullptr && Matcher::vector_length(n) > 8);
10356   match(Set dst (VectorLongToMask src));
10357   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10358   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
10359   ins_encode %{
10360     int mask_len = Matcher::vector_length(this);
10361     assert(mask_len <= 32, "invalid mask length");
10362     int vec_enc  = vector_length_encoding(mask_len);
10363     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10364                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10365   %}
10366   ins_pipe( pipe_slow );
10367 %}
10368 
10369 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10370   predicate(n->bottom_type()->isa_vectmask());
10371   match(Set dst (VectorLongToMask src));
10372   format %{ "long_to_mask_evex $dst, $src\t!" %}
10373   ins_encode %{
10374     __ kmov($dst$$KRegister, $src$$Register);
10375   %}
10376   ins_pipe( pipe_slow );
10377 %}
10378 
10379 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10380   match(Set dst (AndVMask src1 src2));
10381   match(Set dst (OrVMask src1 src2));
10382   match(Set dst (XorVMask src1 src2));
10383   effect(TEMP kscratch);
10384   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10385   ins_encode %{
10386     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10387     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10388     assert(Type::equals(mask1->bottom_type(), mask2->bottom_type()), "Mask types must be equal");
10389     uint masklen = Matcher::vector_length(this);
10390     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10391     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10392   %}
10393   ins_pipe( pipe_slow );
10394 %}
10395 
10396 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10397   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10398   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10399   ins_encode %{
10400     int vlen_enc = vector_length_encoding(this);
10401     BasicType bt = Matcher::vector_element_basic_type(this);
10402     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10403                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10404   %}
10405   ins_pipe( pipe_slow );
10406 %}
10407 
10408 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10409   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10410   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10411   ins_encode %{
10412     int vlen_enc = vector_length_encoding(this);
10413     BasicType bt = Matcher::vector_element_basic_type(this);
10414     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10415                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10416   %}
10417   ins_pipe( pipe_slow );
10418 %}
10419 
10420 instruct castMM(kReg dst)
10421 %{
10422   match(Set dst (CastVV dst));
10423 
10424   size(0);
10425   format %{ "# castVV of $dst" %}
10426   ins_encode(/* empty encoding */);
10427   ins_cost(0);
10428   ins_pipe(empty);
10429 %}
10430 
10431 instruct castVV(vec dst)
10432 %{
10433   match(Set dst (CastVV dst));
10434 
10435   size(0);
10436   format %{ "# castVV of $dst" %}
10437   ins_encode(/* empty encoding */);
10438   ins_cost(0);
10439   ins_pipe(empty);
10440 %}
10441 
10442 instruct castVVLeg(legVec dst)
10443 %{
10444   match(Set dst (CastVV dst));
10445 
10446   size(0);
10447   format %{ "# castVV of $dst" %}
10448   ins_encode(/* empty encoding */);
10449   ins_cost(0);
10450   ins_pipe(empty);
10451 %}
10452 
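// IsInfinite checks use VFPCLASS with immediate 0x18, which selects the
// positive-infinity (bit 3) and negative-infinity (bit 4) classes; the
// resulting opmask bit is then copied into the integer destination.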
10453 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10454 %{
10455   match(Set dst (IsInfiniteF src));
10456   effect(TEMP ktmp, KILL cr);
10457   format %{ "float_class_check $dst, $src" %}
10458   ins_encode %{
10459     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10460     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10461   %}
10462   ins_pipe(pipe_slow);
10463 %}
10464 
10465 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10466 %{
10467   match(Set dst (IsInfiniteD src));
10468   effect(TEMP ktmp, KILL cr);
10469   format %{ "double_class_check $dst, $src" %}
10470   ins_encode %{
10471     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10472     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10473   %}
10474   ins_pipe(pipe_slow);
10475 %}
10476 
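// Saturating add/sub. Byte and short lanes map directly onto the packed
// saturating instructions (signed or unsigned, register or memory operand).
// Int and long lanes have no native saturating forms, so they are expanded by
// helper sequences: the EVEX variants use opmask temporaries, the AVX variants
// use additional XMM temporaries.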
10477 instruct vector_addsub_saturating_subword_reg(vec dst, vec src1, vec src2)
10478 %{
10479   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10480             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10481   match(Set dst (SaturatingAddV src1 src2));
10482   match(Set dst (SaturatingSubV src1 src2));
10483   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10484   ins_encode %{
10485     int vlen_enc = vector_length_encoding(this);
10486     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10487     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10488                             $src1$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10489   %}
10490   ins_pipe(pipe_slow);
10491 %}
10492 
10493 instruct vector_addsub_saturating_unsigned_subword_reg(vec dst, vec src1, vec src2)
10494 %{
10495   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10496             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10497   match(Set dst (SaturatingAddV src1 src2));
10498   match(Set dst (SaturatingSubV src1 src2));
10499   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10500   ins_encode %{
10501     int vlen_enc = vector_length_encoding(this);
10502     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10503     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10504                             $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10505   %}
10506   ins_pipe(pipe_slow);
10507 %}
10508 
10509 instruct vector_addsub_saturating_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2)
10510 %{
10511   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10512             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10513             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10514   match(Set dst (SaturatingAddV src1 src2));
10515   match(Set dst (SaturatingSubV src1 src2));
10516   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2);
10517   format %{ "vector_addsub_saturating_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
10518   ins_encode %{
10519     int vlen_enc = vector_length_encoding(this);
10520     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10521     __ vector_addsub_dq_saturating_evex(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10522                                         $src1$$XMMRegister, $src2$$XMMRegister,
10523                                         $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10524                                         $ktmp1$$KRegister, $ktmp2$$KRegister, vlen_enc);
10525   %}
10526   ins_pipe(pipe_slow);
10527 %}
10528 
10529 instruct vector_addsub_saturating_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4)
10530 %{
10531   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10532             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned() &&
10533             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10534   match(Set dst (SaturatingAddV src1 src2));
10535   match(Set dst (SaturatingSubV src1 src2));
10536   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4);
10537   format %{ "vector_addsub_saturating_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
10538   ins_encode %{
10539     int vlen_enc = vector_length_encoding(this);
10540     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10541     __ vector_addsub_dq_saturating_avx(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10542                                        $src2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
10543                                        $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, vlen_enc);
10544   %}
10545   ins_pipe(pipe_slow);
10546 %}
10547 
10548 instruct vector_add_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, kReg ktmp)
10549 %{
10550   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10551             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10552             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10553   match(Set dst (SaturatingAddV src1 src2));
10554   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp);
10555   format %{ "vector_add_saturating_unsigned_evex $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $ktmp as TEMP" %}
10556   ins_encode %{
10557     int vlen_enc = vector_length_encoding(this);
10558     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10559     __ vector_add_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10560                                               $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10561   %}
10562   ins_pipe(pipe_slow);
10563 %}
10564 
10565 instruct vector_add_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2, vec xtmp3)
10566 %{
10567   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10568             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10569             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10570   match(Set dst (SaturatingAddV src1 src2));
10571   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
10572   format %{ "vector_add_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
10573   ins_encode %{
10574     int vlen_enc = vector_length_encoding(this);
10575     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10576     __ vector_add_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10577                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, vlen_enc);
10578   %}
10579   ins_pipe(pipe_slow);
10580 %}
10581 
10582 instruct vector_sub_saturating_unsigned_reg_evex(vec dst, vec src1, vec src2, kReg ktmp)
10583 %{
10584   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10585             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10586             (Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()));
10587   match(Set dst (SaturatingSubV src1 src2));
10588   effect(TEMP ktmp);
10589   format %{ "vector_sub_saturating_unsigned_evex $dst, $src1, $src2 \t! using $ktmp as TEMP" %}
10590   ins_encode %{
10591     int vlen_enc = vector_length_encoding(this);
10592     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10593     __ vector_sub_dq_saturating_unsigned_evex(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister,
10594                                               $src2$$XMMRegister, $ktmp$$KRegister, vlen_enc);
10595   %}
10596   ins_pipe(pipe_slow);
10597 %}
10598 
10599 instruct vector_sub_saturating_unsigned_reg_avx(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2)
10600 %{
10601   predicate(!is_subword_type(Matcher::vector_element_basic_type(n)) &&
10602             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned() &&
10603             Matcher::vector_length_in_bytes(n) <= 32 && !VM_Version::supports_avx512vl());
10604   match(Set dst (SaturatingSubV src1 src2));
10605   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
10606   format %{ "vector_sub_saturating_unsigned_avx $dst, $src1, $src2 \t! using $xtmp1 and $xtmp2 as TEMP" %}
10607   ins_encode %{
10608     int vlen_enc = vector_length_encoding(this);
10609     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10610     __ vector_sub_dq_saturating_unsigned_avx(elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
10611                                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10612   %}
10613   ins_pipe(pipe_slow);
10614 %}
10615 
10616 instruct vector_addsub_saturating_subword_mem(vec dst, vec src1, memory src2)
10617 %{
10618   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10619             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10620   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10621   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10622   format %{ "vector_addsub_saturating_subword $dst, $src1, $src2" %}
10623   ins_encode %{
10624     int vlen_enc = vector_length_encoding(this);
10625     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10626     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10627                             $src1$$XMMRegister, $src2$$Address, false, vlen_enc);
10628   %}
10629   ins_pipe(pipe_slow);
10630 %}
10631 
10632 instruct vector_addsub_saturating_unsigned_subword_mem(vec dst, vec src1, memory src2)
10633 %{
10634   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10635             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10636   match(Set dst (SaturatingAddV src1 (LoadVector src2)));
10637   match(Set dst (SaturatingSubV src1 (LoadVector src2)));
10638   format %{ "vector_addsub_saturating_unsigned_subword $dst, $src1, $src2" %}
10639   ins_encode %{
10640     int vlen_enc = vector_length_encoding(this);
10641     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10642     __ vector_saturating_op(this->ideal_Opcode(), elem_bt, $dst$$XMMRegister,
10643                             $src1$$XMMRegister, $src2$$Address, true, vlen_enc);
10644   %}
10645   ins_pipe(pipe_slow);
10646 %}
10647 
10648 instruct vector_addsub_saturating_subword_masked_reg(vec dst, vec src, kReg mask) %{
10649   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10650             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10651   match(Set dst (SaturatingAddV (Binary dst src) mask));
10652   match(Set dst (SaturatingSubV (Binary dst src) mask));
10653   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10654   ins_encode %{
10655     int vlen_enc = vector_length_encoding(this);
10656     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10657     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10658                               $dst$$XMMRegister, $src$$XMMRegister, false, true, vlen_enc);
10659   %}
10660   ins_pipe( pipe_slow );
10661 %}
10662 
10663 instruct vector_addsub_saturating_unsigned_subword_masked_reg(vec dst, vec src, kReg mask) %{
10664   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10665             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10666   match(Set dst (SaturatingAddV (Binary dst src) mask));
10667   match(Set dst (SaturatingSubV (Binary dst src) mask));
10668   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10669   ins_encode %{
10670     int vlen_enc = vector_length_encoding(this);
10671     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10672     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10673                               $dst$$XMMRegister, $src$$XMMRegister, true, true, vlen_enc);
10674   %}
10675   ins_pipe( pipe_slow );
10676 %}
10677 
10678 instruct vector_addsub_saturating_subword_masked_mem(vec dst, memory src, kReg mask) %{
10679   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10680             n->is_SaturatingVector() && !n->as_SaturatingVector()->is_unsigned());
10681   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10682   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10683   format %{ "vector_addsub_saturating_subword_masked $dst, $mask, $src" %}
10684   ins_encode %{
10685     int vlen_enc = vector_length_encoding(this);
10686     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10687     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10688                               $dst$$XMMRegister, $src$$Address, false, true, vlen_enc);
10689   %}
10690   ins_pipe( pipe_slow );
10691 %}
10692 
10693 instruct vector_addsub_saturating_unsigned_subword_masked_mem(vec dst, memory src, kReg mask) %{
10694   predicate(is_subword_type(Matcher::vector_element_basic_type(n)) &&
10695             n->is_SaturatingVector() && n->as_SaturatingVector()->is_unsigned());
10696   match(Set dst (SaturatingAddV (Binary dst (LoadVector src)) mask));
10697   match(Set dst (SaturatingSubV (Binary dst (LoadVector src)) mask));
10698   format %{ "vector_addsub_saturating_unsigned_subword_masked $dst, $mask, $src" %}
10699   ins_encode %{
10700     int vlen_enc = vector_length_encoding(this);
10701     BasicType elem_bt = Matcher::vector_element_basic_type(this);
10702     __ evmasked_saturating_op(this->ideal_Opcode(), elem_bt, $mask$$KRegister, $dst$$XMMRegister,
10703                               $dst$$XMMRegister, $src$$Address, true, true, vlen_enc);
10704   %}
10705   ins_pipe( pipe_slow );
10706 %}
10707 
10708 instruct vector_selectfrom_twovectors_reg_evex(vec index, vec src1, vec src2)
10709 %{
10710   match(Set index (SelectFromTwoVector (Binary index src1) src2));
10711   format %{ "select_from_two_vector $index, $src1, $src2 \t!" %}
10712   ins_encode %{
10713     int vlen_enc = vector_length_encoding(this);
10714     BasicType bt = Matcher::vector_element_basic_type(this);
10715     __ select_from_two_vectors_evex(bt, $index$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10716   %}
10717   ins_pipe(pipe_slow);
10718 %}
10719 
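// Scalar half-precision (FP16) operations. ReinterpretS2HF/ReinterpretHF2S move
// the 16-bit payload between a GPR and an XMM register with vmovw, and the fused
// patterns below (convF2HFAndS2HF, convHF2SAndHF2F) combine a float<->half
// conversion with the reinterpret so the value does not bounce through a
// general purpose register.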
10720 instruct reinterpretS2HF(regF dst, rRegI src)
10721 %{
10722   match(Set dst (ReinterpretS2HF src));
10723   format %{ "vmovw $dst, $src" %}
10724   ins_encode %{
10725     __ vmovw($dst$$XMMRegister, $src$$Register);
10726   %}
10727   ins_pipe(pipe_slow);
10728 %}
10729 
10730 instruct reinterpretHF2S(rRegI dst, regF src)
10731 %{
10732   match(Set dst (ReinterpretHF2S src));
10733   format %{ "vmovw $dst, $src" %}
10734   ins_encode %{
10735     __ vmovw($dst$$Register, $src$$XMMRegister);
10736   %}
10737   ins_pipe(pipe_slow);
10738 %}
10739 
10740 instruct convF2HFAndS2HF(regF dst, regF src)
10741 %{
10742   match(Set dst (ReinterpretS2HF (ConvF2HF src)));
10743   format %{ "convF2HFAndS2HF $dst, $src" %}
10744   ins_encode %{
10745     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
10746   %}
10747   ins_pipe(pipe_slow);
10748 %}
10749 
10750 instruct convHF2SAndHF2F(regF dst, regF src)
10751 %{
10752   match(Set dst (ConvHF2F (ReinterpretHF2S src)));
10753   format %{ "convHF2SAndHF2F $dst, $src" %}
10754   ins_encode %{
10755     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, Assembler::AVX_128bit);
10756   %}
10757   ins_pipe(pipe_slow);
10758 %}
10759 
10760 instruct scalar_sqrt_HF_reg(regF dst, regF src)
10761 %{
10762   match(Set dst (SqrtHF src));
10763   format %{ "scalar_sqrt_fp16 $dst, $src" %}
10764   ins_encode %{
10765     __ vsqrtsh($dst$$XMMRegister, $src$$XMMRegister);
10766   %}
10767   ins_pipe(pipe_slow);
10768 %}
10769 
10770 instruct scalar_binOps_HF_reg(regF dst, regF src1, regF src2)
10771 %{
10772   match(Set dst (AddHF src1 src2));
10773   match(Set dst (DivHF src1 src2));
10774   match(Set dst (MulHF src1 src2));
10775   match(Set dst (SubHF src1 src2));
10776   format %{ "scalar_binop_fp16 $dst, $src1, $src2" %}
10777   ins_encode %{
10778     int opcode = this->ideal_Opcode();
10779     __ efp16sh(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
10780   %}
10781   ins_pipe(pipe_slow);
10782 %}
10783 
10784 instruct scalar_minmax_HF_avx10_reg(regF dst, regF src1, regF src2)
10785 %{
10786   predicate(VM_Version::supports_avx10_2());
10787   match(Set dst (MaxHF src1 src2));
10788   match(Set dst (MinHF src1 src2));
10789   format %{ "scalar_min_max_fp16 $dst, $src1, $src2" %}
10790   ins_encode %{
10791     int function = this->ideal_Opcode() == Op_MinHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10792     __ eminmaxsh($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, function);
10793   %}
10794   ins_pipe( pipe_slow );
10795 %}
10796 
10797 instruct scalar_minmax_HF_reg(regF dst, regF src1, regF src2, kReg ktmp, regF xtmp1, regF xtmp2)
10798 %{
10799   predicate(!VM_Version::supports_avx10_2());
10800   match(Set dst (MaxHF src1 src2));
10801   match(Set dst (MinHF src1 src2));
10802   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10803   format %{ "scalar_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10804   ins_encode %{
10805     int opcode = this->ideal_Opcode();
10806     __ scalar_max_min_fp16(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $ktmp$$KRegister,
10807                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister);
10808   %}
10809   ins_pipe( pipe_slow );
10810 %}
10811 
10812 instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
10813 %{
10814   match(Set dst (FmaHF  src2 (Binary dst src1)));
10815   effect(DEF dst);
10816   format %{ "scalar_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10817   ins_encode %{
10818     __ vfmadd132sh($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister);
10819   %}
10820   ins_pipe( pipe_slow );
10821 %}
10822 
10823 
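// Packed half-precision (FP16) operations. The memory-operand forms fold a
// (VectorReinterpret (LoadVector ...)) subtree, so the load can be used
// directly as an EVEX memory operand instead of going through a register.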
10824 instruct vector_sqrt_HF_reg(vec dst, vec src)
10825 %{
10826   match(Set dst (SqrtVHF src));
10827   format %{ "vector_sqrt_fp16 $dst, $src" %}
10828   ins_encode %{
10829     int vlen_enc = vector_length_encoding(this);
10830     __ evsqrtph($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
10831   %}
10832   ins_pipe(pipe_slow);
10833 %}
10834 
10835 instruct vector_sqrt_HF_mem(vec dst, memory src)
10836 %{
10837   match(Set dst (SqrtVHF (VectorReinterpret (LoadVector src))));
10838   format %{ "vector_sqrt_fp16_mem $dst, $src" %}
10839   ins_encode %{
10840     int vlen_enc = vector_length_encoding(this);
10841     __ evsqrtph($dst$$XMMRegister, $src$$Address, vlen_enc);
10842   %}
10843   ins_pipe(pipe_slow);
10844 %}
10845 
10846 instruct vector_binOps_HF_reg(vec dst, vec src1, vec src2)
10847 %{
10848   match(Set dst (AddVHF src1 src2));
10849   match(Set dst (DivVHF src1 src2));
10850   match(Set dst (MulVHF src1 src2));
10851   match(Set dst (SubVHF src1 src2));
10852   format %{ "vector_binop_fp16 $dst, $src1, $src2" %}
10853   ins_encode %{
10854     int vlen_enc = vector_length_encoding(this);
10855     int opcode = this->ideal_Opcode();
10856     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
10857   %}
10858   ins_pipe(pipe_slow);
10859 %}
10860 
10861 
10862 instruct vector_binOps_HF_mem(vec dst, vec src1, memory src2)
10863 %{
10864   match(Set dst (AddVHF src1 (VectorReinterpret (LoadVector src2))));
10865   match(Set dst (DivVHF src1 (VectorReinterpret (LoadVector src2))));
10866   match(Set dst (MulVHF src1 (VectorReinterpret (LoadVector src2))));
10867   match(Set dst (SubVHF src1 (VectorReinterpret (LoadVector src2))));
10868   format %{ "vector_binop_fp16_mem $dst, $src1, $src2" %}
10869   ins_encode %{
10870     int vlen_enc = vector_length_encoding(this);
10871     int opcode = this->ideal_Opcode();
10872     __ evfp16ph(opcode, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address, vlen_enc);
10873   %}
10874   ins_pipe(pipe_slow);
10875 %}
10876 
10877 instruct vector_fma_HF_reg(vec dst, vec src1, vec src2)
10878 %{
10879   match(Set dst (FmaVHF src2 (Binary dst src1)));
10880   format %{ "vector_fma_fp16 $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10881   ins_encode %{
10882     int vlen_enc = vector_length_encoding(this);
10883     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
10884   %}
10885   ins_pipe( pipe_slow );
10886 %}
10887 
10888 instruct vector_fma_HF_mem(vec dst, memory src1, vec src2)
10889 %{
10890   match(Set dst (FmaVHF src2 (Binary dst (VectorReinterpret (LoadVector src1)))));
10891   format %{ "vector_fma_fp16_mem $dst, $src1, $src2\t# $dst = $dst * $src1 + $src2 fma packedH" %}
10892   ins_encode %{
10893     int vlen_enc = vector_length_encoding(this);
10894     __ evfmadd132ph($dst$$XMMRegister, $src2$$XMMRegister, $src1$$Address, vlen_enc);
10895   %}
10896   ins_pipe( pipe_slow );
10897 %}
10898 
10899 instruct vector_minmax_HF_avx10_mem(vec dst, vec src1, memory src2)
10900 %{
10901   predicate(VM_Version::supports_avx10_2());
10902   match(Set dst (MinVHF src1 (VectorReinterpret (LoadVector src2))));
10903   match(Set dst (MaxVHF src1 (VectorReinterpret (LoadVector src2))));
10904   format %{ "vector_min_max_fp16_mem $dst, $src1, $src2" %}
10905   ins_encode %{
10906     int vlen_enc = vector_length_encoding(this);
    int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10908     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$Address, true, function, vlen_enc);
10909   %}
10910   ins_pipe( pipe_slow );
10911 %}
10912 
10913 instruct vector_minmax_HF_avx10_reg(vec dst, vec src1, vec src2)
10914 %{
10915   predicate(VM_Version::supports_avx10_2());
10916   match(Set dst (MinVHF src1 src2));
10917   match(Set dst (MaxVHF src1 src2));
10918   format %{ "vector_min_max_fp16 $dst, $src1, $src2" %}
10919   ins_encode %{
10920     int vlen_enc = vector_length_encoding(this);
    int function = this->ideal_Opcode() == Op_MinVHF ? AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
10922     __ evminmaxph($dst$$XMMRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, true, function, vlen_enc);
10923   %}
10924   ins_pipe( pipe_slow );
10925 %}
10926 
10927 instruct vector_minmax_HF_reg(vec dst, vec src1, vec src2, kReg ktmp, vec xtmp1, vec xtmp2)
10928 %{
10929   predicate(!VM_Version::supports_avx10_2());
10930   match(Set dst (MinVHF src1 src2));
10931   match(Set dst (MaxVHF src1 src2));
10932   effect(TEMP_DEF dst, TEMP ktmp, TEMP xtmp1, TEMP xtmp2);
10933   format %{ "vector_min_max_fp16 $dst, $src1, $src2\t using $ktmp, $xtmp1 and $xtmp2 as TEMP" %}
10934   ins_encode %{
10935     int vlen_enc = vector_length_encoding(this);
10936     int opcode = this->ideal_Opcode();
10937     __ vector_max_min_fp16(opcode, $dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $ktmp$$KRegister,
10938                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
10939   %}
10940   ins_pipe( pipe_slow );
10941 %}