//
// Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

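// As a worked illustration of the format above, read against the definitions
// that follow: the entry
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares a Save-On-Call float slice whose hardware encoding is 0, while the
// companion entries XMM0b..XMM0p name the remaining 32-bit slices of the same
// physical register via xmm0->as_VMReg()->next(1) .. next(15).
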
// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX), and
//      XMM16-XMM31 can only be encoded with EVEX.
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
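// K1-K7 are allocatable; K0 is intentionally left out, since an opmask value
// of 0 in the EVEX encoding means "no masking", so K0 cannot serve as a
// general predicate register.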
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());

alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
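// reg_class_dynamic selects between the two classes it names at run time: the
// first (EVEX) class is used when the guarding predicate holds, otherwise the
// second (legacy) class is used, so XMM16-XMM31 only become allocatable on
// EVEX-capable CPUs. The same pattern repeats for the double and vector
// register classes below.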

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched
    // to a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
 1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched
    // to a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
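// Map a vector length in bytes to the Assembler's AVX vector-length encoding.
// For example, 32 bytes maps to Assembler::AVX_256bit; the 4- and 8-byte cases
// share AVX_128bit since they fit within a single XMM register.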
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1250   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1251            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1252 }
 1253 
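// Platform-dependent node flags. These extend the shared Node flag space so that
// x86-specific analyses (the Intel jcc erratum mitigation and EFLAGS effect
// tracking) can tag machine nodes without clashing with the shared flag bits.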
 1254 class Node::PD {
 1255 public:
 1256   enum NodeFlags {
 1257     Flag_intel_jcc_erratum    = Node::_last_flag << 1,
 1258     Flag_sets_carry_flag      = Node::_last_flag << 2,
 1259     Flag_sets_parity_flag     = Node::_last_flag << 3,
 1260     Flag_sets_zero_flag       = Node::_last_flag << 4,
 1261     Flag_sets_overflow_flag   = Node::_last_flag << 5,
 1262     Flag_sets_sign_flag       = Node::_last_flag << 6,
 1263     Flag_clears_carry_flag    = Node::_last_flag << 7,
 1264     Flag_clears_parity_flag   = Node::_last_flag << 8,
 1265     Flag_clears_zero_flag     = Node::_last_flag << 9,
 1266     Flag_clears_overflow_flag = Node::_last_flag << 10,
 1267     Flag_clears_sign_flag     = Node::_last_flag << 11,
 1268     _last_flag                = Flag_clears_sign_flag
 1269   };
 1270 };
 1271 
 1272 %} // end source_hpp
 1273 
 1274 source %{
 1275 
 1276 #include "opto/addnode.hpp"
 1277 #include "c2_intelJccErratum_x86.hpp"
 1278 
 1279 void PhaseOutput::pd_perform_mach_node_analysis() {
 1280   if (VM_Version::has_intel_jcc_erratum()) {
 1281     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1282     _buf_sizes._code += extra_padding;
 1283   }
 1284 }
 1285 
 1286 int MachNode::pd_alignment_required() const {
 1287   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1288     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1289     return IntelJccErratum::largest_jcc_size() + 1;
 1290   } else {
 1291     return 1;
 1292   }
 1293 }
 1294 
 1295 int MachNode::compute_padding(int current_offset) const {
 1296   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1297     Compile* C = Compile::current();
 1298     PhaseOutput* output = C->output();
 1299     Block* block = output->block();
 1300     int index = output->index();
 1301     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1302   } else {
 1303     return 0;
 1304   }
 1305 }
 1306 
// Emit exception handler code.
// The handler is a jump to the runtime exception blob's entry point.
 1309 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1310 
 1311   // Note that the code buffer's insts_mark is always relative to insts.
 1312   // That's why we must use the macroassembler to generate a handler.
 1313   C2_MacroAssembler _masm(&cbuf);
 1314   address base = __ start_a_stub(size_exception_handler());
 1315   if (base == NULL) {
 1316     ciEnv::current()->record_failure("CodeCache is full");
 1317     return 0;  // CodeBuffer::expand failed
 1318   }
 1319   int offset = __ offset();
 1320   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1321   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1322   __ end_a_stub();
 1323   return offset;
 1324 }
 1325 
 1326 // Emit deopt handler code.
 1327 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1328 
 1329   // Note that the code buffer's insts_mark is always relative to insts.
 1330   // That's why we must use the macroassembler to generate a handler.
 1331   C2_MacroAssembler _masm(&cbuf);
 1332   address base = __ start_a_stub(size_deopt_handler());
 1333   if (base == NULL) {
 1334     ciEnv::current()->record_failure("CodeCache is full");
 1335     return 0;  // CodeBuffer::expand failed
 1336   }
 1337   int offset = __ offset();
 1338 
 1339 #ifdef _LP64
 1340   address the_pc = (address) __ pc();
 1341   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1344 
 1345   // push address of "next"
 1346   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1347   __ bind(next);
 1348   // adjust it so it matches "the_pc"
 1349   __ subptr(Address(rsp, 0), __ offset() - offset);
 1350 #else
 1351   InternalAddress here(__ pc());
 1352   __ pushptr(here.addr(), noreg);
 1353 #endif
 1354 
 1355   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1356   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1357   __ end_a_stub();
 1358   return offset;
 1359 }
 1360 
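// Map a Java BasicType to the Assembler's element width.
// For example, T_INT maps to Assembler::D (doubleword) and T_LONG to Assembler::Q (quadword).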
 1361 Assembler::Width widthForType(BasicType bt) {
 1362   if (bt == T_BYTE) {
 1363     return Assembler::B;
 1364   } else if (bt == T_SHORT) {
 1365     return Assembler::W;
 1366   } else if (bt == T_INT) {
 1367     return Assembler::D;
 1368   } else {
 1369     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1370     return Assembler::Q;
 1371   }
 1372 }
 1373 
 1374 //=============================================================================
 1375 
 1376   // Float masks come from different places depending on platform.
 1377 #ifdef _LP64
 1378   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1379   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1380   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1381   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1382 #else
 1383   static address float_signmask()  { return (address)float_signmask_pool; }
 1384   static address float_signflip()  { return (address)float_signflip_pool; }
 1385   static address double_signmask() { return (address)double_signmask_pool; }
 1386   static address double_signflip() { return (address)double_signflip_pool; }
 1387 #endif
 1388   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1389   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1390   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1391   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1392   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1393   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1394   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1395   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1396   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1397   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1398   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1399   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1400   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1401   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1402   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1403 
 1404 //=============================================================================
 1405 bool Matcher::match_rule_supported(int opcode) {
 1406   if (!has_match_rule(opcode)) {
 1407     return false; // no match rule present
 1408   }
 1409   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1410   switch (opcode) {
 1411     case Op_AbsVL:
 1412     case Op_StoreVectorScatter:
 1413       if (UseAVX < 3) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountI:
 1418     case Op_PopCountL:
 1419       if (!UsePopCountInstruction) {
 1420         return false;
 1421       }
 1422       break;
 1423     case Op_PopCountVI:
 1424       if (UseAVX < 2) {
 1425         return false;
 1426       }
 1427       break;
 1428     case Op_PopCountVL:
 1429       if (UseAVX < 2) {
 1430         return false;
 1431       }
 1432       break;
 1433     case Op_MulVI:
 1434       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1435         return false;
 1436       }
 1437       break;
 1438     case Op_MulVL:
 1439       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1440         return false;
 1441       }
 1442       break;
 1443     case Op_MulReductionVL:
      if (!VM_Version::supports_avx512dq()) {
 1445         return false;
 1446       }
 1447       break;
 1448     case Op_AddReductionVL:
 1449       if (UseSSE < 2) { // requires at least SSE2
 1450         return false;
 1451       }
 1452       break;
 1453     case Op_AbsVB:
 1454     case Op_AbsVS:
 1455     case Op_AbsVI:
 1456     case Op_AddReductionVI:
 1457     case Op_AndReductionV:
 1458     case Op_OrReductionV:
 1459     case Op_XorReductionV:
 1460       if (UseSSE < 3) { // requires at least SSSE3
 1461         return false;
 1462       }
 1463       break;
 1464     case Op_VectorLoadShuffle:
 1465     case Op_VectorRearrange:
 1466     case Op_MulReductionVI:
 1467       if (UseSSE < 4) { // requires at least SSE4
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_IsInfiniteF:
 1472     case Op_IsInfiniteD:
 1473       if (!VM_Version::supports_avx512dq()) {
 1474         return false;
 1475       }
 1476       break;
 1477     case Op_SqrtVD:
 1478     case Op_SqrtVF:
 1479     case Op_VectorMaskCmp:
 1480     case Op_VectorCastB2X:
 1481     case Op_VectorCastS2X:
 1482     case Op_VectorCastI2X:
 1483     case Op_VectorCastL2X:
 1484     case Op_VectorCastF2X:
 1485     case Op_VectorCastD2X:
 1486     case Op_VectorUCastB2X:
 1487     case Op_VectorUCastS2X:
 1488     case Op_VectorUCastI2X:
 1489     case Op_VectorMaskCast:
 1490       if (UseAVX < 1) { // enabled for AVX only
 1491         return false;
 1492       }
 1493       break;
 1494     case Op_PopulateIndex:
 1495       if (!is_LP64 || (UseAVX < 2)) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_RoundVF:
 1500       if (UseAVX < 2) { // enabled for AVX2 only
 1501         return false;
 1502       }
 1503       break;
 1504     case Op_RoundVD:
 1505       if (UseAVX < 3) {
 1506         return false;  // enabled for AVX3 only
 1507       }
 1508       break;
 1509     case Op_CompareAndSwapL:
 1510 #ifdef _LP64
 1511     case Op_CompareAndSwapP:
 1512 #endif
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
      if (!VM_Version::supports_on_spin_wait()) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
      if (!VM_Version::supports_avx()) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572       if (UseAVX < 2) {
 1573         return false;
 1574       }
 1575       break;
 1576     case Op_FmaF:
 1577     case Op_FmaD:
 1578     case Op_FmaVD:
 1579     case Op_FmaVF:
 1580       if (!UseFMA) {
 1581         return false;
 1582       }
 1583       break;
 1584     case Op_MacroLogicV:
 1585       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1586         return false;
 1587       }
 1588       break;
 1589 
 1590     case Op_VectorCmpMasked:
 1591     case Op_VectorMaskGen:
 1592       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1593         return false;
 1594       }
 1595       break;
 1596     case Op_VectorMaskFirstTrue:
 1597     case Op_VectorMaskLastTrue:
 1598     case Op_VectorMaskTrueCount:
 1599     case Op_VectorMaskToLong:
 1600       if (!is_LP64 || UseAVX < 1) {
 1601          return false;
 1602       }
 1603       break;
 1604     case Op_RoundF:
 1605     case Op_RoundD:
 1606       if (!is_LP64) {
 1607         return false;
 1608       }
 1609       break;
 1610     case Op_CopySignD:
 1611     case Op_CopySignF:
 1612       if (UseAVX < 3 || !is_LP64)  {
 1613         return false;
 1614       }
 1615       if (!VM_Version::supports_avx512vl()) {
 1616         return false;
 1617       }
 1618       break;
 1619 #ifndef _LP64
 1620     case Op_AddReductionVF:
 1621     case Op_AddReductionVD:
 1622     case Op_MulReductionVF:
 1623     case Op_MulReductionVD:
 1624       if (UseSSE < 1) { // requires at least SSE
 1625         return false;
 1626       }
 1627       break;
 1628     case Op_MulAddVS2VI:
 1629     case Op_RShiftVL:
 1630     case Op_AbsVD:
 1631     case Op_NegVD:
 1632       if (UseSSE < 2) {
 1633         return false;
 1634       }
 1635       break;
 1636 #endif // !LP64
 1637     case Op_CompressBits:
 1638       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1639         return false;
 1640       }
 1641       break;
 1642     case Op_ExpandBits:
 1643       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1644         return false;
 1645       }
 1646       break;
 1647     case Op_SignumF:
 1648       if (UseSSE < 1) {
 1649         return false;
 1650       }
 1651       break;
 1652     case Op_SignumD:
 1653       if (UseSSE < 2) {
 1654         return false;
 1655       }
 1656       break;
 1657     case Op_CompressM:
 1658       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1659         return false;
 1660       }
 1661       break;
 1662     case Op_CompressV:
 1663     case Op_ExpandV:
 1664       if (!VM_Version::supports_avx512vl()) {
 1665         return false;
 1666       }
 1667       break;
 1668     case Op_SqrtF:
 1669       if (UseSSE < 1) {
 1670         return false;
 1671       }
 1672       break;
 1673     case Op_SqrtD:
 1674 #ifdef _LP64
 1675       if (UseSSE < 2) {
 1676         return false;
 1677       }
 1678 #else
 1679       // x86_32.ad has a special match rule for SqrtD.
 1680       // Together with common x86 rules, this handles all UseSSE cases.
 1681 #endif
 1682       break;
 1683     case Op_ConvF2HF:
 1684     case Op_ConvHF2F:
 1685       if (!VM_Version::supports_float16()) {
 1686         return false;
 1687       }
 1688       break;
 1689     case Op_VectorCastF2HF:
 1690     case Op_VectorCastHF2F:
 1691       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1692         return false;
 1693       }
 1694       break;
 1695   }
 1696   return true;  // Match rules are supported by default.
 1697 }
 1698 
 1699 //------------------------------------------------------------------------
 1700 
 1701 static inline bool is_pop_count_instr_target(BasicType bt) {
 1702   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1703          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1704 }
 1705 
 1706 bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1707   return match_rule_supported_vector(opcode, vlen, bt);
 1708 }
 1709 
// Identify extra cases for which we might want to provide match rules for vector nodes
// and other intrinsics guarded by vector length (vlen) and element type (bt).
 1712 bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1713   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1714   if (!match_rule_supported(opcode)) {
 1715     return false;
 1716   }
 1717   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1718   //   * SSE2 supports 128bit vectors for all types;
 1719   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1720   //   * AVX2 supports 256bit vectors for all types;
 1721   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1722   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1723   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1724   // And MaxVectorSize is taken into account as well.
 1725   if (!vector_size_supported(bt, vlen)) {
 1726     return false;
 1727   }
 1728   // Special cases which require vector length follow:
 1729   //   * implementation limitations
 1730   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1731   //   * 128bit vroundpd instruction is present only in AVX1
 1732   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1733   switch (opcode) {
 1734     case Op_AbsVF:
 1735     case Op_NegVF:
      if (vlen == 16 && !VM_Version::supports_avx512dq()) {
 1737         return false; // 512bit vandps and vxorps are not available
 1738       }
 1739       break;
 1740     case Op_AbsVD:
 1741     case Op_NegVD:
      if (vlen == 8 && !VM_Version::supports_avx512dq()) {
 1743         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1744       }
 1745       break;
 1746     case Op_RotateRightV:
 1747     case Op_RotateLeftV:
 1748       if (bt != T_INT && bt != T_LONG) {
 1749         return false;
 1750       } // fallthrough
 1751     case Op_MacroLogicV:
 1752       if (!VM_Version::supports_evex() ||
 1753           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1754         return false;
 1755       }
 1756       break;
 1757     case Op_ClearArray:
 1758     case Op_VectorMaskGen:
 1759     case Op_VectorCmpMasked:
 1760       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1761         return false;
 1762       }
 1763       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1764         return false;
 1765       }
 1766       break;
 1767     case Op_LoadVectorMasked:
 1768     case Op_StoreVectorMasked:
 1769       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1770         return false;
 1771       }
 1772       break;
 1773     case Op_MaxV:
 1774     case Op_MinV:
 1775       if (UseSSE < 4 && is_integral_type(bt)) {
 1776         return false;
 1777       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512 bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
 1787       break;
 1788     case Op_CallLeafVector:
 1789       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1790         return false;
 1791       }
 1792       break;
 1793     case Op_AddReductionVI:
 1794       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1795         return false;
 1796       }
 1797       // fallthrough
 1798     case Op_AndReductionV:
 1799     case Op_OrReductionV:
 1800     case Op_XorReductionV:
 1801       if (is_subword_type(bt) && (UseSSE < 4)) {
 1802         return false;
 1803       }
 1804 #ifndef _LP64
 1805       if (bt == T_BYTE || bt == T_LONG) {
 1806         return false;
 1807       }
 1808 #endif
 1809       break;
 1810 #ifndef _LP64
 1811     case Op_VectorInsert:
 1812       if (bt == T_LONG || bt == T_DOUBLE) {
 1813         return false;
 1814       }
 1815       break;
 1816 #endif
 1817     case Op_MinReductionV:
 1818     case Op_MaxReductionV:
 1819       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1820         return false;
 1821       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1822         return false;
 1823       }
 1824       // Float/Double intrinsics enabled for AVX family.
 1825       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1826         return false;
 1827       }
 1828       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1829         return false;
 1830       }
 1831 #ifndef _LP64
 1832       if (bt == T_BYTE || bt == T_LONG) {
 1833         return false;
 1834       }
 1835 #endif
 1836       break;
 1837     case Op_VectorTest:
 1838       if (UseSSE < 4) {
 1839         return false; // Implementation limitation
 1840       } else if (size_in_bits < 32) {
 1841         return false; // Implementation limitation
 1842       }
 1843       break;
 1844     case Op_VectorLoadShuffle:
 1845     case Op_VectorRearrange:
      if (vlen == 2) {
 1847         return false; // Implementation limitation due to how shuffle is loaded
 1848       } else if (size_in_bits == 256 && UseAVX < 2) {
 1849         return false; // Implementation limitation
 1850       }
 1851       break;
 1852     case Op_VectorLoadMask:
 1853     case Op_VectorMaskCast:
 1854       if (size_in_bits == 256 && UseAVX < 2) {
 1855         return false; // Implementation limitation
 1856       }
 1857       // fallthrough
 1858     case Op_VectorStoreMask:
 1859       if (vlen == 2) {
 1860         return false; // Implementation limitation
 1861       }
 1862       break;
 1863     case Op_PopulateIndex:
 1864       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1865         return false;
 1866       }
 1867       break;
 1868     case Op_VectorCastB2X:
 1869     case Op_VectorCastS2X:
 1870     case Op_VectorCastI2X:
 1871       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1872         return false;
 1873       }
 1874       break;
 1875     case Op_VectorCastL2X:
 1876       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1877         return false;
 1878       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1879         return false;
 1880       }
 1881       break;
 1882     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256 bit vectors.
 1886         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1887         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1888           return false;
 1889         }
 1890       }
 1891       // fallthrough
 1892     case Op_VectorCastD2X:
 1893       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1894         return false;
 1895       }
 1896       break;
 1897     case Op_VectorCastF2HF:
 1898     case Op_VectorCastHF2F:
 1899       if (!VM_Version::supports_f16c() &&
 1900          ((!VM_Version::supports_evex() ||
 1901          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())))) {
 1902         return false;
 1903       }
 1904       break;
 1905     case Op_RoundVD:
 1906       if (!VM_Version::supports_avx512dq()) {
 1907         return false;
 1908       }
 1909       break;
 1910     case Op_MulReductionVI:
 1911       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1912         return false;
 1913       }
 1914       break;
 1915     case Op_LoadVectorGatherMasked:
 1916     case Op_StoreVectorScatterMasked:
 1917     case Op_StoreVectorScatter:
 1918       if (is_subword_type(bt)) {
 1919         return false;
 1920       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1921         return false;
 1922       }
 1923       // fallthrough
 1924     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1926         return false;
 1927       }
 1928       break;
 1929     case Op_MaskAll:
 1930       if (!VM_Version::supports_evex()) {
 1931         return false;
 1932       }
 1933       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1934         return false;
 1935       }
 1936       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1937         return false;
 1938       }
 1939       break;
 1940     case Op_VectorMaskCmp:
 1941       if (vlen < 2 || size_in_bits < 32) {
 1942         return false;
 1943       }
 1944       break;
 1945     case Op_CompressM:
 1946       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1947         return false;
 1948       }
 1949       break;
 1950     case Op_CompressV:
 1951     case Op_ExpandV:
 1952       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1953         return false;
 1954       }
      if (size_in_bits < 128) {
 1956         return false;
 1957       }
 1958       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1959         return false;
 1960       }
 1961       break;
 1962     case Op_VectorLongToMask:
 1963       if (UseAVX < 1 || !is_LP64) {
 1964         return false;
 1965       }
 1966       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1967         return false;
 1968       }
 1969       break;
 1970     case Op_SignumVD:
 1971     case Op_SignumVF:
 1972       if (UseAVX < 1) {
 1973         return false;
 1974       }
 1975       break;
 1976     case Op_PopCountVI:
 1977     case Op_PopCountVL: {
 1978         if (!is_pop_count_instr_target(bt) &&
 1979             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1980           return false;
 1981         }
 1982       }
 1983       break;
 1984     case Op_ReverseV:
 1985     case Op_ReverseBytesV:
 1986       if (UseAVX < 2) {
 1987         return false;
 1988       }
 1989       break;
 1990     case Op_CountTrailingZerosV:
 1991     case Op_CountLeadingZerosV:
 1992       if (UseAVX < 2) {
 1993         return false;
 1994       }
 1995       break;
 1996   }
  return true;  // By default, match rules are supported.
 1998 }
 1999 
 2000 bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most of the unary/binary/ternary masked operations share
  // the IR nodes of their non-masked counterparts, with the mask edge being the
  // differentiator. This routine does a strict check on the existence of masked
  // operation patterns: it returns false by default for all opcodes apart from the
  // ones whose masked instruction patterns are defined in this file.
 2007   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2008     return false;
 2009   }
 2010 
 2011   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2012   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2013   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2014     return false;
 2015   }
 2016   switch(opcode) {
 2017     // Unary masked operations
 2018     case Op_AbsVB:
 2019     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
 2021         return false;  // Implementation limitation
 2022       }
 2023     case Op_AbsVI:
 2024     case Op_AbsVL:
 2025       return true;
 2026 
 2027     // Ternary masked operations
 2028     case Op_FmaVF:
 2029     case Op_FmaVD:
 2030       return true;
 2031 
 2032     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2034         return false;
 2035       }
 2036       return true;
 2037 
 2038     // Binary masked operations
 2039     case Op_AddVB:
 2040     case Op_AddVS:
 2041     case Op_SubVB:
 2042     case Op_SubVS:
 2043     case Op_MulVS:
 2044     case Op_LShiftVS:
 2045     case Op_RShiftVS:
 2046     case Op_URShiftVS:
 2047       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2048       if (!VM_Version::supports_avx512bw()) {
 2049         return false;  // Implementation limitation
 2050       }
 2051       return true;
 2052 
 2053     case Op_MulVL:
 2054       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2055       if (!VM_Version::supports_avx512dq()) {
 2056         return false;  // Implementation limitation
 2057       }
 2058       return true;
 2059 
 2060     case Op_AndV:
 2061     case Op_OrV:
 2062     case Op_XorV:
 2063     case Op_RotateRightV:
 2064     case Op_RotateLeftV:
 2065       if (bt != T_INT && bt != T_LONG) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_VectorLoadMask:
 2071       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2072       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2073         return false;
 2074       }
 2075       return true;
 2076 
 2077     case Op_AddVI:
 2078     case Op_AddVL:
 2079     case Op_AddVF:
 2080     case Op_AddVD:
 2081     case Op_SubVI:
 2082     case Op_SubVL:
 2083     case Op_SubVF:
 2084     case Op_SubVD:
 2085     case Op_MulVI:
 2086     case Op_MulVF:
 2087     case Op_MulVD:
 2088     case Op_DivVF:
 2089     case Op_DivVD:
 2090     case Op_SqrtVF:
 2091     case Op_SqrtVD:
 2092     case Op_LShiftVI:
 2093     case Op_LShiftVL:
 2094     case Op_RShiftVI:
 2095     case Op_RShiftVL:
 2096     case Op_URShiftVI:
 2097     case Op_URShiftVL:
 2098     case Op_LoadVectorMasked:
 2099     case Op_StoreVectorMasked:
 2100     case Op_LoadVectorGatherMasked:
 2101     case Op_StoreVectorScatterMasked:
 2102       return true;
 2103 
 2104     case Op_MaxV:
 2105     case Op_MinV:
 2106       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2107         return false; // Implementation limitation
 2108       }
 2109       if (is_floating_point_type(bt)) {
 2110         return false; // Implementation limitation
 2111       }
 2112       return true;
 2113 
 2114     case Op_VectorMaskCmp:
 2115       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2116         return false; // Implementation limitation
 2117       }
 2118       return true;
 2119 
 2120     case Op_VectorRearrange:
 2121       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2122         return false; // Implementation limitation
 2123       }
 2124       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2125         return false; // Implementation limitation
 2126       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2127         return false; // Implementation limitation
 2128       }
 2129       return true;
 2130 
 2131     // Binary Logical operations
 2132     case Op_AndVMask:
 2133     case Op_OrVMask:
 2134     case Op_XorVMask:
 2135       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2136         return false; // Implementation limitation
 2137       }
 2138       return true;
 2139 
 2140     case Op_PopCountVI:
 2141     case Op_PopCountVL:
 2142       if (!is_pop_count_instr_target(bt)) {
 2143         return false;
 2144       }
 2145       return true;
 2146 
 2147     case Op_MaskAll:
 2148       return true;
 2149 
 2150     case Op_CountLeadingZerosV:
 2151       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2152         return true;
 2153       }
 2154     default:
 2155       return false;
 2156   }
 2157 }
 2158 
 2159 bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2160   return false;
 2161 }
 2162 
 2163 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2164   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2165   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2166   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2167       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2168     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2169     return new legVecZOper();
 2170   }
 2171   if (legacy) {
 2172     switch (ideal_reg) {
 2173       case Op_VecS: return new legVecSOper();
 2174       case Op_VecD: return new legVecDOper();
 2175       case Op_VecX: return new legVecXOper();
 2176       case Op_VecY: return new legVecYOper();
 2177       case Op_VecZ: return new legVecZOper();
 2178     }
 2179   } else {
 2180     switch (ideal_reg) {
 2181       case Op_VecS: return new vecSOper();
 2182       case Op_VecD: return new vecDOper();
 2183       case Op_VecX: return new vecXOper();
 2184       case Op_VecY: return new vecYOper();
 2185       case Op_VecZ: return new vecZOper();
 2186     }
 2187   }
 2188   ShouldNotReachHere();
 2189   return NULL;
 2190 }
 2191 
 2192 bool Matcher::is_reg2reg_move(MachNode* m) {
 2193   switch (m->rule()) {
 2194     case MoveVec2Leg_rule:
 2195     case MoveLeg2Vec_rule:
 2196     case MoveF2VL_rule:
 2197     case MoveF2LEG_rule:
 2198     case MoveVL2F_rule:
 2199     case MoveLEG2F_rule:
 2200     case MoveD2VL_rule:
 2201     case MoveD2LEG_rule:
 2202     case MoveVL2D_rule:
 2203     case MoveLEG2D_rule:
 2204       return true;
 2205     default:
 2206       return false;
 2207   }
 2208 }
 2209 
 2210 bool Matcher::is_generic_vector(MachOper* opnd) {
 2211   switch (opnd->opcode()) {
 2212     case VEC:
 2213     case LEGVEC:
 2214       return true;
 2215     default:
 2216       return false;
 2217   }
 2218 }
 2219 
 2220 //------------------------------------------------------------------------
 2221 
 2222 const RegMask* Matcher::predicate_reg_mask(void) {
 2223   return &_VECTMASK_REG_mask;
 2224 }
 2225 
 2226 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2227   return new TypeVectMask(elemTy, length);
 2228 }
 2229 
 2230 // Max vector size in bytes. 0 if not supported.
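// For example, with UseAVX == 2 and a sufficiently large MaxVectorSize, T_INT
// vectors are 32 bytes wide; with AVX512BW the subword types can reach 64 bytes.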
 2231 int Matcher::vector_width_in_bytes(BasicType bt) {
 2232   assert(is_java_primitive(bt), "only primitive type vectors");
 2233   if (UseSSE < 2) return 0;
 2234   // SSE2 supports 128bit vectors for all types.
 2235   // AVX2 supports 256bit vectors for all types.
  // AVX-512 (EVEX) supports 512bit vectors for all types.
 2237   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2238   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2239   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2240     size = (UseAVX > 2) ? 64 : 32;
 2241   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2242     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2243   // Use flag to limit vector size.
 2244   size = MIN2(size,(int)MaxVectorSize);
 2245   // Minimum 2 values in vector (or 4 for bytes).
 2246   switch (bt) {
 2247   case T_DOUBLE:
 2248   case T_LONG:
 2249     if (size < 16) return 0;
 2250     break;
 2251   case T_FLOAT:
 2252   case T_INT:
 2253     if (size < 8) return 0;
 2254     break;
 2255   case T_BOOLEAN:
 2256     if (size < 4) return 0;
 2257     break;
 2258   case T_CHAR:
 2259     if (size < 4) return 0;
 2260     break;
 2261   case T_BYTE:
 2262     if (size < 4) return 0;
 2263     break;
 2264   case T_SHORT:
 2265     if (size < 4) return 0;
 2266     break;
 2267   default:
 2268     ShouldNotReachHere();
 2269   }
 2270   return size;
 2271 }
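      // For illustration:
      //   UseAVX == 2                  -> (1 << 2) * 8 = 32 bytes for every element type
      //   UseAVX == 3 without AVX512BW -> 64 bytes, but only 32 bytes for T_BYTE/T_SHORT/T_CHAR
      //   UseAVX == 3 with AVX512BW    -> 64 bytes for all element types
      // The result is then clipped to MaxVectorSize and becomes 0 if fewer than the
      // minimum number of elements would fit.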
 2272 
 2273 // Limits on vector size (number of elements) loaded into vector.
 2274 int Matcher::max_vector_size(const BasicType bt) {
 2275   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2276 }
 2277 int Matcher::min_vector_size(const BasicType bt) {
 2278   int max_size = max_vector_size(bt);
 2279   // Minimum vector which can be loaded is 4 bytes: 4 one-byte elements or 2 elements of any wider type.
 2280   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 2281   // Support calling SVML routines on single-element (Double64) double vectors.
 2282   if (bt == T_DOUBLE) {
 2283     size = 1;
 2284   }
 2285   return MIN2(size,max_size);
 2286 }
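      // For illustration: T_BYTE needs at least 4 elements (4 bytes), T_INT and T_FLOAT
      // at least 2 elements, and T_DOUBLE is allowed down to a single element (Double64)
      // for the SVML case above; the result never exceeds max_vector_size(bt).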
 2287 
 2288 int Matcher::superword_max_vector_size(const BasicType bt) {
 2289   // Limit the max vector size for auto vectorization to 256 bits (32 bytes)
 2290   // by default on Cascade Lake
 2291   if (VM_Version::is_default_intel_cascade_lake()) {
 2292     return MIN2(Matcher::max_vector_size(bt), 32 / type2aelembytes(bt));
 2293   }
 2294   return Matcher::max_vector_size(bt);
 2295 }
 2296 
 2297 int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2298   return -1;
 2299 }
 2300 
 2301 // Vector ideal reg corresponding to specified size in bytes
 2302 uint Matcher::vector_ideal_reg(int size) {
 2303   assert(MaxVectorSize >= size, "");
 2304   switch(size) {
 2305     case  4: return Op_VecS;
 2306     case  8: return Op_VecD;
 2307     case 16: return Op_VecX;
 2308     case 32: return Op_VecY;
 2309     case 64: return Op_VecZ;
 2310   }
 2311   ShouldNotReachHere();
 2312   return 0;
 2313 }
 2314 
 2315 // Check for shift by small constant as well
 2316 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2317   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2318       shift->in(2)->get_int() <= 3 &&
 2319       // Are there other uses besides address expressions?
 2320       !matcher->is_visited(shift)) {
 2321     address_visited.set(shift->_idx); // Flag as address_visited
 2322     mstack.push(shift->in(2), Matcher::Visit);
 2323     Node *conv = shift->in(1);
 2324 #ifdef _LP64
 2325     // Allow the Matcher to match the rule which bypasses the
 2326     // ConvI2L operation for an array index on LP64
 2327     // if the index value is known to be non-negative.
 2328     if (conv->Opcode() == Op_ConvI2L &&
 2329         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2330         // Are there other uses besides address expressions?
 2331         !matcher->is_visited(conv)) {
 2332       address_visited.set(conv->_idx); // Flag as address_visited
 2333       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2334     } else
 2335 #endif
 2336       mstack.push(conv, Matcher::Pre_Visit);
 2337     return true;
 2338   }
 2339   return false;
 2340 }
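      // For illustration: an index expression like (LShiftX (ConvI2L i) #2) feeding an
      // address is cloned at each use so the matcher can fold it into a scaled-index
      // operand such as [base + i*4 + disp] instead of materializing i*4 in a register;
      // on LP64 the ConvI2L is bypassed when the index is known to be non-negative.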
 2341 
 2342 // This function identifies sub-graphs in which a 'load' node is
 2343 // input to two different nodes, such that the sub-graph can be matched
 2344 // with BMI instructions like blsi, blsr, etc.
 2345 // Example: b = -a[i] & a[i] can be matched to 'blsi r32, m32'.
 2346 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2347 // refers to the same node.
 2348 //
 2349 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2350 // This is a temporary solution until we make DAGs expressible in ADL.
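      // For illustration, the concrete shapes accepted by is_bmi_pattern() below are:
      //   blsi   (x & -x):      (AndL (SubL 0 LoadL*) LoadL*)
      //   blsr   (x & (x-1)):   (AndL (AddL LoadL* -1) LoadL*)
      //   blsmsk (x ^ (x-1)):   (XorL (AddL LoadL* -1) LoadL*)
      // plus the corresponding int variants, where LoadL* is the shared load node.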
 2351 template<typename ConType>
 2352 class FusedPatternMatcher {
 2353   Node* _op1_node;
 2354   Node* _mop_node;
 2355   int _con_op;
 2356 
 2357   static int match_next(Node* n, int next_op, int next_op_idx) {
 2358     if (n->in(1) == NULL || n->in(2) == NULL) {
 2359       return -1;
 2360     }
 2361 
 2362     if (next_op_idx == -1) { // n is commutative, try rotations
 2363       if (n->in(1)->Opcode() == next_op) {
 2364         return 1;
 2365       } else if (n->in(2)->Opcode() == next_op) {
 2366         return 2;
 2367       }
 2368     } else {
 2369       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2370       if (n->in(next_op_idx)->Opcode() == next_op) {
 2371         return next_op_idx;
 2372       }
 2373     }
 2374     return -1;
 2375   }
 2376 
 2377  public:
 2378   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2379     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2380 
 2381   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2382              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2383              typename ConType::NativeType con_value) {
 2384     if (_op1_node->Opcode() != op1) {
 2385       return false;
 2386     }
 2387     if (_mop_node->outcnt() > 2) {
 2388       return false;
 2389     }
 2390     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2391     if (op1_op2_idx == -1) {
 2392       return false;
 2393     }
 2394     // Memory operation must be the other edge
 2395     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2396 
 2397     // Check that the mop node is really what we want
 2398     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2399       Node* op2_node = _op1_node->in(op1_op2_idx);
 2400       if (op2_node->outcnt() > 1) {
 2401         return false;
 2402       }
 2403       assert(op2_node->Opcode() == op2, "Should be");
 2404       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2405       if (op2_con_idx == -1) {
 2406         return false;
 2407       }
 2408       // Memory operation must be the other edge
 2409       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2410       // Check that the memory operation is the same node
 2411       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2412         // Now check the constant
 2413         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2414         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2415           return true;
 2416         }
 2417       }
 2418     }
 2419     return false;
 2420   }
 2421 };
 2422 
 2423 static bool is_bmi_pattern(Node* n, Node* m) {
 2424   assert(UseBMI1Instructions, "sanity");
 2425   if (n != NULL && m != NULL) {
 2426     if (m->Opcode() == Op_LoadI) {
 2427       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2428       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2429              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2430              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2431     } else if (m->Opcode() == Op_LoadL) {
 2432       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2433       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2434              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2435              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2436     }
 2437   }
 2438   return false;
 2439 }
 2440 
 2441 // Should the matcher clone input 'm' of node 'n'?
 2442 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2443   // If 'n' and 'm' are part of a graph for a BMI instruction, clone the input 'm'.
 2444   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2445     mstack.push(m, Visit);
 2446     return true;
 2447   }
 2448   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2449     mstack.push(m, Visit);           // m = ShiftCntV
 2450     return true;
 2451   }
 2452   return false;
 2453 }
 2454 
 2455 // Should the Matcher clone shifts on addressing modes, expecting them
 2456 // to be subsumed into complex addressing expressions or compute them
 2457 // into registers?
 2458 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2459   Node *off = m->in(AddPNode::Offset);
 2460   if (off->is_Con()) {
 2461     address_visited.test_set(m->_idx); // Flag as address_visited
 2462     Node *adr = m->in(AddPNode::Address);
 2463 
 2464     // Intel can handle 2 adds in addressing mode
 2465     // AtomicAdd is not an addressing expression.
 2466     // Cheap to find it by looking for screwy base.
 2467     if (adr->is_AddP() &&
 2468         !adr->in(AddPNode::Base)->is_top() &&
 2469         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2470         // Are there other uses besides address expressions?
 2471         !is_visited(adr)) {
 2472       address_visited.set(adr->_idx); // Flag as address_visited
 2473       Node *shift = adr->in(AddPNode::Offset);
 2474       if (!clone_shift(shift, this, mstack, address_visited)) {
 2475         mstack.push(shift, Pre_Visit);
 2476       }
 2477       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2478       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2479     } else {
 2480       mstack.push(adr, Pre_Visit);
 2481     }
 2482 
 2483     // Clone X+offset as it also folds into most addressing expressions
 2484     mstack.push(off, Visit);
 2485     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2486     return true;
 2487   } else if (clone_shift(off, this, mstack, address_visited)) {
 2488     address_visited.test_set(m->_idx); // Flag as address_visited
 2489     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2490     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2491     return true;
 2492   }
 2493   return false;
 2494 }
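      // For illustration: an AddP with a constant offset on top of an inner AddP whose
      // offset is a small shifted index (e.g. idx << 3) is folded into a single x86
      // memory operand of the form [base + idx*8 + disp32], since the ISA allows
      // base + scaled index + 32-bit displacement in one addressing expression.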
 2495 
 2496 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2497   switch (bt) {
 2498     case BoolTest::eq:
 2499       return Assembler::eq;
 2500     case BoolTest::ne:
 2501       return Assembler::neq;
 2502     case BoolTest::le:
 2503     case BoolTest::ule:
 2504       return Assembler::le;
 2505     case BoolTest::ge:
 2506     case BoolTest::uge:
 2507       return Assembler::nlt;
 2508     case BoolTest::lt:
 2509     case BoolTest::ult:
 2510       return Assembler::lt;
 2511     case BoolTest::gt:
 2512     case BoolTest::ugt:
 2513       return Assembler::nle;
 2514     default : ShouldNotReachHere(); return Assembler::_false;
 2515   }
 2516 }
 2517 
 2518 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2519   switch (bt) {
 2520   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2521   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2522   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2523   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2524   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2525   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2526   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2527   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2528   }
 2529 }
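      // For illustration: with EQ_OQ (ordered) a compare against NaN yields false, so
      // NaN == NaN is false, while NEQ_UQ (unordered) makes NaN != NaN true, matching
      // the Java semantics where every relational compare involving NaN is false and
      // only != is true.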
 2530 
 2531 // Helper methods for MachSpillCopyNode::implementation().
 2532 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2533                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2534   assert(ireg == Op_VecS || // 32bit vector
 2535          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2536          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 2537          "no non-adjacent vector moves" );
 2538   if (cbuf) {
 2539     C2_MacroAssembler _masm(cbuf);
 2540     switch (ireg) {
 2541     case Op_VecS: // copy whole register
 2542     case Op_VecD:
 2543     case Op_VecX:
 2544 #ifndef _LP64
 2545       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2546 #else
 2547       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2548         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2549       } else {
 2550         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2551       }
 2552 #endif
 2553       break;
 2554     case Op_VecY:
 2555 #ifndef _LP64
 2556       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2557 #else
 2558       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2559         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2560       } else {
 2561         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2562       }
 2563 #endif
 2564       break;
 2565     case Op_VecZ:
 2566       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2567       break;
 2568     default:
 2569       ShouldNotReachHere();
 2570     }
 2571 #ifndef PRODUCT
 2572   } else {
 2573     switch (ireg) {
 2574     case Op_VecS:
 2575     case Op_VecD:
 2576     case Op_VecX:
 2577       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2578       break;
 2579     case Op_VecY:
 2580     case Op_VecZ:
 2581       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2582       break;
 2583     default:
 2584       ShouldNotReachHere();
 2585     }
 2586 #endif
 2587   }
 2588 }
 2589 
 2590 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2591                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2592   if (cbuf) {
 2593     C2_MacroAssembler _masm(cbuf);
 2594     if (is_load) {
 2595       switch (ireg) {
 2596       case Op_VecS:
 2597         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2598         break;
 2599       case Op_VecD:
 2600         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2601         break;
 2602       case Op_VecX:
 2603 #ifndef _LP64
 2604         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2605 #else
 2606         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2607           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2608         } else {
 2609           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2610           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2611         }
 2612 #endif
 2613         break;
 2614       case Op_VecY:
 2615 #ifndef _LP64
 2616         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2617 #else
 2618         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2619           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2620         } else {
 2621           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2622           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2623         }
 2624 #endif
 2625         break;
 2626       case Op_VecZ:
 2627         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2628         break;
 2629       default:
 2630         ShouldNotReachHere();
 2631       }
 2632     } else { // store
 2633       switch (ireg) {
 2634       case Op_VecS:
 2635         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2636         break;
 2637       case Op_VecD:
 2638         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2639         break;
 2640       case Op_VecX:
 2641 #ifndef _LP64
 2642         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2643 #else
 2644         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2645           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2646         } else {
 2648           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2649         }
 2650 #endif
 2651         break;
 2652       case Op_VecY:
 2653 #ifndef _LP64
 2654         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2655 #else
 2656         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2657           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2658         } else {
 2660           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2661         }
 2662 #endif
 2663         break;
 2664       case Op_VecZ:
 2665         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2666         break;
 2667       default:
 2668         ShouldNotReachHere();
 2669       }
 2670     }
 2671 #ifndef PRODUCT
 2672   } else {
 2673     if (is_load) {
 2674       switch (ireg) {
 2675       case Op_VecS:
 2676         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2677         break;
 2678       case Op_VecD:
 2679         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2680         break;
 2681       case Op_VecX:
 2682         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2683         break;
 2684       case Op_VecY:
 2685       case Op_VecZ:
 2686         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2687         break;
 2688       default:
 2689         ShouldNotReachHere();
 2690       }
 2691     } else { // store
 2692       switch (ireg) {
 2693       case Op_VecS:
 2694         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2695         break;
 2696       case Op_VecD:
 2697         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2698         break;
 2699       case Op_VecX:
 2700         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2701         break;
 2702       case Op_VecY:
 2703       case Op_VecZ:
 2704         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2705         break;
 2706       default:
 2707         ShouldNotReachHere();
 2708       }
 2709     }
 2710 #endif
 2711   }
 2712 }
 2713 
 2714 template <class T>
 2715 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2716   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2717   jvalue ele;
 2718   switch (bt) {
 2719     case T_BYTE:   ele.b = con; break;
 2720     case T_SHORT:  ele.s = con; break;
 2721     case T_INT:    ele.i = con; break;
 2722     case T_LONG:   ele.j = con; break;
 2723     case T_FLOAT:  ele.f = con; break;
 2724     case T_DOUBLE: ele.d = con; break;
 2725     default: ShouldNotReachHere();
 2726   }
 2727   for (int i = 0; i < len; i++) {
 2728     val->append(ele);
 2729   }
 2730   return val;
 2731 }
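      // For illustration: vreplicate_imm(T_INT, 7, 4) builds the jvalue array
      // {7, 7, 7, 7}, which can then be used to materialize a replicated vector
      // constant from the constant table.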
 2732 
 2733 static inline jlong high_bit_set(BasicType bt) {
 2734   switch (bt) {
 2735     case T_BYTE:  return 0x8080808080808080;
 2736     case T_SHORT: return 0x8000800080008000;
 2737     case T_INT:   return 0x8000000080000000;
 2738     case T_LONG:  return 0x8000000000000000;
 2739     default:
 2740       ShouldNotReachHere();
 2741       return 0;
 2742   }
 2743 }
 2744 
 2745 #ifndef PRODUCT
 2746   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2747     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2748   }
 2749 #endif
 2750 
 2751   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2752     C2_MacroAssembler _masm(&cbuf);
 2753     __ nop(_count);
 2754   }
 2755 
 2756   uint MachNopNode::size(PhaseRegAlloc*) const {
 2757     return _count;
 2758   }
 2759 
 2760 #ifndef PRODUCT
 2761   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2762     st->print("# breakpoint");
 2763   }
 2764 #endif
 2765 
 2766   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2767     C2_MacroAssembler _masm(&cbuf);
 2768     __ int3();
 2769   }
 2770 
 2771   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2772     return MachNode::size(ra_);
 2773   }
 2774 
 2775 %}
 2776 
 2777 encode %{
 2778 
 2779   enc_class call_epilog %{
 2780     C2_MacroAssembler _masm(&cbuf);
 2781     if (VerifyStackAtCalls) {
 2782       // Check that stack depth is unchanged: find majik cookie on stack
 2783       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2784       Label L;
 2785       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2786       __ jccb(Assembler::equal, L);
 2787       // Die if stack mismatch
 2788       __ int3();
 2789       __ bind(L);
 2790     }
 2791   %}
 2792 
 2793 %}
 2794 
 2795 // Operands for bound floating point register arguments
 2796 operand rxmm0() %{
 2797   constraint(ALLOC_IN_RC(xmm0_reg));
 2798   match(VecX);
 2799   format%{%}
 2800   interface(REG_INTER);
 2801 %}
 2802 
 2803 //----------OPERANDS-----------------------------------------------------------
 2804 // Operand definitions must precede instruction definitions for correct parsing
 2805 // in the ADLC because operands constitute user-defined types which are used in
 2806 // instruction definitions.
 2807 
 2808 // Vectors
 2809 
 2810 // Dummy generic vector class. Should be used for all vector operands.
 2811 // Replaced with vec[SDXYZ] during post-selection pass.
 2812 operand vec() %{
 2813   constraint(ALLOC_IN_RC(dynamic));
 2814   match(VecX);
 2815   match(VecY);
 2816   match(VecZ);
 2817   match(VecS);
 2818   match(VecD);
 2819 
 2820   format %{ %}
 2821   interface(REG_INTER);
 2822 %}
 2823 
 2824 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2825 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2826 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2827 // runtime code generation via reg_class_dynamic.
 2828 operand legVec() %{
 2829   constraint(ALLOC_IN_RC(dynamic));
 2830   match(VecX);
 2831   match(VecY);
 2832   match(VecZ);
 2833   match(VecS);
 2834   match(VecD);
 2835 
 2836   format %{ %}
 2837   interface(REG_INTER);
 2838 %}
 2839 
 2840 // Replaces vec during post-selection cleanup. See above.
 2841 operand vecS() %{
 2842   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2843   match(VecS);
 2844 
 2845   format %{ %}
 2846   interface(REG_INTER);
 2847 %}
 2848 
 2849 // Replaces legVec during post-selection cleanup. See above.
 2850 operand legVecS() %{
 2851   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2852   match(VecS);
 2853 
 2854   format %{ %}
 2855   interface(REG_INTER);
 2856 %}
 2857 
 2858 // Replaces vec during post-selection cleanup. See above.
 2859 operand vecD() %{
 2860   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2861   match(VecD);
 2862 
 2863   format %{ %}
 2864   interface(REG_INTER);
 2865 %}
 2866 
 2867 // Replaces legVec during post-selection cleanup. See above.
 2868 operand legVecD() %{
 2869   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2870   match(VecD);
 2871 
 2872   format %{ %}
 2873   interface(REG_INTER);
 2874 %}
 2875 
 2876 // Replaces vec during post-selection cleanup. See above.
 2877 operand vecX() %{
 2878   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2879   match(VecX);
 2880 
 2881   format %{ %}
 2882   interface(REG_INTER);
 2883 %}
 2884 
 2885 // Replaces legVec during post-selection cleanup. See above.
 2886 operand legVecX() %{
 2887   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2888   match(VecX);
 2889 
 2890   format %{ %}
 2891   interface(REG_INTER);
 2892 %}
 2893 
 2894 // Replaces vec during post-selection cleanup. See above.
 2895 operand vecY() %{
 2896   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2897   match(VecY);
 2898 
 2899   format %{ %}
 2900   interface(REG_INTER);
 2901 %}
 2902 
 2903 // Replaces legVec during post-selection cleanup. See above.
 2904 operand legVecY() %{
 2905   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2906   match(VecY);
 2907 
 2908   format %{ %}
 2909   interface(REG_INTER);
 2910 %}
 2911 
 2912 // Replaces vec during post-selection cleanup. See above.
 2913 operand vecZ() %{
 2914   constraint(ALLOC_IN_RC(vectorz_reg));
 2915   match(VecZ);
 2916 
 2917   format %{ %}
 2918   interface(REG_INTER);
 2919 %}
 2920 
 2921 // Replaces legVec during post-selection cleanup. See above.
 2922 operand legVecZ() %{
 2923   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2924   match(VecZ);
 2925 
 2926   format %{ %}
 2927   interface(REG_INTER);
 2928 %}
 2929 
 2930 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2931 
 2932 // ============================================================================
 2933 
 2934 instruct ShouldNotReachHere() %{
 2935   match(Halt);
 2936   format %{ "stop\t# ShouldNotReachHere" %}
 2937   ins_encode %{
 2938     if (is_reachable()) {
 2939       __ stop(_halt_reason);
 2940     }
 2941   %}
 2942   ins_pipe(pipe_slow);
 2943 %}
 2944 
 2945 // ============================================================================
 2946 
 2947 instruct addF_reg(regF dst, regF src) %{
 2948   predicate((UseSSE>=1) && (UseAVX == 0));
 2949   match(Set dst (AddF dst src));
 2950 
 2951   format %{ "addss   $dst, $src" %}
 2952   ins_cost(150);
 2953   ins_encode %{
 2954     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2955   %}
 2956   ins_pipe(pipe_slow);
 2957 %}
 2958 
 2959 instruct addF_mem(regF dst, memory src) %{
 2960   predicate((UseSSE>=1) && (UseAVX == 0));
 2961   match(Set dst (AddF dst (LoadF src)));
 2962 
 2963   format %{ "addss   $dst, $src" %}
 2964   ins_cost(150);
 2965   ins_encode %{
 2966     __ addss($dst$$XMMRegister, $src$$Address);
 2967   %}
 2968   ins_pipe(pipe_slow);
 2969 %}
 2970 
 2971 instruct addF_imm(regF dst, immF con) %{
 2972   predicate((UseSSE>=1) && (UseAVX == 0));
 2973   match(Set dst (AddF dst con));
 2974   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2975   ins_cost(150);
 2976   ins_encode %{
 2977     __ addss($dst$$XMMRegister, $constantaddress($con));
 2978   %}
 2979   ins_pipe(pipe_slow);
 2980 %}
 2981 
 2982 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2983   predicate(UseAVX > 0);
 2984   match(Set dst (AddF src1 src2));
 2985 
 2986   format %{ "vaddss  $dst, $src1, $src2" %}
 2987   ins_cost(150);
 2988   ins_encode %{
 2989     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2990   %}
 2991   ins_pipe(pipe_slow);
 2992 %}
 2993 
 2994 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2995   predicate(UseAVX > 0);
 2996   match(Set dst (AddF src1 (LoadF src2)));
 2997 
 2998   format %{ "vaddss  $dst, $src1, $src2" %}
 2999   ins_cost(150);
 3000   ins_encode %{
 3001     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3002   %}
 3003   ins_pipe(pipe_slow);
 3004 %}
 3005 
 3006 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3007   predicate(UseAVX > 0);
 3008   match(Set dst (AddF src con));
 3009 
 3010   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3011   ins_cost(150);
 3012   ins_encode %{
 3013     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3014   %}
 3015   ins_pipe(pipe_slow);
 3016 %}
 3017 
 3018 instruct addD_reg(regD dst, regD src) %{
 3019   predicate((UseSSE>=2) && (UseAVX == 0));
 3020   match(Set dst (AddD dst src));
 3021 
 3022   format %{ "addsd   $dst, $src" %}
 3023   ins_cost(150);
 3024   ins_encode %{
 3025     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3026   %}
 3027   ins_pipe(pipe_slow);
 3028 %}
 3029 
 3030 instruct addD_mem(regD dst, memory src) %{
 3031   predicate((UseSSE>=2) && (UseAVX == 0));
 3032   match(Set dst (AddD dst (LoadD src)));
 3033 
 3034   format %{ "addsd   $dst, $src" %}
 3035   ins_cost(150);
 3036   ins_encode %{
 3037     __ addsd($dst$$XMMRegister, $src$$Address);
 3038   %}
 3039   ins_pipe(pipe_slow);
 3040 %}
 3041 
 3042 instruct addD_imm(regD dst, immD con) %{
 3043   predicate((UseSSE>=2) && (UseAVX == 0));
 3044   match(Set dst (AddD dst con));
 3045   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3046   ins_cost(150);
 3047   ins_encode %{
 3048     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3049   %}
 3050   ins_pipe(pipe_slow);
 3051 %}
 3052 
 3053 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3054   predicate(UseAVX > 0);
 3055   match(Set dst (AddD src1 src2));
 3056 
 3057   format %{ "vaddsd  $dst, $src1, $src2" %}
 3058   ins_cost(150);
 3059   ins_encode %{
 3060     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3061   %}
 3062   ins_pipe(pipe_slow);
 3063 %}
 3064 
 3065 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3066   predicate(UseAVX > 0);
 3067   match(Set dst (AddD src1 (LoadD src2)));
 3068 
 3069   format %{ "vaddsd  $dst, $src1, $src2" %}
 3070   ins_cost(150);
 3071   ins_encode %{
 3072     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3073   %}
 3074   ins_pipe(pipe_slow);
 3075 %}
 3076 
 3077 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3078   predicate(UseAVX > 0);
 3079   match(Set dst (AddD src con));
 3080 
 3081   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3082   ins_cost(150);
 3083   ins_encode %{
 3084     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3085   %}
 3086   ins_pipe(pipe_slow);
 3087 %}
 3088 
 3089 instruct subF_reg(regF dst, regF src) %{
 3090   predicate((UseSSE>=1) && (UseAVX == 0));
 3091   match(Set dst (SubF dst src));
 3092 
 3093   format %{ "subss   $dst, $src" %}
 3094   ins_cost(150);
 3095   ins_encode %{
 3096     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3097   %}
 3098   ins_pipe(pipe_slow);
 3099 %}
 3100 
 3101 instruct subF_mem(regF dst, memory src) %{
 3102   predicate((UseSSE>=1) && (UseAVX == 0));
 3103   match(Set dst (SubF dst (LoadF src)));
 3104 
 3105   format %{ "subss   $dst, $src" %}
 3106   ins_cost(150);
 3107   ins_encode %{
 3108     __ subss($dst$$XMMRegister, $src$$Address);
 3109   %}
 3110   ins_pipe(pipe_slow);
 3111 %}
 3112 
 3113 instruct subF_imm(regF dst, immF con) %{
 3114   predicate((UseSSE>=1) && (UseAVX == 0));
 3115   match(Set dst (SubF dst con));
 3116   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3117   ins_cost(150);
 3118   ins_encode %{
 3119     __ subss($dst$$XMMRegister, $constantaddress($con));
 3120   %}
 3121   ins_pipe(pipe_slow);
 3122 %}
 3123 
 3124 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3125   predicate(UseAVX > 0);
 3126   match(Set dst (SubF src1 src2));
 3127 
 3128   format %{ "vsubss  $dst, $src1, $src2" %}
 3129   ins_cost(150);
 3130   ins_encode %{
 3131     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3132   %}
 3133   ins_pipe(pipe_slow);
 3134 %}
 3135 
 3136 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3137   predicate(UseAVX > 0);
 3138   match(Set dst (SubF src1 (LoadF src2)));
 3139 
 3140   format %{ "vsubss  $dst, $src1, $src2" %}
 3141   ins_cost(150);
 3142   ins_encode %{
 3143     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3144   %}
 3145   ins_pipe(pipe_slow);
 3146 %}
 3147 
 3148 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3149   predicate(UseAVX > 0);
 3150   match(Set dst (SubF src con));
 3151 
 3152   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3153   ins_cost(150);
 3154   ins_encode %{
 3155     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3156   %}
 3157   ins_pipe(pipe_slow);
 3158 %}
 3159 
 3160 instruct subD_reg(regD dst, regD src) %{
 3161   predicate((UseSSE>=2) && (UseAVX == 0));
 3162   match(Set dst (SubD dst src));
 3163 
 3164   format %{ "subsd   $dst, $src" %}
 3165   ins_cost(150);
 3166   ins_encode %{
 3167     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3168   %}
 3169   ins_pipe(pipe_slow);
 3170 %}
 3171 
 3172 instruct subD_mem(regD dst, memory src) %{
 3173   predicate((UseSSE>=2) && (UseAVX == 0));
 3174   match(Set dst (SubD dst (LoadD src)));
 3175 
 3176   format %{ "subsd   $dst, $src" %}
 3177   ins_cost(150);
 3178   ins_encode %{
 3179     __ subsd($dst$$XMMRegister, $src$$Address);
 3180   %}
 3181   ins_pipe(pipe_slow);
 3182 %}
 3183 
 3184 instruct subD_imm(regD dst, immD con) %{
 3185   predicate((UseSSE>=2) && (UseAVX == 0));
 3186   match(Set dst (SubD dst con));
 3187   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3188   ins_cost(150);
 3189   ins_encode %{
 3190     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3191   %}
 3192   ins_pipe(pipe_slow);
 3193 %}
 3194 
 3195 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3196   predicate(UseAVX > 0);
 3197   match(Set dst (SubD src1 src2));
 3198 
 3199   format %{ "vsubsd  $dst, $src1, $src2" %}
 3200   ins_cost(150);
 3201   ins_encode %{
 3202     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3203   %}
 3204   ins_pipe(pipe_slow);
 3205 %}
 3206 
 3207 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3208   predicate(UseAVX > 0);
 3209   match(Set dst (SubD src1 (LoadD src2)));
 3210 
 3211   format %{ "vsubsd  $dst, $src1, $src2" %}
 3212   ins_cost(150);
 3213   ins_encode %{
 3214     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3215   %}
 3216   ins_pipe(pipe_slow);
 3217 %}
 3218 
 3219 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3220   predicate(UseAVX > 0);
 3221   match(Set dst (SubD src con));
 3222 
 3223   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3224   ins_cost(150);
 3225   ins_encode %{
 3226     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3227   %}
 3228   ins_pipe(pipe_slow);
 3229 %}
 3230 
 3231 instruct mulF_reg(regF dst, regF src) %{
 3232   predicate((UseSSE>=1) && (UseAVX == 0));
 3233   match(Set dst (MulF dst src));
 3234 
 3235   format %{ "mulss   $dst, $src" %}
 3236   ins_cost(150);
 3237   ins_encode %{
 3238     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3239   %}
 3240   ins_pipe(pipe_slow);
 3241 %}
 3242 
 3243 instruct mulF_mem(regF dst, memory src) %{
 3244   predicate((UseSSE>=1) && (UseAVX == 0));
 3245   match(Set dst (MulF dst (LoadF src)));
 3246 
 3247   format %{ "mulss   $dst, $src" %}
 3248   ins_cost(150);
 3249   ins_encode %{
 3250     __ mulss($dst$$XMMRegister, $src$$Address);
 3251   %}
 3252   ins_pipe(pipe_slow);
 3253 %}
 3254 
 3255 instruct mulF_imm(regF dst, immF con) %{
 3256   predicate((UseSSE>=1) && (UseAVX == 0));
 3257   match(Set dst (MulF dst con));
 3258   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3259   ins_cost(150);
 3260   ins_encode %{
 3261     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3262   %}
 3263   ins_pipe(pipe_slow);
 3264 %}
 3265 
 3266 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3267   predicate(UseAVX > 0);
 3268   match(Set dst (MulF src1 src2));
 3269 
 3270   format %{ "vmulss  $dst, $src1, $src2" %}
 3271   ins_cost(150);
 3272   ins_encode %{
 3273     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3274   %}
 3275   ins_pipe(pipe_slow);
 3276 %}
 3277 
 3278 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3279   predicate(UseAVX > 0);
 3280   match(Set dst (MulF src1 (LoadF src2)));
 3281 
 3282   format %{ "vmulss  $dst, $src1, $src2" %}
 3283   ins_cost(150);
 3284   ins_encode %{
 3285     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3286   %}
 3287   ins_pipe(pipe_slow);
 3288 %}
 3289 
 3290 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3291   predicate(UseAVX > 0);
 3292   match(Set dst (MulF src con));
 3293 
 3294   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3295   ins_cost(150);
 3296   ins_encode %{
 3297     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3298   %}
 3299   ins_pipe(pipe_slow);
 3300 %}
 3301 
 3302 instruct mulD_reg(regD dst, regD src) %{
 3303   predicate((UseSSE>=2) && (UseAVX == 0));
 3304   match(Set dst (MulD dst src));
 3305 
 3306   format %{ "mulsd   $dst, $src" %}
 3307   ins_cost(150);
 3308   ins_encode %{
 3309     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3310   %}
 3311   ins_pipe(pipe_slow);
 3312 %}
 3313 
 3314 instruct mulD_mem(regD dst, memory src) %{
 3315   predicate((UseSSE>=2) && (UseAVX == 0));
 3316   match(Set dst (MulD dst (LoadD src)));
 3317 
 3318   format %{ "mulsd   $dst, $src" %}
 3319   ins_cost(150);
 3320   ins_encode %{
 3321     __ mulsd($dst$$XMMRegister, $src$$Address);
 3322   %}
 3323   ins_pipe(pipe_slow);
 3324 %}
 3325 
 3326 instruct mulD_imm(regD dst, immD con) %{
 3327   predicate((UseSSE>=2) && (UseAVX == 0));
 3328   match(Set dst (MulD dst con));
 3329   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3330   ins_cost(150);
 3331   ins_encode %{
 3332     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3333   %}
 3334   ins_pipe(pipe_slow);
 3335 %}
 3336 
 3337 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3338   predicate(UseAVX > 0);
 3339   match(Set dst (MulD src1 src2));
 3340 
 3341   format %{ "vmulsd  $dst, $src1, $src2" %}
 3342   ins_cost(150);
 3343   ins_encode %{
 3344     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3345   %}
 3346   ins_pipe(pipe_slow);
 3347 %}
 3348 
 3349 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3350   predicate(UseAVX > 0);
 3351   match(Set dst (MulD src1 (LoadD src2)));
 3352 
 3353   format %{ "vmulsd  $dst, $src1, $src2" %}
 3354   ins_cost(150);
 3355   ins_encode %{
 3356     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3357   %}
 3358   ins_pipe(pipe_slow);
 3359 %}
 3360 
 3361 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3362   predicate(UseAVX > 0);
 3363   match(Set dst (MulD src con));
 3364 
 3365   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3366   ins_cost(150);
 3367   ins_encode %{
 3368     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3369   %}
 3370   ins_pipe(pipe_slow);
 3371 %}
 3372 
 3373 instruct divF_reg(regF dst, regF src) %{
 3374   predicate((UseSSE>=1) && (UseAVX == 0));
 3375   match(Set dst (DivF dst src));
 3376 
 3377   format %{ "divss   $dst, $src" %}
 3378   ins_cost(150);
 3379   ins_encode %{
 3380     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3381   %}
 3382   ins_pipe(pipe_slow);
 3383 %}
 3384 
 3385 instruct divF_mem(regF dst, memory src) %{
 3386   predicate((UseSSE>=1) && (UseAVX == 0));
 3387   match(Set dst (DivF dst (LoadF src)));
 3388 
 3389   format %{ "divss   $dst, $src" %}
 3390   ins_cost(150);
 3391   ins_encode %{
 3392     __ divss($dst$$XMMRegister, $src$$Address);
 3393   %}
 3394   ins_pipe(pipe_slow);
 3395 %}
 3396 
 3397 instruct divF_imm(regF dst, immF con) %{
 3398   predicate((UseSSE>=1) && (UseAVX == 0));
 3399   match(Set dst (DivF dst con));
 3400   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3401   ins_cost(150);
 3402   ins_encode %{
 3403     __ divss($dst$$XMMRegister, $constantaddress($con));
 3404   %}
 3405   ins_pipe(pipe_slow);
 3406 %}
 3407 
 3408 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3409   predicate(UseAVX > 0);
 3410   match(Set dst (DivF src1 src2));
 3411 
 3412   format %{ "vdivss  $dst, $src1, $src2" %}
 3413   ins_cost(150);
 3414   ins_encode %{
 3415     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3416   %}
 3417   ins_pipe(pipe_slow);
 3418 %}
 3419 
 3420 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3421   predicate(UseAVX > 0);
 3422   match(Set dst (DivF src1 (LoadF src2)));
 3423 
 3424   format %{ "vdivss  $dst, $src1, $src2" %}
 3425   ins_cost(150);
 3426   ins_encode %{
 3427     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3428   %}
 3429   ins_pipe(pipe_slow);
 3430 %}
 3431 
 3432 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3433   predicate(UseAVX > 0);
 3434   match(Set dst (DivF src con));
 3435 
 3436   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3437   ins_cost(150);
 3438   ins_encode %{
 3439     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3440   %}
 3441   ins_pipe(pipe_slow);
 3442 %}
 3443 
 3444 instruct divD_reg(regD dst, regD src) %{
 3445   predicate((UseSSE>=2) && (UseAVX == 0));
 3446   match(Set dst (DivD dst src));
 3447 
 3448   format %{ "divsd   $dst, $src" %}
 3449   ins_cost(150);
 3450   ins_encode %{
 3451     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3452   %}
 3453   ins_pipe(pipe_slow);
 3454 %}
 3455 
 3456 instruct divD_mem(regD dst, memory src) %{
 3457   predicate((UseSSE>=2) && (UseAVX == 0));
 3458   match(Set dst (DivD dst (LoadD src)));
 3459 
 3460   format %{ "divsd   $dst, $src" %}
 3461   ins_cost(150);
 3462   ins_encode %{
 3463     __ divsd($dst$$XMMRegister, $src$$Address);
 3464   %}
 3465   ins_pipe(pipe_slow);
 3466 %}
 3467 
 3468 instruct divD_imm(regD dst, immD con) %{
 3469   predicate((UseSSE>=2) && (UseAVX == 0));
 3470   match(Set dst (DivD dst con));
 3471   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3472   ins_cost(150);
 3473   ins_encode %{
 3474     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3475   %}
 3476   ins_pipe(pipe_slow);
 3477 %}
 3478 
 3479 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3480   predicate(UseAVX > 0);
 3481   match(Set dst (DivD src1 src2));
 3482 
 3483   format %{ "vdivsd  $dst, $src1, $src2" %}
 3484   ins_cost(150);
 3485   ins_encode %{
 3486     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3487   %}
 3488   ins_pipe(pipe_slow);
 3489 %}
 3490 
 3491 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3492   predicate(UseAVX > 0);
 3493   match(Set dst (DivD src1 (LoadD src2)));
 3494 
 3495   format %{ "vdivsd  $dst, $src1, $src2" %}
 3496   ins_cost(150);
 3497   ins_encode %{
 3498     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3499   %}
 3500   ins_pipe(pipe_slow);
 3501 %}
 3502 
 3503 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3504   predicate(UseAVX > 0);
 3505   match(Set dst (DivD src con));
 3506 
 3507   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3508   ins_cost(150);
 3509   ins_encode %{
 3510     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3511   %}
 3512   ins_pipe(pipe_slow);
 3513 %}
 3514 
 3515 instruct absF_reg(regF dst) %{
 3516   predicate((UseSSE>=1) && (UseAVX == 0));
 3517   match(Set dst (AbsF dst));
 3518   ins_cost(150);
 3519   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3520   ins_encode %{
 3521     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3522   %}
 3523   ins_pipe(pipe_slow);
 3524 %}
 3525 
 3526 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3527   predicate(UseAVX > 0);
 3528   match(Set dst (AbsF src));
 3529   ins_cost(150);
 3530   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3531   ins_encode %{
 3532     int vlen_enc = Assembler::AVX_128bit;
 3533     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3534               ExternalAddress(float_signmask()), vlen_enc);
 3535   %}
 3536   ins_pipe(pipe_slow);
 3537 %}
 3538 
 3539 instruct absD_reg(regD dst) %{
 3540   predicate((UseSSE>=2) && (UseAVX == 0));
 3541   match(Set dst (AbsD dst));
 3542   ins_cost(150);
 3543   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3544             "# abs double by sign masking" %}
 3545   ins_encode %{
 3546     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3547   %}
 3548   ins_pipe(pipe_slow);
 3549 %}
 3550 
 3551 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3552   predicate(UseAVX > 0);
 3553   match(Set dst (AbsD src));
 3554   ins_cost(150);
 3555   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3556             "# abs double by sign masking" %}
 3557   ins_encode %{
 3558     int vlen_enc = Assembler::AVX_128bit;
 3559     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3560               ExternalAddress(double_signmask()), vlen_enc);
 3561   %}
 3562   ins_pipe(pipe_slow);
 3563 %}
 3564 
 3565 instruct negF_reg(regF dst) %{
 3566   predicate((UseSSE>=1) && (UseAVX == 0));
 3567   match(Set dst (NegF dst));
 3568   ins_cost(150);
 3569   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3570   ins_encode %{
 3571     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3572   %}
 3573   ins_pipe(pipe_slow);
 3574 %}
 3575 
 3576 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3577   predicate(UseAVX > 0);
 3578   match(Set dst (NegF src));
 3579   ins_cost(150);
 3580   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3581   ins_encode %{
 3582     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3583                  ExternalAddress(float_signflip()));
 3584   %}
 3585   ins_pipe(pipe_slow);
 3586 %}
 3587 
 3588 instruct negD_reg(regD dst) %{
 3589   predicate((UseSSE>=2) && (UseAVX == 0));
 3590   match(Set dst (NegD dst));
 3591   ins_cost(150);
 3592   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3593             "# neg double by sign flipping" %}
 3594   ins_encode %{
 3595     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3596   %}
 3597   ins_pipe(pipe_slow);
 3598 %}
 3599 
 3600 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3601   predicate(UseAVX > 0);
 3602   match(Set dst (NegD src));
 3603   ins_cost(150);
 3604   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3605             "# neg double by sign flipping" %}
 3606   ins_encode %{
 3607     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3608                  ExternalAddress(double_signflip()));
 3609   %}
 3610   ins_pipe(pipe_slow);
 3611 %}
 3612 
 3613 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
 3614 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3615 instruct sqrtF_reg(regF dst) %{
 3616   predicate(UseSSE>=1);
 3617   match(Set dst (SqrtF dst));
 3618   format %{ "sqrtss  $dst, $dst" %}
 3619   ins_encode %{
 3620     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3621   %}
 3622   ins_pipe(pipe_slow);
 3623 %}
 3624 
 3625 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
 3626 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3627 instruct sqrtD_reg(regD dst) %{
 3628   predicate(UseSSE>=2);
 3629   match(Set dst (SqrtD dst));
 3630   format %{ "sqrtsd  $dst, $dst" %}
 3631   ins_encode %{
 3632     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3633   %}
 3634   ins_pipe(pipe_slow);
 3635 %}
 3636 
 3637 instruct convF2HF_reg_reg(rRegI dst, vlRegF src, vlRegF tmp) %{
 3638   effect(TEMP tmp);
 3639   match(Set dst (ConvF2HF src));
 3640   ins_cost(125);
 3641   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3642   ins_encode %{
 3643     __ flt_to_flt16($dst$$Register, $src$$XMMRegister, $tmp$$XMMRegister);
 3644   %}
 3645   ins_pipe( pipe_slow );
 3646 %}
 3647 
 3648 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3649   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3650   effect(TEMP ktmp, TEMP rtmp);
 3651   match(Set mem (StoreC mem (ConvF2HF src)));
 3652   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3653   ins_encode %{
 3654     __ movl($rtmp$$Register, 0x1);
 3655     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3656     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3657   %}
 3658   ins_pipe( pipe_slow );
 3659 %}
 3660 
 3661 instruct vconvF2HF(vec dst, vec src) %{
 3662   match(Set dst (VectorCastF2HF src));
 3663   format %{ "vector_conv_F2HF $dst $src" %}
 3664   ins_encode %{
 3665     int vlen_enc = vector_length_encoding(this, $src);
 3666     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3667   %}
 3668   ins_pipe( pipe_slow );
 3669 %}
 3670 
 3671 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3672   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3673   format %{ "vcvtps2ph $mem,$src" %}
 3674   ins_encode %{
 3675     int vlen_enc = vector_length_encoding(this, $src);
 3676     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3677   %}
 3678   ins_pipe( pipe_slow );
 3679 %}
 3680 
 3681 instruct convHF2F_reg_reg(vlRegF dst, rRegI src) %{
 3682   match(Set dst (ConvHF2F src));
 3683   format %{ "vcvtph2ps $dst,$src" %}
 3684   ins_encode %{
 3685     __ flt16_to_flt($dst$$XMMRegister, $src$$Register);
 3686   %}
 3687   ins_pipe( pipe_slow );
 3688 %}
 3689 
 3690 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3691   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3692   format %{ "vcvtph2ps $dst,$mem" %}
 3693   ins_encode %{
 3694     int vlen_enc = vector_length_encoding(this);
 3695     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3696   %}
 3697   ins_pipe( pipe_slow );
 3698 %}
 3699 
 3700 instruct vconvHF2F(vec dst, vec src) %{
 3701   match(Set dst (VectorCastHF2F src));
 3702   ins_cost(125);
 3703   format %{ "vector_conv_HF2F $dst,$src" %}
 3704   ins_encode %{
 3705     int vlen_enc = vector_length_encoding(this);
 3706     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3707   %}
 3708   ins_pipe( pipe_slow );
 3709 %}
 3710 
 3711 // ---------------------------------------- VectorReinterpret ------------------------------------
 3712 instruct reinterpret_mask(kReg dst) %{
 3713   predicate(n->bottom_type()->isa_vectmask() &&
 3714             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3715   match(Set dst (VectorReinterpret dst));
 3716   ins_cost(125);
 3717   format %{ "vector_reinterpret $dst\t!" %}
 3718   ins_encode %{
 3719     // empty
 3720   %}
 3721   ins_pipe( pipe_slow );
 3722 %}
 3723 
 3724 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3725   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3726             n->bottom_type()->isa_vectmask() &&
 3727             n->in(1)->bottom_type()->isa_vectmask() &&
 3728             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3729             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3730   match(Set dst (VectorReinterpret src));
 3731   effect(TEMP xtmp);
 3732   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3733   ins_encode %{
 3734      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3735      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3736      assert(src_sz == dst_sz, "src and dst size mismatch");
 3737      int vlen_enc = vector_length_encoding(src_sz);
 3738      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3739      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3740   %}
 3741   ins_pipe( pipe_slow );
 3742 %}
 3743 
 3744 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3745   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3746             n->bottom_type()->isa_vectmask() &&
 3747             n->in(1)->bottom_type()->isa_vectmask() &&
 3748             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3749              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3750             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3751   match(Set dst (VectorReinterpret src));
 3752   effect(TEMP xtmp);
 3753   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3754   ins_encode %{
 3755      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3756      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3757      assert(src_sz == dst_sz, "src and dst size mismatch");
 3758      int vlen_enc = vector_length_encoding(src_sz);
 3759      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3760      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3761   %}
 3762   ins_pipe( pipe_slow );
 3763 %}
 3764 
 3765 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3766   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3767             n->bottom_type()->isa_vectmask() &&
 3768             n->in(1)->bottom_type()->isa_vectmask() &&
 3769             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3770              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3771             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3772   match(Set dst (VectorReinterpret src));
 3773   effect(TEMP xtmp);
 3774   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3775   ins_encode %{
 3776      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3777      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3778      assert(src_sz == dst_sz, "src and dst size mismatch");
 3779      int vlen_enc = vector_length_encoding(src_sz);
 3780      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3781      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3782   %}
 3783   ins_pipe( pipe_slow );
 3784 %}
 3785 
 3786 instruct reinterpret(vec dst) %{
 3787   predicate(!n->bottom_type()->isa_vectmask() &&
 3788             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3789   match(Set dst (VectorReinterpret dst));
 3790   ins_cost(125);
 3791   format %{ "vector_reinterpret $dst\t!" %}
 3792   ins_encode %{
 3793     // empty
 3794   %}
 3795   ins_pipe( pipe_slow );
 3796 %}
 3797 
 3798 instruct reinterpret_expand(vec dst, vec src) %{
 3799   predicate(UseAVX == 0 &&
 3800             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3801   match(Set dst (VectorReinterpret src));
 3802   ins_cost(125);
 3803   effect(TEMP dst);
 3804   format %{ "vector_reinterpret_expand $dst,$src" %}
 3805   ins_encode %{
 3806     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3807     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3808 
 3809     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3810     if (src_vlen_in_bytes == 4) {
 3811       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3812     } else {
 3813       assert(src_vlen_in_bytes == 8, "");
 3814       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3815     }
 3816     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3817   %}
 3818   ins_pipe( pipe_slow );
 3819 %}
 3820 
 3821 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3822   predicate(UseAVX > 0 &&
 3823             !n->bottom_type()->isa_vectmask() &&
 3824             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3825             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3826   match(Set dst (VectorReinterpret src));
 3827   ins_cost(125);
 3828   format %{ "vector_reinterpret_expand $dst,$src" %}
 3829   ins_encode %{
 3830     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3831   %}
 3832   ins_pipe( pipe_slow );
 3833 %}
 3834 
 3835 
 3836 instruct vreinterpret_expand(legVec dst, vec src) %{
 3837   predicate(UseAVX > 0 &&
 3838             !n->bottom_type()->isa_vectmask() &&
 3839             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3840             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3841   match(Set dst (VectorReinterpret src));
 3842   ins_cost(125);
 3843   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3844   ins_encode %{
 3845     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3846       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3847       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3848       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3849       default: ShouldNotReachHere();
 3850     }
 3851   %}
 3852   ins_pipe( pipe_slow );
 3853 %}
 3854 
 3855 instruct reinterpret_shrink(vec dst, legVec src) %{
 3856   predicate(!n->bottom_type()->isa_vectmask() &&
 3857             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3858   match(Set dst (VectorReinterpret src));
 3859   ins_cost(125);
 3860   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3861   ins_encode %{
 3862     switch (Matcher::vector_length_in_bytes(this)) {
 3863       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3864       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3865       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3866       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3867       default: ShouldNotReachHere();
 3868     }
 3869   %}
 3870   ins_pipe( pipe_slow );
 3871 %}
 3872 
 3873 // ----------------------------------------------------------------------------------------------------
 3874 
 3875 #ifdef _LP64
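      // Scalar and vector double rounding. The RoundDoubleMode/RoundDoubleModeV rmode
      // constant is used directly as the round-control immediate of roundsd/vroundpd/
      // vrndscalepd (0 = nearest/rint, 1 = floor, 2 = ceil).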
 3876 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3877   match(Set dst (RoundDoubleMode src rmode));
 3878   format %{ "roundsd $dst,$src" %}
 3879   ins_cost(150);
 3880   ins_encode %{
 3881     assert(UseSSE >= 4, "required");
 3882     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3883   %}
 3884   ins_pipe(pipe_slow);
 3885 %}
 3886 
 3887 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3888   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3889   format %{ "roundsd $dst,$src" %}
 3890   ins_cost(150);
 3891   ins_encode %{
 3892     assert(UseSSE >= 4, "required");
 3893     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3894   %}
 3895   ins_pipe(pipe_slow);
 3896 %}
 3897 
 3898 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3899   match(Set dst (RoundDoubleMode con rmode));
 3900   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3901   ins_cost(150);
 3902   ins_encode %{
 3903     assert(UseSSE >= 4, "required");
 3904     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3905   %}
 3906   ins_pipe(pipe_slow);
 3907 %}
 3908 
 3909 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3910   predicate(Matcher::vector_length(n) < 8);
 3911   match(Set dst (RoundDoubleModeV src rmode));
 3912   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3913   ins_encode %{
 3914     assert(UseAVX > 0, "required");
 3915     int vlen_enc = vector_length_encoding(this);
 3916     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3917   %}
 3918   ins_pipe( pipe_slow );
 3919 %}
 3920 
 3921 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3922   predicate(Matcher::vector_length(n) == 8);
 3923   match(Set dst (RoundDoubleModeV src rmode));
 3924   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3925   ins_encode %{
 3926     assert(UseAVX > 2, "required");
 3927     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3928   %}
 3929   ins_pipe( pipe_slow );
 3930 %}
 3931 
 3932 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3933   predicate(Matcher::vector_length(n) < 8);
 3934   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3935   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3936   ins_encode %{
 3937     assert(UseAVX > 0, "required");
 3938     int vlen_enc = vector_length_encoding(this);
 3939     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3940   %}
 3941   ins_pipe( pipe_slow );
 3942 %}
 3943 
 3944 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3945   predicate(Matcher::vector_length(n) == 8);
 3946   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3947   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3948   ins_encode %{
 3949     assert(UseAVX > 2, "required");
 3950     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3951   %}
 3952   ins_pipe( pipe_slow );
 3953 %}
 3954 #endif // _LP64
 3955 
 3956 instruct onspinwait() %{
 3957   match(OnSpinWait);
 3958   ins_cost(200);
 3959 
 3960   format %{
 3961     $$template
 3962     $$emit$$"pause\t! membar_onspinwait"
 3963   %}
 3964   ins_encode %{
 3965     __ pause();
 3966   %}
 3967   ins_pipe(pipe_slow);
 3968 %}
 3969 
 3970 // a * b + c
 3971 instruct fmaD_reg(regD a, regD b, regD c) %{
 3972   match(Set c (FmaD  c (Binary a b)));
 3973   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3974   ins_cost(150);
 3975   ins_encode %{
 3976     assert(UseFMA, "Needs FMA instruction support.");
 3977     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3978   %}
 3979   ins_pipe( pipe_slow );
 3980 %}
 3981 
 3982 // a * b + c
 3983 instruct fmaF_reg(regF a, regF b, regF c) %{
 3984   match(Set c (FmaF  c (Binary a b)));
 3985   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3986   ins_cost(150);
 3987   ins_encode %{
 3988     assert(UseFMA, "Needs FMA instruction support.");
 3989     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3990   %}
 3991   ins_pipe( pipe_slow );
 3992 %}
 3993 
 3994 // ====================VECTOR INSTRUCTIONS=====================================
 3995 
 3996 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3997 instruct MoveVec2Leg(legVec dst, vec src) %{
 3998   match(Set dst src);
 3999   format %{ "" %}
 4000   ins_encode %{
 4001     ShouldNotReachHere();
 4002   %}
 4003   ins_pipe( fpu_reg_reg );
 4004 %}
 4005 
 4006 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4007   match(Set dst src);
 4008   format %{ "" %}
 4009   ins_encode %{
 4010     ShouldNotReachHere();
 4011   %}
 4012   ins_pipe( fpu_reg_reg );
 4013 %}
 4014 
 4015 // ============================================================================
 4016 
 4017 // Load vectors generic operand pattern
 4018 instruct loadV(vec dst, memory mem) %{
 4019   match(Set dst (LoadVector mem));
 4020   ins_cost(125);
 4021   format %{ "load_vector $dst,$mem" %}
 4022   ins_encode %{
 4023     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4024   %}
 4025   ins_pipe( pipe_slow );
 4026 %}
 4027 
 4028 // Store vectors generic operand pattern.
 4029 instruct storeV(memory mem, vec src) %{
 4030   match(Set mem (StoreVector mem src));
 4031   ins_cost(145);
 4032   format %{ "store_vector $mem,$src" %}
 4033   ins_encode %{
 4034     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4035       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4036       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4037       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4038       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4039       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4040       default: ShouldNotReachHere();
 4041     }
 4042   %}
 4043   ins_pipe( pipe_slow );
 4044 %}
 4045 
 4046 // ---------------------------------------- Gather ------------------------------------
 4047 
 4048 // Gather INT, LONG, FLOAT, DOUBLE
 4049 
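      // Both forms below follow the same pattern: materialize an all-ones mask so every
      // lane is loaded (a vector register for the AVX2 vgather, an opmask register for
      // the AVX-512 evgather), flatten the memory operand into a single base register
      // with lea, and issue the gather with $idx supplying the per-lane indices.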
 4050 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4051   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4052   match(Set dst (LoadVectorGather mem idx));
 4053   effect(TEMP dst, TEMP tmp, TEMP mask);
 4054   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4055   ins_encode %{
 4056     assert(UseAVX >= 2, "sanity");
 4057 
 4058     int vlen_enc = vector_length_encoding(this);
 4059     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4060 
 4061     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4062     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4063 
 4064     if (vlen_enc == Assembler::AVX_128bit) {
 4065       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4066     } else {
 4067       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4068     }
 4069     __ lea($tmp$$Register, $mem$$Address);
 4070     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4071   %}
 4072   ins_pipe( pipe_slow );
 4073 %}
 4074 
 4075 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4076   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4077   match(Set dst (LoadVectorGather mem idx));
 4078   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4079   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4080   ins_encode %{
 4081     assert(UseAVX > 2, "sanity");
 4082 
 4083     int vlen_enc = vector_length_encoding(this);
 4084     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4085 
 4086     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4087 
 4088     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4089     __ lea($tmp$$Register, $mem$$Address);
 4090     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4091   %}
 4092   ins_pipe( pipe_slow );
 4093 %}
 4094 
 4095 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4096   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4097   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4098   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4099   ins_encode %{
 4100     assert(UseAVX > 2, "sanity");
 4101     int vlen_enc = vector_length_encoding(this);
 4102     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4103     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4104     // Note: Since the gather instruction partially updates the opmask register used
 4105     // for predication, the mask operand is first copied to a temporary.
 4106     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
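          // Zero $dst up front so that lanes the mask leaves unloaded read as zero.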
 4107     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4108     __ lea($tmp$$Register, $mem$$Address);
 4109     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4110   %}
 4111   ins_pipe( pipe_slow );
 4112 %}
 4113 // ====================Scatter=======================================
 4114 
 4115 // Scatter INT, LONG, FLOAT, DOUBLE
 4116 
 4117 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4118   predicate(UseAVX > 2);
 4119   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4120   effect(TEMP tmp, TEMP ktmp);
 4121   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4122   ins_encode %{
 4123     int vlen_enc = vector_length_encoding(this, $src);
 4124     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4125 
 4126     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4127     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4128 
 4129     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4130     __ lea($tmp$$Register, $mem$$Address);
 4131     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4132   %}
 4133   ins_pipe( pipe_slow );
 4134 %}
 4135 
 4136 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4137   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4138   effect(TEMP tmp, TEMP ktmp);
 4139   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4140   ins_encode %{
 4141     int vlen_enc = vector_length_encoding(this, $src);
 4142     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4143     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4144     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4145     // Note: Since the scatter instruction partially updates the opmask register used
 4146     // for predication, the mask operand is first copied to a temporary.
 4147     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4148     __ lea($tmp$$Register, $mem$$Address);
 4149     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4150   %}
 4151   ins_pipe( pipe_slow );
 4152 %}
 4153 
 4154 // ====================REPLICATE=======================================
 4155 
 4156 // Replicate byte scalar to be vector
 4157 instruct vReplB_reg(vec dst, rRegI src) %{
 4158   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 4159   match(Set dst (Replicate src));
 4160   format %{ "replicateB $dst,$src" %}
 4161   ins_encode %{
 4162     uint vlen = Matcher::vector_length(this);
 4163     if (UseAVX >= 2) {
 4164       int vlen_enc = vector_length_encoding(this);
 4165       if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4166         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4167         __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4168       } else {
 4169         __ movdl($dst$$XMMRegister, $src$$Register);
 4170         __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4171       }
 4172     } else {
 4173       assert(UseAVX < 2, "");
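            // Pre-AVX2 splat: movdl puts the byte in lane 0, punpcklbw pairs it with
            // itself, pshuflw copies that pair across the low 8 bytes, and (for 16-byte
            // vectors) punpcklqdq mirrors the low qword into the high qword.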
 4174       __ movdl($dst$$XMMRegister, $src$$Register);
 4175       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4176       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4177       if (vlen >= 16) {
 4178         assert(vlen == 16, "");
 4179         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4180       }
 4181     }
 4182   %}
 4183   ins_pipe( pipe_slow );
 4184 %}
 4185 
 4186 instruct ReplB_mem(vec dst, memory mem) %{
 4187   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_BYTE);
 4188   match(Set dst (Replicate (LoadB mem)));
 4189   format %{ "replicateB $dst,$mem" %}
 4190   ins_encode %{
 4191     int vlen_enc = vector_length_encoding(this);
 4192     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4193   %}
 4194   ins_pipe( pipe_slow );
 4195 %}
 4196 
 4197 // ====================ReplicateS=======================================
 4198 
 4199 instruct vReplS_reg(vec dst, rRegI src) %{
 4200   predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
 4201   match(Set dst (Replicate src));
 4202   format %{ "replicateS $dst,$src" %}
 4203   ins_encode %{
 4204     uint vlen = Matcher::vector_length(this);
 4205     int vlen_enc = vector_length_encoding(this);
 4206     if (UseAVX >= 2) {
 4207       if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4208         assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4209         __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4210       } else {
 4211         __ movdl($dst$$XMMRegister, $src$$Register);
 4212         __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4213       }
 4214     } else {
 4215       assert(UseAVX < 2, "");
 4216       __ movdl($dst$$XMMRegister, $src$$Register);
 4217       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4218       if (vlen >= 8) {
 4219         assert(vlen == 8, "");
 4220         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4221       }
 4222     }
 4223   %}
 4224   ins_pipe( pipe_slow );
 4225 %}
 4226 
 4227 instruct ReplS_mem(vec dst, memory mem) %{
 4228   predicate(UseAVX >= 2 && Matcher::vector_element_basic_type(n) == T_SHORT);
 4229   match(Set dst (Replicate (LoadS mem)));
 4230   format %{ "replicateS $dst,$mem" %}
 4231   ins_encode %{
 4232     int vlen_enc = vector_length_encoding(this);
 4233     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4234   %}
 4235   ins_pipe( pipe_slow );
 4236 %}
 4237 
 4238 // ====================ReplicateI=======================================
 4239 
 4240 instruct ReplI_reg(vec dst, rRegI src) %{
 4241   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4242   match(Set dst (Replicate src));
 4243   format %{ "replicateI $dst,$src" %}
 4244   ins_encode %{
 4245     uint vlen = Matcher::vector_length(this);
 4246     int vlen_enc = vector_length_encoding(this);
 4247     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4248       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4249     } else if (VM_Version::supports_avx2()) {
 4250       __ movdl($dst$$XMMRegister, $src$$Register);
 4251       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4252     } else {
 4253       __ movdl($dst$$XMMRegister, $src$$Register);
 4254       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4255     }
 4256   %}
 4257   ins_pipe( pipe_slow );
 4258 %}
 4259 
 4260 instruct ReplI_mem(vec dst, memory mem) %{
 4261   predicate(Matcher::vector_element_basic_type(n) == T_INT);
 4262   match(Set dst (Replicate (LoadI mem)));
 4263   format %{ "replicateI $dst,$mem" %}
 4264   ins_encode %{
 4265     int vlen_enc = vector_length_encoding(this);
 4266     if (VM_Version::supports_avx2()) {
 4267       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4268     } else if (VM_Version::supports_avx()) {
 4269       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4270     } else {
 4271       __ movdl($dst$$XMMRegister, $mem$$Address);
 4272       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4273     }
 4274   %}
 4275   ins_pipe( pipe_slow );
 4276 %}
 4277 
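      // Replicate non-long integral scalar immediate to be vector by loading from const table.
      // The third vreplicate_imm argument is the number of copies of the immediate placed in
      // the constant: 4 bytes worth when AVX broadcasts are available, otherwise 8 bytes
      // worth, divided by the element size.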
 4278 instruct ReplI_imm(vec dst, immI con) %{
 4279   predicate(Matcher::is_non_long_integral_vector(n));
 4280   match(Set dst (Replicate con));
 4281   format %{ "replicateI $dst,$con" %}
 4282   ins_encode %{
 4283     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4284         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4285             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4286                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4287     BasicType bt = Matcher::vector_element_basic_type(this);
 4288     int vlen = Matcher::vector_length_in_bytes(this);
 4289     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4290   %}
 4291   ins_pipe( pipe_slow );
 4292 %}
 4293 
 4294 // Replicate scalar zero to be vector
 4295 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4296   predicate(Matcher::is_non_long_integral_vector(n));
 4297   match(Set dst (Replicate zero));
 4298   format %{ "replicateI $dst,$zero" %}
 4299   ins_encode %{
 4300     int vlen_enc = vector_length_encoding(this);
 4301     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4302       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4303     } else {
 4304       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4305     }
 4306   %}
 4307   ins_pipe( fpu_reg_reg );
 4308 %}
 4309 
 4310 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4311   predicate(UseSSE >= 2 && Matcher::is_non_long_integral_vector(n));
 4312   match(Set dst (Replicate con));
 4313   format %{ "vallones $dst" %}
 4314   ins_encode %{
 4315     int vector_len = vector_length_encoding(this);
 4316     __ vallones($dst$$XMMRegister, vector_len);
 4317   %}
 4318   ins_pipe( pipe_slow );
 4319 %}
 4320 
 4321 // ====================ReplicateL=======================================
 4322 
 4323 #ifdef _LP64
 4324 // Replicate long (8 byte) scalar to be vector
 4325 instruct ReplL_reg(vec dst, rRegL src) %{
 4326   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4327   match(Set dst (Replicate src));
 4328   format %{ "replicateL $dst,$src" %}
 4329   ins_encode %{
 4330     int vlen = Matcher::vector_length(this);
 4331     int vlen_enc = vector_length_encoding(this);
 4332     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4333       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4334     } else if (VM_Version::supports_avx2()) {
 4335       __ movdq($dst$$XMMRegister, $src$$Register);
 4336       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4337     } else {
 4338       __ movdq($dst$$XMMRegister, $src$$Register);
 4339       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4340     }
 4341   %}
 4342   ins_pipe( pipe_slow );
 4343 %}
 4344 #else // _LP64
 4345 // Replicate long (8 byte) scalar to be vector
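      // Without _LP64 the long arrives in a register pair, so its low and high 32-bit
      // halves are moved into XMM separately and recombined with punpckldq before the
      // broadcast proper.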
 4346 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4347   predicate(Matcher::vector_length(n) <= 4 && Matcher::vector_element_basic_type(n) == T_LONG);
 4348   match(Set dst (Replicate src));
 4349   effect(TEMP dst, USE src, TEMP tmp);
 4350   format %{ "replicateL $dst,$src" %}
 4351   ins_encode %{
 4352     uint vlen = Matcher::vector_length(this);
 4353     if (vlen == 2) {
 4354       __ movdl($dst$$XMMRegister, $src$$Register);
 4355       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4356       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4357       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4358     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4359       int vlen_enc = Assembler::AVX_256bit;
 4360       __ movdl($dst$$XMMRegister, $src$$Register);
 4361       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4362       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4363       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4364     } else {
 4365       __ movdl($dst$$XMMRegister, $src$$Register);
 4366       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4367       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4368       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4369       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4370     }
 4371   %}
 4372   ins_pipe( pipe_slow );
 4373 %}
 4374 
 4375 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4376   predicate(Matcher::vector_length(n) == 8 && Matcher::vector_element_basic_type(n) == T_LONG);
 4377   match(Set dst (Replicate src));
 4378   effect(TEMP dst, USE src, TEMP tmp);
 4379   format %{ "replicateL $dst,$src" %}
 4380   ins_encode %{
 4381     if (VM_Version::supports_avx512vl()) {
 4382       __ movdl($dst$$XMMRegister, $src$$Register);
 4383       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4384       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4385       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4386       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4387       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4388     } else {
 4389       int vlen_enc = Assembler::AVX_512bit;
 4390       __ movdl($dst$$XMMRegister, $src$$Register);
 4391       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4392       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4393       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4394     }
 4395   %}
 4396   ins_pipe( pipe_slow );
 4397 %}
 4398 #endif // _LP64
 4399 
 4400 instruct ReplL_mem(vec dst, memory mem) %{
 4401   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4402   match(Set dst (Replicate (LoadL mem)));
 4403   format %{ "replicateL $dst,$mem" %}
 4404   ins_encode %{
 4405     int vlen_enc = vector_length_encoding(this);
 4406     if (VM_Version::supports_avx2()) {
 4407       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4408     } else if (VM_Version::supports_sse3()) {
 4409       __ movddup($dst$$XMMRegister, $mem$$Address);
 4410     } else {
 4411       __ movq($dst$$XMMRegister, $mem$$Address);
 4412       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4413     }
 4414   %}
 4415   ins_pipe( pipe_slow );
 4416 %}
 4417 
 4418 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4419 instruct ReplL_imm(vec dst, immL con) %{
 4420   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4421   match(Set dst (Replicate con));
 4422   format %{ "replicateL $dst,$con" %}
 4423   ins_encode %{
 4424     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4425     int vlen = Matcher::vector_length_in_bytes(this);
 4426     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4427   %}
 4428   ins_pipe( pipe_slow );
 4429 %}
 4430 
 4431 instruct ReplL_zero(vec dst, immL0 zero) %{
 4432   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 4433   match(Set dst (Replicate zero));
 4434   format %{ "replicateL $dst,$zero" %}
 4435   ins_encode %{
 4436     int vlen_enc = vector_length_encoding(this);
 4437     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4438       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4439     } else {
 4440       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4441     }
 4442   %}
 4443   ins_pipe( fpu_reg_reg );
 4444 %}
 4445 
 4446 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4447   predicate(UseSSE >= 2 && Matcher::vector_element_basic_type(n) == T_LONG);
 4448   match(Set dst (Replicate con));
 4449   format %{ "vallones $dst" %}
 4450   ins_encode %{
 4451     int vector_len = vector_length_encoding(this);
 4452     __ vallones($dst$$XMMRegister, vector_len);
 4453   %}
 4454   ins_pipe( pipe_slow );
 4455 %}
 4456 
 4457 // ====================ReplicateF=======================================
 4458 
 4459 instruct vReplF_reg(vec dst, vlRegF src) %{
 4460   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4461   match(Set dst (Replicate src));
 4462   format %{ "replicateF $dst,$src" %}
 4463   ins_encode %{
 4464     uint vlen = Matcher::vector_length(this);
 4465     int vlen_enc = vector_length_encoding(this);
 4466     if (vlen <= 4) {
 4467       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4468     } else if (VM_Version::supports_avx2()) {
 4469       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4470     } else {
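            // AVX1 without AVX2: there is no reg-to-reg broadcast, so splat within the
            // low 128 bits and mirror that half into the upper half of the 256-bit result.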
 4471       assert(vlen == 8, "sanity");
 4472       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4473       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4474     }
 4475   %}
 4476   ins_pipe( pipe_slow );
 4477 %}
 4478 
 4479 instruct ReplF_reg(vec dst, vlRegF src) %{
 4480   predicate(UseAVX == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4481   match(Set dst (Replicate src));
 4482   format %{ "replicateF $dst,$src" %}
 4483   ins_encode %{
 4484     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4485   %}
 4486   ins_pipe( pipe_slow );
 4487 %}
 4488 
 4489 instruct ReplF_mem(vec dst, memory mem) %{
 4490   predicate(UseAVX > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
 4491   match(Set dst (Replicate (LoadF mem)));
 4492   format %{ "replicateF $dst,$mem" %}
 4493   ins_encode %{
 4494     int vlen_enc = vector_length_encoding(this);
 4495     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4496   %}
 4497   ins_pipe( pipe_slow );
 4498 %}
 4499 
 4500 // Replicate float scalar immediate to be vector by loading from const table.
 4501 instruct ReplF_imm(vec dst, immF con) %{
 4502   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4503   match(Set dst (Replicate con));
 4504   format %{ "replicateF $dst,$con" %}
 4505   ins_encode %{
 4506     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4507         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4508     int vlen = Matcher::vector_length_in_bytes(this);
 4509     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4510   %}
 4511   ins_pipe( pipe_slow );
 4512 %}
 4513 
 4514 instruct ReplF_zero(vec dst, immF0 zero) %{
 4515   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 4516   match(Set dst (Replicate zero));
 4517   format %{ "replicateF $dst,$zero" %}
 4518   ins_encode %{
 4519     int vlen_enc = vector_length_encoding(this);
 4520     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4521       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4522     } else {
 4523       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4524     }
 4525   %}
 4526   ins_pipe( fpu_reg_reg );
 4527 %}
 4528 
 4529 // ====================ReplicateD=======================================
 4530 
 4531 // Replicate double (8 bytes) scalar to be vector
 4532 instruct vReplD_reg(vec dst, vlRegD src) %{
 4533   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4534   match(Set dst (Replicate src));
 4535   format %{ "replicateD $dst,$src" %}
 4536   ins_encode %{
 4537     uint vlen = Matcher::vector_length(this);
 4538     int vlen_enc = vector_length_encoding(this);
 4539     if (vlen <= 2) {
 4540       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4541     } else if (VM_Version::supports_avx2()) {
 4542       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4543     } else {
 4544       assert(vlen == 4, "sanity");
 4545       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4546       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4547     }
 4548   %}
 4549   ins_pipe( pipe_slow );
 4550 %}
 4551 
 4552 instruct ReplD_reg(vec dst, vlRegD src) %{
 4553   predicate(UseSSE < 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4554   match(Set dst (Replicate src));
 4555   format %{ "replicateD $dst,$src" %}
 4556   ins_encode %{
 4557     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4558   %}
 4559   ins_pipe( pipe_slow );
 4560 %}
 4561 
 4562 instruct ReplD_mem(vec dst, memory mem) %{
 4563   predicate(UseSSE >= 3 && Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4564   match(Set dst (Replicate (LoadD mem)));
 4565   format %{ "replicateD $dst,$mem" %}
 4566   ins_encode %{
 4567     if (Matcher::vector_length(this) >= 4) {
 4568       int vlen_enc = vector_length_encoding(this);
 4569       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4570     } else {
 4571       __ movddup($dst$$XMMRegister, $mem$$Address);
 4572     }
 4573   %}
 4574   ins_pipe( pipe_slow );
 4575 %}
 4576 
 4577 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4578 instruct ReplD_imm(vec dst, immD con) %{
 4579   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4580   match(Set dst (Replicate con));
 4581   format %{ "replicateD $dst,$con" %}
 4582   ins_encode %{
 4583     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4584     int vlen = Matcher::vector_length_in_bytes(this);
 4585     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4586   %}
 4587   ins_pipe( pipe_slow );
 4588 %}
 4589 
 4590 instruct ReplD_zero(vec dst, immD0 zero) %{
 4591   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 4592   match(Set dst (Replicate zero));
 4593   format %{ "replicateD $dst,$zero" %}
 4594   ins_encode %{
 4595     int vlen_enc = vector_length_encoding(this);
 4596     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4597       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4598     } else {
 4599       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4600     }
 4601   %}
 4602   ins_pipe( fpu_reg_reg );
 4603 %}
 4604 
 4605 // ====================VECTOR INSERT=======================================
 4606 
 4607 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4608   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4609   match(Set dst (VectorInsert (Binary dst val) idx));
 4610   format %{ "vector_insert $dst,$val,$idx" %}
 4611   ins_encode %{
 4612     assert(UseSSE >= 4, "required");
 4613     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4614 
 4615     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4616 
 4617     assert(is_integral_type(elem_bt), "");
 4618     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4619 
 4620     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4621   %}
 4622   ins_pipe( pipe_slow );
 4623 %}
 4624 
 4625 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4626   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4627   match(Set dst (VectorInsert (Binary src val) idx));
 4628   effect(TEMP vtmp);
 4629   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4630   ins_encode %{
 4631     int vlen_enc = Assembler::AVX_256bit;
 4632     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4633     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4634     int log2epr = log2(elem_per_lane);
 4635 
 4636     assert(is_integral_type(elem_bt), "sanity");
 4637     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4638 
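          // Split the flat element index into a position within a 128-bit lane (x_idx)
          // and a lane number (y_idx). For example, with T_INT elements in a 256-bit
          // vector, elem_per_lane = 4 and log2epr = 2, so idx = 6 yields x_idx = 2,
          // y_idx = 1: element 2 of the upper 128-bit lane.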
 4639     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4640     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4641     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4642     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4643     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4644   %}
 4645   ins_pipe( pipe_slow );
 4646 %}
 4647 
 4648 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4649   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4650   match(Set dst (VectorInsert (Binary src val) idx));
 4651   effect(TEMP vtmp);
 4652   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4653   ins_encode %{
 4654     assert(UseAVX > 2, "sanity");
 4655 
 4656     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4657     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4658     int log2epr = log2(elem_per_lane);
 4659 
 4660     assert(is_integral_type(elem_bt), "");
 4661     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4662 
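          // Same index decomposition as insert32 above, except y_idx now selects one of
          // the four 128-bit lanes of the 512-bit vector.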
 4663     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4664     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4665     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4666     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4667     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4668   %}
 4669   ins_pipe( pipe_slow );
 4670 %}
 4671 
 4672 #ifdef _LP64
 4673 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4674   predicate(Matcher::vector_length(n) == 2);
 4675   match(Set dst (VectorInsert (Binary dst val) idx));
 4676   format %{ "vector_insert $dst,$val,$idx" %}
 4677   ins_encode %{
 4678     assert(UseSSE >= 4, "required");
 4679     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4680     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4681 
 4682     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4683   %}
 4684   ins_pipe( pipe_slow );
 4685 %}
 4686 
 4687 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4688   predicate(Matcher::vector_length(n) == 4);
 4689   match(Set dst (VectorInsert (Binary src val) idx));
 4690   effect(TEMP vtmp);
 4691   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4692   ins_encode %{
 4693     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4694     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4695 
 4696     uint x_idx = $idx$$constant & right_n_bits(1);
 4697     uint y_idx = ($idx$$constant >> 1) & 1;
 4698     int vlen_enc = Assembler::AVX_256bit;
 4699     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4700     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4701     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4702   %}
 4703   ins_pipe( pipe_slow );
 4704 %}
 4705 
 4706 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4707   predicate(Matcher::vector_length(n) == 8);
 4708   match(Set dst (VectorInsert (Binary src val) idx));
 4709   effect(TEMP vtmp);
 4710   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4711   ins_encode %{
 4712     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4713     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4714 
 4715     uint x_idx = $idx$$constant & right_n_bits(1);
 4716     uint y_idx = ($idx$$constant >> 1) & 3;
 4717     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4718     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4719     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4720   %}
 4721   ins_pipe( pipe_slow );
 4722 %}
 4723 #endif
 4724 
 4725 instruct insertF(vec dst, regF val, immU8 idx) %{
 4726   predicate(Matcher::vector_length(n) < 8);
 4727   match(Set dst (VectorInsert (Binary dst val) idx));
 4728   format %{ "vector_insert $dst,$val,$idx" %}
 4729   ins_encode %{
 4730     assert(UseSSE >= 4, "sanity");
 4731 
 4732     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4733     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4734 
 4735     uint x_idx = $idx$$constant & right_n_bits(2);
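          // x_idx << 4 places the target lane in bits 5:4 (the destination-element field)
          // of the insertps immediate; the source-element and zero-mask fields stay 0.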
 4736     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4737   %}
 4738   ins_pipe( pipe_slow );
 4739 %}
 4740 
 4741 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4742   predicate(Matcher::vector_length(n) >= 8);
 4743   match(Set dst (VectorInsert (Binary src val) idx));
 4744   effect(TEMP vtmp);
 4745   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4746   ins_encode %{
 4747     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4748     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4749 
 4750     int vlen = Matcher::vector_length(this);
 4751     uint x_idx = $idx$$constant & right_n_bits(2);
 4752     if (vlen == 8) {
 4753       uint y_idx = ($idx$$constant >> 2) & 1;
 4754       int vlen_enc = Assembler::AVX_256bit;
 4755       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4756       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4757       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4758     } else {
 4759       assert(vlen == 16, "sanity");
 4760       uint y_idx = ($idx$$constant >> 2) & 3;
 4761       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4762       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4763       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4764     }
 4765   %}
 4766   ins_pipe( pipe_slow );
 4767 %}
 4768 
 4769 #ifdef _LP64
 4770 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4771   predicate(Matcher::vector_length(n) == 2);
 4772   match(Set dst (VectorInsert (Binary dst val) idx));
 4773   effect(TEMP tmp);
 4774   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4775   ins_encode %{
 4776     assert(UseSSE >= 4, "sanity");
 4777     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4778     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4779 
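          // There is no direct XMM-to-XMM 64-bit element insert available here, so the
          // double's bit pattern is bounced through a GPR and placed with pinsrq.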
 4780     __ movq($tmp$$Register, $val$$XMMRegister);
 4781     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4782   %}
 4783   ins_pipe( pipe_slow );
 4784 %}
 4785 
 4786 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4787   predicate(Matcher::vector_length(n) == 4);
 4788   match(Set dst (VectorInsert (Binary src val) idx));
 4789   effect(TEMP vtmp, TEMP tmp);
 4790   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4791   ins_encode %{
 4792     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4793     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4794 
 4795     uint x_idx = $idx$$constant & right_n_bits(1);
 4796     uint y_idx = ($idx$$constant >> 1) & 1;
 4797     int vlen_enc = Assembler::AVX_256bit;
 4798     __ movq($tmp$$Register, $val$$XMMRegister);
 4799     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4800     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4801     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4802   %}
 4803   ins_pipe( pipe_slow );
 4804 %}
 4805 
 4806 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4807   predicate(Matcher::vector_length(n) == 8);
 4808   match(Set dst (VectorInsert (Binary src val) idx));
 4809   effect(TEMP tmp, TEMP vtmp);
 4810   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4811   ins_encode %{
 4812     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4813     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4814 
 4815     uint x_idx = $idx$$constant & right_n_bits(1);
 4816     uint y_idx = ($idx$$constant >> 1) & 3;
 4817     __ movq($tmp$$Register, $val$$XMMRegister);
 4818     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4819     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4820     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4821   %}
 4822   ins_pipe( pipe_slow );
 4823 %}
 4824 #endif
 4825 
 4826 // ====================REDUCTION ARITHMETIC=======================================
 4827 
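      // Reductions fold all lanes of the vector input (src2) into a scalar and combine
      // it with the scalar input (src1, or the accumulating dst for the FP forms); the
      // actual lane folding is done by the reduce* helpers in the macro assembler, using
      // the TEMP vector registers as scratch.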
 4828 // =======================Int Reduction==========================================
 4829 
 4830 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4831   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4832   match(Set dst (AddReductionVI src1 src2));
 4833   match(Set dst (MulReductionVI src1 src2));
 4834   match(Set dst (AndReductionV  src1 src2));
 4835   match(Set dst ( OrReductionV  src1 src2));
 4836   match(Set dst (XorReductionV  src1 src2));
 4837   match(Set dst (MinReductionV  src1 src2));
 4838   match(Set dst (MaxReductionV  src1 src2));
 4839   effect(TEMP vtmp1, TEMP vtmp2);
 4840   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4841   ins_encode %{
 4842     int opcode = this->ideal_Opcode();
 4843     int vlen = Matcher::vector_length(this, $src2);
 4844     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4845   %}
 4846   ins_pipe( pipe_slow );
 4847 %}
 4848 
 4849 // =======================Long Reduction==========================================
 4850 
 4851 #ifdef _LP64
 4852 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4853   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4854   match(Set dst (AddReductionVL src1 src2));
 4855   match(Set dst (MulReductionVL src1 src2));
 4856   match(Set dst (AndReductionV  src1 src2));
 4857   match(Set dst ( OrReductionV  src1 src2));
 4858   match(Set dst (XorReductionV  src1 src2));
 4859   match(Set dst (MinReductionV  src1 src2));
 4860   match(Set dst (MaxReductionV  src1 src2));
 4861   effect(TEMP vtmp1, TEMP vtmp2);
 4862   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4863   ins_encode %{
 4864     int opcode = this->ideal_Opcode();
 4865     int vlen = Matcher::vector_length(this, $src2);
 4866     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4867   %}
 4868   ins_pipe( pipe_slow );
 4869 %}
 4870 
 4871 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4872   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4873   match(Set dst (AddReductionVL src1 src2));
 4874   match(Set dst (MulReductionVL src1 src2));
 4875   match(Set dst (AndReductionV  src1 src2));
 4876   match(Set dst ( OrReductionV  src1 src2));
 4877   match(Set dst (XorReductionV  src1 src2));
 4878   match(Set dst (MinReductionV  src1 src2));
 4879   match(Set dst (MaxReductionV  src1 src2));
 4880   effect(TEMP vtmp1, TEMP vtmp2);
 4881   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4882   ins_encode %{
 4883     int opcode = this->ideal_Opcode();
 4884     int vlen = Matcher::vector_length(this, $src2);
 4885     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4886   %}
 4887   ins_pipe( pipe_slow );
 4888 %}
 4889 #endif // _LP64
 4890 
 4891 // =======================Float Reduction==========================================
 4892 
 4893 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4894   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4895   match(Set dst (AddReductionVF dst src));
 4896   match(Set dst (MulReductionVF dst src));
 4897   effect(TEMP dst, TEMP vtmp);
 4898   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4899   ins_encode %{
 4900     int opcode = this->ideal_Opcode();
 4901     int vlen = Matcher::vector_length(this, $src);
 4902     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4903   %}
 4904   ins_pipe( pipe_slow );
 4905 %}
 4906 
 4907 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4908   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4909   match(Set dst (AddReductionVF dst src));
 4910   match(Set dst (MulReductionVF dst src));
 4911   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4912   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4913   ins_encode %{
 4914     int opcode = this->ideal_Opcode();
 4915     int vlen = Matcher::vector_length(this, $src);
 4916     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4917   %}
 4918   ins_pipe( pipe_slow );
 4919 %}
 4920 
 4921 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4922   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4923   match(Set dst (AddReductionVF dst src));
 4924   match(Set dst (MulReductionVF dst src));
 4925   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4926   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4927   ins_encode %{
 4928     int opcode = this->ideal_Opcode();
 4929     int vlen = Matcher::vector_length(this, $src);
 4930     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4931   %}
 4932   ins_pipe( pipe_slow );
 4933 %}
 4934 
 4935 // =======================Double Reduction==========================================
 4936 
 4937 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4938   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4939   match(Set dst (AddReductionVD dst src));
 4940   match(Set dst (MulReductionVD dst src));
 4941   effect(TEMP dst, TEMP vtmp);
 4942   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4943   ins_encode %{
 4944     int opcode = this->ideal_Opcode();
 4945     int vlen = Matcher::vector_length(this, $src);
 4946     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4947   %}
 4948   ins_pipe( pipe_slow );
 4949 %}
 4950 
 4951 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4952   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4953   match(Set dst (AddReductionVD dst src));
 4954   match(Set dst (MulReductionVD dst src));
 4955   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4956   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4957   ins_encode %{
 4958     int opcode = this->ideal_Opcode();
 4959     int vlen = Matcher::vector_length(this, $src);
 4960     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4961   %}
 4962   ins_pipe( pipe_slow );
 4963 %}
 4964 
 4965 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4966   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4967   match(Set dst (AddReductionVD dst src));
 4968   match(Set dst (MulReductionVD dst src));
 4969   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4970   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4971   ins_encode %{
 4972     int opcode = this->ideal_Opcode();
 4973     int vlen = Matcher::vector_length(this, $src);
 4974     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4975   %}
 4976   ins_pipe( pipe_slow );
 4977 %}
 4978 
 4979 // =======================Byte Reduction==========================================
 4980 
 4981 #ifdef _LP64
 4982 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4983   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 4984   match(Set dst (AddReductionVI src1 src2));
 4985   match(Set dst (AndReductionV  src1 src2));
 4986   match(Set dst ( OrReductionV  src1 src2));
 4987   match(Set dst (XorReductionV  src1 src2));
 4988   match(Set dst (MinReductionV  src1 src2));
 4989   match(Set dst (MaxReductionV  src1 src2));
 4990   effect(TEMP vtmp1, TEMP vtmp2);
 4991   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4992   ins_encode %{
 4993     int opcode = this->ideal_Opcode();
 4994     int vlen = Matcher::vector_length(this, $src2);
 4995     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4996   %}
 4997   ins_pipe( pipe_slow );
 4998 %}
 4999 
 5000 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5001   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5002   match(Set dst (AddReductionVI src1 src2));
 5003   match(Set dst (AndReductionV  src1 src2));
 5004   match(Set dst ( OrReductionV  src1 src2));
 5005   match(Set dst (XorReductionV  src1 src2));
 5006   match(Set dst (MinReductionV  src1 src2));
 5007   match(Set dst (MaxReductionV  src1 src2));
 5008   effect(TEMP vtmp1, TEMP vtmp2);
 5009   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5010   ins_encode %{
 5011     int opcode = this->ideal_Opcode();
 5012     int vlen = Matcher::vector_length(this, $src2);
 5013     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5014   %}
 5015   ins_pipe( pipe_slow );
 5016 %}
 5017 #endif
 5018 
 5019 // =======================Short Reduction==========================================
 5020 
 5021 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5022   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5023   match(Set dst (AddReductionVI src1 src2));
 5024   match(Set dst (MulReductionVI src1 src2));
 5025   match(Set dst (AndReductionV  src1 src2));
 5026   match(Set dst ( OrReductionV  src1 src2));
 5027   match(Set dst (XorReductionV  src1 src2));
 5028   match(Set dst (MinReductionV  src1 src2));
 5029   match(Set dst (MaxReductionV  src1 src2));
 5030   effect(TEMP vtmp1, TEMP vtmp2);
 5031   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5032   ins_encode %{
 5033     int opcode = this->ideal_Opcode();
 5034     int vlen = Matcher::vector_length(this, $src2);
 5035     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5036   %}
 5037   ins_pipe( pipe_slow );
 5038 %}
 5039 
 5040 // =======================Mul Reduction==========================================
 5041 
 5042 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5043   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5044             Matcher::vector_length(n->in(2)) <= 32); // src2
 5045   match(Set dst (MulReductionVI src1 src2));
 5046   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5047   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5048   ins_encode %{
 5049     int opcode = this->ideal_Opcode();
 5050     int vlen = Matcher::vector_length(this, $src2);
 5051     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5052   %}
 5053   ins_pipe( pipe_slow );
 5054 %}
 5055 
 5056 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5057   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5058             Matcher::vector_length(n->in(2)) == 64); // src2
 5059   match(Set dst (MulReductionVI src1 src2));
 5060   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5061   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5062   ins_encode %{
 5063     int opcode = this->ideal_Opcode();
 5064     int vlen = Matcher::vector_length(this, $src2);
 5065     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5066   %}
 5067   ins_pipe( pipe_slow );
 5068 %}
 5069 
 5070 //--------------------Min/Max Float Reduction --------------------
 5071 // Float Min Reduction
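      // The first two forms require the scalar input to be the identity value
      // (+Inf for min, -Inf for max), so only the vector lanes need to be folded;
      // the _av forms further down handle the general case where the accumulating
      // scalar participates in the min/max as well.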
 5072 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5073                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5074   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5075             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5076              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5077             Matcher::vector_length(n->in(2)) == 2);
 5078   match(Set dst (MinReductionV src1 src2));
 5079   match(Set dst (MaxReductionV src1 src2));
 5080   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5081   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5082   ins_encode %{
 5083     assert(UseAVX > 0, "sanity");
 5084 
 5085     int opcode = this->ideal_Opcode();
 5086     int vlen = Matcher::vector_length(this, $src2);
 5087     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5088                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5089   %}
 5090   ins_pipe( pipe_slow );
 5091 %}
 5092 
 5093 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5094                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5095   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5096             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5097              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5098             Matcher::vector_length(n->in(2)) >= 4);
 5099   match(Set dst (MinReductionV src1 src2));
 5100   match(Set dst (MaxReductionV src1 src2));
 5101   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5102   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5103   ins_encode %{
 5104     assert(UseAVX > 0, "sanity");
 5105 
 5106     int opcode = this->ideal_Opcode();
 5107     int vlen = Matcher::vector_length(this, $src2);
 5108     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5109                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5110   %}
 5111   ins_pipe( pipe_slow );
 5112 %}
 5113 
 5114 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5115                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5116   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5117             Matcher::vector_length(n->in(2)) == 2);
 5118   match(Set dst (MinReductionV dst src));
 5119   match(Set dst (MaxReductionV dst src));
 5120   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5121   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5122   ins_encode %{
 5123     assert(UseAVX > 0, "sanity");
 5124 
 5125     int opcode = this->ideal_Opcode();
 5126     int vlen = Matcher::vector_length(this, $src);
 5127     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5128                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5129   %}
 5130   ins_pipe( pipe_slow );
 5131 %}
 5132 
 5133 
 5134 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5135                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5136   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5137             Matcher::vector_length(n->in(2)) >= 4);
 5138   match(Set dst (MinReductionV dst src));
 5139   match(Set dst (MaxReductionV dst src));
 5140   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5141   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5142   ins_encode %{
 5143     assert(UseAVX > 0, "sanity");
 5144 
 5145     int opcode = this->ideal_Opcode();
 5146     int vlen = Matcher::vector_length(this, $src);
 5147     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5148                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5149   %}
 5150   ins_pipe( pipe_slow );
 5151 %}
 5152 
 5153 
//--------------------Min/Max Double Reduction --------------------
 5155 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5156                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5157                             rFlagsReg cr) %{
 5158   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5159             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5160              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5161             Matcher::vector_length(n->in(2)) == 2);
 5162   match(Set dst (MinReductionV src1 src2));
 5163   match(Set dst (MaxReductionV src1 src2));
 5164   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5165   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5166   ins_encode %{
 5167     assert(UseAVX > 0, "sanity");
 5168 
 5169     int opcode = this->ideal_Opcode();
 5170     int vlen = Matcher::vector_length(this, $src2);
 5171     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5172                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5173   %}
 5174   ins_pipe( pipe_slow );
 5175 %}
 5176 
 5177 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5178                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5179                            rFlagsReg cr) %{
 5180   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5181             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5182              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5183             Matcher::vector_length(n->in(2)) >= 4);
 5184   match(Set dst (MinReductionV src1 src2));
 5185   match(Set dst (MaxReductionV src1 src2));
 5186   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5187   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5188   ins_encode %{
 5189     assert(UseAVX > 0, "sanity");
 5190 
 5191     int opcode = this->ideal_Opcode();
 5192     int vlen = Matcher::vector_length(this, $src2);
 5193     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5194                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5195   %}
 5196   ins_pipe( pipe_slow );
 5197 %}
 5198 
 5199 
 5200 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5201                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5202                                rFlagsReg cr) %{
 5203   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5204             Matcher::vector_length(n->in(2)) == 2);
 5205   match(Set dst (MinReductionV dst src));
 5206   match(Set dst (MaxReductionV dst src));
 5207   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5208   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5209   ins_encode %{
 5210     assert(UseAVX > 0, "sanity");
 5211 
 5212     int opcode = this->ideal_Opcode();
 5213     int vlen = Matcher::vector_length(this, $src);
 5214     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5215                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5216   %}
 5217   ins_pipe( pipe_slow );
 5218 %}
 5219 
 5220 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5221                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5222                               rFlagsReg cr) %{
 5223   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5224             Matcher::vector_length(n->in(2)) >= 4);
 5225   match(Set dst (MinReductionV dst src));
 5226   match(Set dst (MaxReductionV dst src));
 5227   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5228   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5229   ins_encode %{
 5230     assert(UseAVX > 0, "sanity");
 5231 
 5232     int opcode = this->ideal_Opcode();
 5233     int vlen = Matcher::vector_length(this, $src);
 5234     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5235                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5236   %}
 5237   ins_pipe( pipe_slow );
 5238 %}
 5239 
 5240 // ====================VECTOR ARITHMETIC=======================================
 5241 
 5242 // --------------------------------- ADD --------------------------------------
 5243 
 5244 // Bytes vector add
 5245 instruct vaddB(vec dst, vec src) %{
 5246   predicate(UseAVX == 0);
 5247   match(Set dst (AddVB dst src));
 5248   format %{ "paddb   $dst,$src\t! add packedB" %}
 5249   ins_encode %{
 5250     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5251   %}
 5252   ins_pipe( pipe_slow );
 5253 %}
 5254 
 5255 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5256   predicate(UseAVX > 0);
 5257   match(Set dst (AddVB src1 src2));
 5258   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5259   ins_encode %{
 5260     int vlen_enc = vector_length_encoding(this);
 5261     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5262   %}
 5263   ins_pipe( pipe_slow );
 5264 %}
 5265 
 5266 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5267   predicate((UseAVX > 0) &&
 5268             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5269   match(Set dst (AddVB src (LoadVector mem)));
 5270   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5271   ins_encode %{
 5272     int vlen_enc = vector_length_encoding(this);
 5273     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5274   %}
 5275   ins_pipe( pipe_slow );
 5276 %}
 5277 
 5278 // Shorts/Chars vector add
 5279 instruct vaddS(vec dst, vec src) %{
 5280   predicate(UseAVX == 0);
 5281   match(Set dst (AddVS dst src));
 5282   format %{ "paddw   $dst,$src\t! add packedS" %}
 5283   ins_encode %{
 5284     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5285   %}
 5286   ins_pipe( pipe_slow );
 5287 %}
 5288 
 5289 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5290   predicate(UseAVX > 0);
 5291   match(Set dst (AddVS src1 src2));
 5292   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5293   ins_encode %{
 5294     int vlen_enc = vector_length_encoding(this);
 5295     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5296   %}
 5297   ins_pipe( pipe_slow );
 5298 %}
 5299 
 5300 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5301   predicate((UseAVX > 0) &&
 5302             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5303   match(Set dst (AddVS src (LoadVector mem)));
 5304   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5305   ins_encode %{
 5306     int vlen_enc = vector_length_encoding(this);
 5307     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5308   %}
 5309   ins_pipe( pipe_slow );
 5310 %}
 5311 
 5312 // Integers vector add
 5313 instruct vaddI(vec dst, vec src) %{
 5314   predicate(UseAVX == 0);
 5315   match(Set dst (AddVI dst src));
 5316   format %{ "paddd   $dst,$src\t! add packedI" %}
 5317   ins_encode %{
 5318     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5319   %}
 5320   ins_pipe( pipe_slow );
 5321 %}
 5322 
 5323 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5324   predicate(UseAVX > 0);
 5325   match(Set dst (AddVI src1 src2));
 5326   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5327   ins_encode %{
 5328     int vlen_enc = vector_length_encoding(this);
 5329     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5330   %}
 5331   ins_pipe( pipe_slow );
 5332 %}
 5333 
 5334 
 5335 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5336   predicate((UseAVX > 0) &&
 5337             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5338   match(Set dst (AddVI src (LoadVector mem)));
 5339   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5340   ins_encode %{
 5341     int vlen_enc = vector_length_encoding(this);
 5342     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5343   %}
 5344   ins_pipe( pipe_slow );
 5345 %}
 5346 
 5347 // Longs vector add
 5348 instruct vaddL(vec dst, vec src) %{
 5349   predicate(UseAVX == 0);
 5350   match(Set dst (AddVL dst src));
 5351   format %{ "paddq   $dst,$src\t! add packedL" %}
 5352   ins_encode %{
 5353     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5354   %}
 5355   ins_pipe( pipe_slow );
 5356 %}
 5357 
 5358 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5359   predicate(UseAVX > 0);
 5360   match(Set dst (AddVL src1 src2));
 5361   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5362   ins_encode %{
 5363     int vlen_enc = vector_length_encoding(this);
 5364     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5365   %}
 5366   ins_pipe( pipe_slow );
 5367 %}
 5368 
 5369 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5370   predicate((UseAVX > 0) &&
 5371             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5372   match(Set dst (AddVL src (LoadVector mem)));
 5373   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5374   ins_encode %{
 5375     int vlen_enc = vector_length_encoding(this);
 5376     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5377   %}
 5378   ins_pipe( pipe_slow );
 5379 %}
 5380 
 5381 // Floats vector add
 5382 instruct vaddF(vec dst, vec src) %{
 5383   predicate(UseAVX == 0);
 5384   match(Set dst (AddVF dst src));
 5385   format %{ "addps   $dst,$src\t! add packedF" %}
 5386   ins_encode %{
 5387     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5388   %}
 5389   ins_pipe( pipe_slow );
 5390 %}
 5391 
 5392 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5393   predicate(UseAVX > 0);
 5394   match(Set dst (AddVF src1 src2));
 5395   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5396   ins_encode %{
 5397     int vlen_enc = vector_length_encoding(this);
 5398     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5399   %}
 5400   ins_pipe( pipe_slow );
 5401 %}
 5402 
 5403 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5404   predicate((UseAVX > 0) &&
 5405             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5406   match(Set dst (AddVF src (LoadVector mem)));
 5407   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5408   ins_encode %{
 5409     int vlen_enc = vector_length_encoding(this);
 5410     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5411   %}
 5412   ins_pipe( pipe_slow );
 5413 %}
 5414 
 5415 // Doubles vector add
 5416 instruct vaddD(vec dst, vec src) %{
 5417   predicate(UseAVX == 0);
 5418   match(Set dst (AddVD dst src));
 5419   format %{ "addpd   $dst,$src\t! add packedD" %}
 5420   ins_encode %{
 5421     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5422   %}
 5423   ins_pipe( pipe_slow );
 5424 %}
 5425 
 5426 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5427   predicate(UseAVX > 0);
 5428   match(Set dst (AddVD src1 src2));
 5429   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5430   ins_encode %{
 5431     int vlen_enc = vector_length_encoding(this);
 5432     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5433   %}
 5434   ins_pipe( pipe_slow );
 5435 %}
 5436 
 5437 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5438   predicate((UseAVX > 0) &&
 5439             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5440   match(Set dst (AddVD src (LoadVector mem)));
 5441   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5442   ins_encode %{
 5443     int vlen_enc = vector_length_encoding(this);
 5444     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5445   %}
 5446   ins_pipe( pipe_slow );
 5447 %}
 5448 
 5449 // --------------------------------- SUB --------------------------------------
 5450 
 5451 // Bytes vector sub
 5452 instruct vsubB(vec dst, vec src) %{
 5453   predicate(UseAVX == 0);
 5454   match(Set dst (SubVB dst src));
 5455   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5456   ins_encode %{
 5457     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5458   %}
 5459   ins_pipe( pipe_slow );
 5460 %}
 5461 
 5462 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5463   predicate(UseAVX > 0);
 5464   match(Set dst (SubVB src1 src2));
 5465   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5466   ins_encode %{
 5467     int vlen_enc = vector_length_encoding(this);
 5468     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5469   %}
 5470   ins_pipe( pipe_slow );
 5471 %}
 5472 
 5473 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5474   predicate((UseAVX > 0) &&
 5475             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5476   match(Set dst (SubVB src (LoadVector mem)));
 5477   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5478   ins_encode %{
 5479     int vlen_enc = vector_length_encoding(this);
 5480     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5481   %}
 5482   ins_pipe( pipe_slow );
 5483 %}
 5484 
 5485 // Shorts/Chars vector sub
 5486 instruct vsubS(vec dst, vec src) %{
 5487   predicate(UseAVX == 0);
 5488   match(Set dst (SubVS dst src));
 5489   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5490   ins_encode %{
 5491     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5492   %}
 5493   ins_pipe( pipe_slow );
 5494 %}
 5495 
 5496 
 5497 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5498   predicate(UseAVX > 0);
 5499   match(Set dst (SubVS src1 src2));
 5500   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5501   ins_encode %{
 5502     int vlen_enc = vector_length_encoding(this);
 5503     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5509   predicate((UseAVX > 0) &&
 5510             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5511   match(Set dst (SubVS src (LoadVector mem)));
 5512   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5513   ins_encode %{
 5514     int vlen_enc = vector_length_encoding(this);
 5515     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5516   %}
 5517   ins_pipe( pipe_slow );
 5518 %}
 5519 
 5520 // Integers vector sub
 5521 instruct vsubI(vec dst, vec src) %{
 5522   predicate(UseAVX == 0);
 5523   match(Set dst (SubVI dst src));
 5524   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5525   ins_encode %{
 5526     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5527   %}
 5528   ins_pipe( pipe_slow );
 5529 %}
 5530 
 5531 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5532   predicate(UseAVX > 0);
 5533   match(Set dst (SubVI src1 src2));
 5534   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5535   ins_encode %{
 5536     int vlen_enc = vector_length_encoding(this);
 5537     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5543   predicate((UseAVX > 0) &&
 5544             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5545   match(Set dst (SubVI src (LoadVector mem)));
 5546   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5547   ins_encode %{
 5548     int vlen_enc = vector_length_encoding(this);
 5549     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5550   %}
 5551   ins_pipe( pipe_slow );
 5552 %}
 5553 
 5554 // Longs vector sub
 5555 instruct vsubL(vec dst, vec src) %{
 5556   predicate(UseAVX == 0);
 5557   match(Set dst (SubVL dst src));
 5558   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5559   ins_encode %{
 5560     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5561   %}
 5562   ins_pipe( pipe_slow );
 5563 %}
 5564 
 5565 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5566   predicate(UseAVX > 0);
 5567   match(Set dst (SubVL src1 src2));
 5568   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5569   ins_encode %{
 5570     int vlen_enc = vector_length_encoding(this);
 5571     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5572   %}
 5573   ins_pipe( pipe_slow );
 5574 %}
 5575 
 5576 
 5577 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5578   predicate((UseAVX > 0) &&
 5579             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5580   match(Set dst (SubVL src (LoadVector mem)));
 5581   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5582   ins_encode %{
 5583     int vlen_enc = vector_length_encoding(this);
 5584     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5585   %}
 5586   ins_pipe( pipe_slow );
 5587 %}
 5588 
 5589 // Floats vector sub
 5590 instruct vsubF(vec dst, vec src) %{
 5591   predicate(UseAVX == 0);
 5592   match(Set dst (SubVF dst src));
 5593   format %{ "subps   $dst,$src\t! sub packedF" %}
 5594   ins_encode %{
 5595     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5596   %}
 5597   ins_pipe( pipe_slow );
 5598 %}
 5599 
 5600 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5601   predicate(UseAVX > 0);
 5602   match(Set dst (SubVF src1 src2));
 5603   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5604   ins_encode %{
 5605     int vlen_enc = vector_length_encoding(this);
 5606     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5612   predicate((UseAVX > 0) &&
 5613             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5614   match(Set dst (SubVF src (LoadVector mem)));
 5615   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5616   ins_encode %{
 5617     int vlen_enc = vector_length_encoding(this);
 5618     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5619   %}
 5620   ins_pipe( pipe_slow );
 5621 %}
 5622 
 5623 // Doubles vector sub
 5624 instruct vsubD(vec dst, vec src) %{
 5625   predicate(UseAVX == 0);
 5626   match(Set dst (SubVD dst src));
 5627   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5628   ins_encode %{
 5629     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5630   %}
 5631   ins_pipe( pipe_slow );
 5632 %}
 5633 
 5634 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5635   predicate(UseAVX > 0);
 5636   match(Set dst (SubVD src1 src2));
 5637   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5638   ins_encode %{
 5639     int vlen_enc = vector_length_encoding(this);
 5640     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5646   predicate((UseAVX > 0) &&
 5647             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5648   match(Set dst (SubVD src (LoadVector mem)));
 5649   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5650   ins_encode %{
 5651     int vlen_enc = vector_length_encoding(this);
 5652     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5653   %}
 5654   ins_pipe( pipe_slow );
 5655 %}
 5656 
 5657 // --------------------------------- MUL --------------------------------------
 5658 
 5659 // Byte vector mul
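// x86 has no byte-wide vector multiply, so MulVB is emulated with 16-bit
// multiplies. Small vectors (<= 8 bytes) are widened to words, multiplied
// with pmullw, and the low byte of each product is packed back. Wider
// vectors use an odd/even split; per 16-bit lane w1, w2 this computes
// (an illustrative sketch of the intended arithmetic, not the emitted code):
//
//   odd  = (uint16_t)(((w1 >> 8) * (w2 >> 8)) << 8); // product of the odd-index bytes, in the high byte
//   even = (uint16_t)((w1 * w2) & 0x00FF);           // low byte of w1*w2 == product of the even-index bytes mod 256
//   out  = odd | even;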
 5660 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5661   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5662   match(Set dst (MulVB src1 src2));
 5663   effect(TEMP dst, TEMP xtmp);
 5664   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5665   ins_encode %{
 5666     assert(UseSSE > 3, "required");
 5667     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5668     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5669     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5670     __ psllw($dst$$XMMRegister, 8);
 5671     __ psrlw($dst$$XMMRegister, 8);
 5672     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5673   %}
 5674   ins_pipe( pipe_slow );
 5675 %}
 5676 
 5677 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5678   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5679   match(Set dst (MulVB src1 src2));
 5680   effect(TEMP dst, TEMP xtmp);
 5681   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5682   ins_encode %{
 5683     assert(UseSSE > 3, "required");
 5684     // Odd-index elements
 5685     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5686     __ psrlw($dst$$XMMRegister, 8);
 5687     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5688     __ psrlw($xtmp$$XMMRegister, 8);
 5689     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5690     __ psllw($dst$$XMMRegister, 8);
 5691     // Even-index elements
 5692     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5693     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5694     __ psllw($xtmp$$XMMRegister, 8);
 5695     __ psrlw($xtmp$$XMMRegister, 8);
 5696     // Combine
 5697     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5698   %}
 5699   ins_pipe( pipe_slow );
 5700 %}
 5701 
 5702 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5703   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5704   match(Set dst (MulVB src1 src2));
 5705   effect(TEMP xtmp1, TEMP xtmp2);
 5706   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5707   ins_encode %{
 5708     int vlen_enc = vector_length_encoding(this);
 5709     // Odd-index elements
 5710     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5711     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5712     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5713     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5714     // Even-index elements
 5715     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5716     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5717     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5718     // Combine
 5719     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5720   %}
 5721   ins_pipe( pipe_slow );
 5722 %}
 5723 
 5724 // Shorts/Chars vector mul
 5725 instruct vmulS(vec dst, vec src) %{
 5726   predicate(UseAVX == 0);
 5727   match(Set dst (MulVS dst src));
 5728   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5729   ins_encode %{
 5730     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5731   %}
 5732   ins_pipe( pipe_slow );
 5733 %}
 5734 
 5735 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5736   predicate(UseAVX > 0);
 5737   match(Set dst (MulVS src1 src2));
 5738   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5739   ins_encode %{
 5740     int vlen_enc = vector_length_encoding(this);
 5741     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5742   %}
 5743   ins_pipe( pipe_slow );
 5744 %}
 5745 
 5746 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5747   predicate((UseAVX > 0) &&
 5748             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5749   match(Set dst (MulVS src (LoadVector mem)));
 5750   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5751   ins_encode %{
 5752     int vlen_enc = vector_length_encoding(this);
 5753     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5754   %}
 5755   ins_pipe( pipe_slow );
 5756 %}
 5757 
 5758 // Integers vector mul
 5759 instruct vmulI(vec dst, vec src) %{
 5760   predicate(UseAVX == 0);
 5761   match(Set dst (MulVI dst src));
 5762   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5763   ins_encode %{
 5764     assert(UseSSE > 3, "required");
 5765     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5766   %}
 5767   ins_pipe( pipe_slow );
 5768 %}
 5769 
 5770 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5771   predicate(UseAVX > 0);
 5772   match(Set dst (MulVI src1 src2));
 5773   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5774   ins_encode %{
 5775     int vlen_enc = vector_length_encoding(this);
 5776     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5777   %}
 5778   ins_pipe( pipe_slow );
 5779 %}
 5780 
 5781 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5782   predicate((UseAVX > 0) &&
 5783             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5784   match(Set dst (MulVI src (LoadVector mem)));
 5785   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5786   ins_encode %{
 5787     int vlen_enc = vector_length_encoding(this);
 5788     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5789   %}
 5790   ins_pipe( pipe_slow );
 5791 %}
 5792 
 5793 // Longs vector mul
 5794 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5795   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5796              VM_Version::supports_avx512dq()) ||
 5797             VM_Version::supports_avx512vldq());
 5798   match(Set dst (MulVL src1 src2));
 5799   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5800   ins_encode %{
 5801     assert(UseAVX > 2, "required");
 5802     int vlen_enc = vector_length_encoding(this);
 5803     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5804   %}
 5805   ins_pipe( pipe_slow );
 5806 %}
 5807 
 5808 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5809   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5810              VM_Version::supports_avx512dq()) ||
 5811             (Matcher::vector_length_in_bytes(n) > 8 &&
 5812              VM_Version::supports_avx512vldq()));
 5813   match(Set dst (MulVL src (LoadVector mem)));
 5814   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5815   ins_encode %{
 5816     assert(UseAVX > 2, "required");
 5817     int vlen_enc = vector_length_encoding(this);
 5818     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5819   %}
 5820   ins_pipe( pipe_slow );
 5821 %}
 5822 
 5823 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5824   predicate(UseAVX == 0);
 5825   match(Set dst (MulVL src1 src2));
 5826   effect(TEMP dst, TEMP xtmp);
 5827   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5828   ins_encode %{
 5829     assert(VM_Version::supports_sse4_1(), "required");
    // Get the lo-hi cross products; only their lower 32 bits are of interest
 5831     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5832     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5833     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5834     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5835     __ psllq($dst$$XMMRegister, 32);
 5836     // Get the lo-lo products
 5837     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5838     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5839     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5840   %}
 5841   ins_pipe( pipe_slow );
 5842 %}
 5843 
 5844 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5845   predicate(UseAVX > 0 &&
 5846             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5847               !VM_Version::supports_avx512dq()) ||
 5848              (Matcher::vector_length_in_bytes(n) < 64 &&
 5849               !VM_Version::supports_avx512vldq())));
 5850   match(Set dst (MulVL src1 src2));
 5851   effect(TEMP xtmp1, TEMP xtmp2);
 5852   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5853   ins_encode %{
 5854     int vlen_enc = vector_length_encoding(this);
    // Get the lo-hi cross products; only their lower 32 bits are of interest
 5856     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5857     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5858     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5859     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5860     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5861     // Get the lo-lo products
 5862     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5863     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5864   %}
 5865   ins_pipe( pipe_slow );
 5866 %}
 5867 
 5868 // Floats vector mul
 5869 instruct vmulF(vec dst, vec src) %{
 5870   predicate(UseAVX == 0);
 5871   match(Set dst (MulVF dst src));
 5872   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5873   ins_encode %{
 5874     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5875   %}
 5876   ins_pipe( pipe_slow );
 5877 %}
 5878 
 5879 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5880   predicate(UseAVX > 0);
 5881   match(Set dst (MulVF src1 src2));
 5882   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5883   ins_encode %{
 5884     int vlen_enc = vector_length_encoding(this);
 5885     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5886   %}
 5887   ins_pipe( pipe_slow );
 5888 %}
 5889 
 5890 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5891   predicate((UseAVX > 0) &&
 5892             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5893   match(Set dst (MulVF src (LoadVector mem)));
 5894   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5895   ins_encode %{
 5896     int vlen_enc = vector_length_encoding(this);
 5897     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5898   %}
 5899   ins_pipe( pipe_slow );
 5900 %}
 5901 
 5902 // Doubles vector mul
 5903 instruct vmulD(vec dst, vec src) %{
 5904   predicate(UseAVX == 0);
 5905   match(Set dst (MulVD dst src));
 5906   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5907   ins_encode %{
 5908     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5909   %}
 5910   ins_pipe( pipe_slow );
 5911 %}
 5912 
 5913 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5914   predicate(UseAVX > 0);
 5915   match(Set dst (MulVD src1 src2));
 5916   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5917   ins_encode %{
 5918     int vlen_enc = vector_length_encoding(this);
 5919     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5920   %}
 5921   ins_pipe( pipe_slow );
 5922 %}
 5923 
 5924 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5925   predicate((UseAVX > 0) &&
 5926             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5927   match(Set dst (MulVD src (LoadVector mem)));
 5928   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5929   ins_encode %{
 5930     int vlen_enc = vector_length_encoding(this);
 5931     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5932   %}
 5933   ins_pipe( pipe_slow );
 5934 %}
 5935 
 5936 // --------------------------------- DIV --------------------------------------
 5937 
 5938 // Floats vector div
 5939 instruct vdivF(vec dst, vec src) %{
 5940   predicate(UseAVX == 0);
 5941   match(Set dst (DivVF dst src));
 5942   format %{ "divps   $dst,$src\t! div packedF" %}
 5943   ins_encode %{
 5944     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5945   %}
 5946   ins_pipe( pipe_slow );
 5947 %}
 5948 
 5949 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5950   predicate(UseAVX > 0);
 5951   match(Set dst (DivVF src1 src2));
 5952   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5953   ins_encode %{
 5954     int vlen_enc = vector_length_encoding(this);
 5955     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5956   %}
 5957   ins_pipe( pipe_slow );
 5958 %}
 5959 
 5960 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 5961   predicate((UseAVX > 0) &&
 5962             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5963   match(Set dst (DivVF src (LoadVector mem)));
 5964   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 5965   ins_encode %{
 5966     int vlen_enc = vector_length_encoding(this);
 5967     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5968   %}
 5969   ins_pipe( pipe_slow );
 5970 %}
 5971 
 5972 // Doubles vector div
 5973 instruct vdivD(vec dst, vec src) %{
 5974   predicate(UseAVX == 0);
 5975   match(Set dst (DivVD dst src));
 5976   format %{ "divpd   $dst,$src\t! div packedD" %}
 5977   ins_encode %{
 5978     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 5979   %}
 5980   ins_pipe( pipe_slow );
 5981 %}
 5982 
 5983 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 5984   predicate(UseAVX > 0);
 5985   match(Set dst (DivVD src1 src2));
 5986   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 5987   ins_encode %{
 5988     int vlen_enc = vector_length_encoding(this);
 5989     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5990   %}
 5991   ins_pipe( pipe_slow );
 5992 %}
 5993 
 5994 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 5995   predicate((UseAVX > 0) &&
 5996             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5997   match(Set dst (DivVD src (LoadVector mem)));
 5998   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 5999   ins_encode %{
 6000     int vlen_enc = vector_length_encoding(this);
 6001     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6002   %}
 6003   ins_pipe( pipe_slow );
 6004 %}
 6005 
 6006 // ------------------------------ MinMax ---------------------------------------
 6007 
 6008 // Byte, Short, Int vector Min/Max
 6009 instruct minmax_reg_sse(vec dst, vec src) %{
 6010   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6011             UseAVX == 0);
 6012   match(Set dst (MinV dst src));
 6013   match(Set dst (MaxV dst src));
 6014   format %{ "vector_minmax  $dst,$src\t!  " %}
 6015   ins_encode %{
 6016     assert(UseSSE >= 4, "required");
 6017 
 6018     int opcode = this->ideal_Opcode();
 6019     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6020     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6021   %}
 6022   ins_pipe( pipe_slow );
 6023 %}
 6024 
 6025 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6026   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6027             UseAVX > 0);
 6028   match(Set dst (MinV src1 src2));
 6029   match(Set dst (MaxV src1 src2));
 6030   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6031   ins_encode %{
 6032     int opcode = this->ideal_Opcode();
 6033     int vlen_enc = vector_length_encoding(this);
 6034     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6035 
 6036     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6037   %}
 6038   ins_pipe( pipe_slow );
 6039 %}
 6040 
 6041 // Long vector Min/Max
 6042 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6043   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6044             UseAVX == 0);
 6045   match(Set dst (MinV dst src));
 6046   match(Set dst (MaxV src dst));
 6047   effect(TEMP dst, TEMP tmp);
 6048   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6049   ins_encode %{
 6050     assert(UseSSE >= 4, "required");
 6051 
 6052     int opcode = this->ideal_Opcode();
 6053     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6054     assert(elem_bt == T_LONG, "sanity");
 6055 
 6056     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6057   %}
 6058   ins_pipe( pipe_slow );
 6059 %}
 6060 
 6061 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6062   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6063             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6064   match(Set dst (MinV src1 src2));
 6065   match(Set dst (MaxV src1 src2));
 6066   effect(TEMP dst);
 6067   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6068   ins_encode %{
 6069     int vlen_enc = vector_length_encoding(this);
 6070     int opcode = this->ideal_Opcode();
 6071     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6072     assert(elem_bt == T_LONG, "sanity");
 6073 
 6074     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6075   %}
 6076   ins_pipe( pipe_slow );
 6077 %}
 6078 
 6079 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6080   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6081             Matcher::vector_element_basic_type(n) == T_LONG);
 6082   match(Set dst (MinV src1 src2));
 6083   match(Set dst (MaxV src1 src2));
 6084   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
 6085   ins_encode %{
 6086     assert(UseAVX > 2, "required");
 6087 
 6088     int vlen_enc = vector_length_encoding(this);
 6089     int opcode = this->ideal_Opcode();
 6090     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6091     assert(elem_bt == T_LONG, "sanity");
 6092 
 6093     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6094   %}
 6095   ins_pipe( pipe_slow );
 6096 %}
 6097 
 6098 // Float/Double vector Min/Max
 6099 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6100   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6101             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6102             UseAVX > 0);
 6103   match(Set dst (MinV a b));
 6104   match(Set dst (MaxV a b));
 6105   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6106   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6107   ins_encode %{
 6108     assert(UseAVX > 0, "required");
 6109 
 6110     int opcode = this->ideal_Opcode();
 6111     int vlen_enc = vector_length_encoding(this);
 6112     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6113 
 6114     __ vminmax_fp(opcode, elem_bt,
 6115                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6116                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6117   %}
 6118   ins_pipe( pipe_slow );
 6119 %}
 6120 
 6121 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6122   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6123             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6124   match(Set dst (MinV a b));
 6125   match(Set dst (MaxV a b));
 6126   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6127   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6128   ins_encode %{
 6129     assert(UseAVX > 2, "required");
 6130 
 6131     int opcode = this->ideal_Opcode();
 6132     int vlen_enc = vector_length_encoding(this);
 6133     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6134 
 6135     __ evminmax_fp(opcode, elem_bt,
 6136                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6137                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6138   %}
 6139   ins_pipe( pipe_slow );
 6140 %}
 6141 
 6142 // --------------------------------- Signum/CopySign ---------------------------
 6143 
 6144 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6145   match(Set dst (SignumF dst (Binary zero one)));
 6146   effect(KILL cr);
 6147   format %{ "signumF $dst, $dst" %}
 6148   ins_encode %{
 6149     int opcode = this->ideal_Opcode();
 6150     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6151   %}
 6152   ins_pipe( pipe_slow );
 6153 %}
 6154 
 6155 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6156   match(Set dst (SignumD dst (Binary zero one)));
 6157   effect(KILL cr);
 6158   format %{ "signumD $dst, $dst" %}
 6159   ins_encode %{
 6160     int opcode = this->ideal_Opcode();
 6161     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6162   %}
 6163   ins_pipe( pipe_slow );
 6164 %}
 6165 
 6166 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6167   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6168   match(Set dst (SignumVF src (Binary zero one)));
 6169   match(Set dst (SignumVD src (Binary zero one)));
 6170   effect(TEMP dst, TEMP xtmp1);
 6171   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6172   ins_encode %{
 6173     int opcode = this->ideal_Opcode();
 6174     int vec_enc = vector_length_encoding(this);
 6175     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6176                          $xtmp1$$XMMRegister, vec_enc);
 6177   %}
 6178   ins_pipe( pipe_slow );
 6179 %}
 6180 
 6181 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6182   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6183   match(Set dst (SignumVF src (Binary zero one)));
 6184   match(Set dst (SignumVD src (Binary zero one)));
 6185   effect(TEMP dst, TEMP ktmp1);
 6186   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6187   ins_encode %{
 6188     int opcode = this->ideal_Opcode();
 6189     int vec_enc = vector_length_encoding(this);
 6190     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6191                           $ktmp1$$KRegister, vec_enc);
 6192   %}
 6193   ins_pipe( pipe_slow );
 6194 %}
 6195 
 6196 // ---------------------------------------
 6197 // For copySign use 0xE4 as writemask for vpternlog
 6198 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6199 // C (xmm2) is set to 0x7FFFFFFF
 6200 // Wherever xmm2 is 0, we want to pick from B (sign)
 6201 // Wherever xmm2 is 1, we want to pick from A (src)
 6202 //
 6203 // A B C Result
 6204 // 0 0 0 0
 6205 // 0 0 1 0
 6206 // 0 1 0 1
 6207 // 0 1 1 0
 6208 // 1 0 0 0
 6209 // 1 0 1 1
 6210 // 1 1 0 1
 6211 // 1 1 1 1
 6212 //
// Result read from the high bit to the low bit is 0b11100100 = 0xE4
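// Equivalently, per element: result = (A & C) | (B & ~C), i.e. the magnitude
// bits come from A (dst) and the sign bit comes from B (src).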
 6214 // ---------------------------------------
 6215 
 6216 #ifdef _LP64
 6217 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6218   match(Set dst (CopySignF dst src));
 6219   effect(TEMP tmp1, TEMP tmp2);
 6220   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6221   ins_encode %{
 6222     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6223     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6224     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6225   %}
 6226   ins_pipe( pipe_slow );
 6227 %}
 6228 
 6229 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6230   match(Set dst (CopySignD dst (Binary src zero)));
 6231   ins_cost(100);
 6232   effect(TEMP tmp1, TEMP tmp2);
 6233   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6234   ins_encode %{
 6235     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6236     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6237     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6238   %}
 6239   ins_pipe( pipe_slow );
 6240 %}
 6241 
 6242 #endif // _LP64
 6243 
 6244 //----------------------------- CompressBits/ExpandBits ------------------------
 6245 
 6246 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6247   predicate(n->bottom_type()->isa_int());
 6248   match(Set dst (CompressBits src mask));
 6249   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6250   ins_encode %{
 6251     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6252   %}
 6253   ins_pipe( pipe_slow );
 6254 %}
 6255 
 6256 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6257   predicate(n->bottom_type()->isa_int());
 6258   match(Set dst (ExpandBits src mask));
 6259   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6260   ins_encode %{
 6261     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6262   %}
 6263   ins_pipe( pipe_slow );
 6264 %}
 6265 
 6266 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6267   predicate(n->bottom_type()->isa_int());
 6268   match(Set dst (CompressBits src (LoadI mask)));
 6269   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6270   ins_encode %{
 6271     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6272   %}
 6273   ins_pipe( pipe_slow );
 6274 %}
 6275 
 6276 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6277   predicate(n->bottom_type()->isa_int());
 6278   match(Set dst (ExpandBits src (LoadI mask)));
 6279   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6280   ins_encode %{
 6281     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6282   %}
 6283   ins_pipe( pipe_slow );
 6284 %}
 6285 
 6286 // --------------------------------- Sqrt --------------------------------------
 6287 
 6288 instruct vsqrtF_reg(vec dst, vec src) %{
 6289   match(Set dst (SqrtVF src));
 6290   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6291   ins_encode %{
 6292     assert(UseAVX > 0, "required");
 6293     int vlen_enc = vector_length_encoding(this);
 6294     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6295   %}
 6296   ins_pipe( pipe_slow );
 6297 %}
 6298 
 6299 instruct vsqrtF_mem(vec dst, memory mem) %{
 6300   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6301   match(Set dst (SqrtVF (LoadVector mem)));
 6302   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6303   ins_encode %{
 6304     assert(UseAVX > 0, "required");
 6305     int vlen_enc = vector_length_encoding(this);
 6306     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6307   %}
 6308   ins_pipe( pipe_slow );
 6309 %}
 6310 
// Doubles vector sqrt
 6312 instruct vsqrtD_reg(vec dst, vec src) %{
 6313   match(Set dst (SqrtVD src));
 6314   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6315   ins_encode %{
 6316     assert(UseAVX > 0, "required");
 6317     int vlen_enc = vector_length_encoding(this);
 6318     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6319   %}
 6320   ins_pipe( pipe_slow );
 6321 %}
 6322 
 6323 instruct vsqrtD_mem(vec dst, memory mem) %{
 6324   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6325   match(Set dst (SqrtVD (LoadVector mem)));
 6326   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6327   ins_encode %{
 6328     assert(UseAVX > 0, "required");
 6329     int vlen_enc = vector_length_encoding(this);
 6330     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6331   %}
 6332   ins_pipe( pipe_slow );
 6333 %}
 6334 
 6335 // ------------------------------ Shift ---------------------------------------
 6336 
// Left and right shift count vectors are the same on x86
// (only the low 64 bits of the xmm register are used as the count).
 6339 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6340   match(Set dst (LShiftCntV cnt));
 6341   match(Set dst (RShiftCntV cnt));
 6342   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6343   ins_encode %{
 6344     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6345   %}
 6346   ins_pipe( pipe_slow );
 6347 %}
 6348 
 6349 // Byte vector shift
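// x86 has no byte-granularity vector shifts, so byte shifts are emulated:
// the bytes are sign- or zero-extended to words (vextendbw), shifted as
// words (vshiftw), masked back down to their low byte, and re-packed with
// packuswb/vpackuswb. Illustrative per-element arithmetic (a sketch, not
// the emitted code), where sign extension is used for all but URShiftVB:
//
//   uint16_t w = sign ? (uint16_t)(int16_t)(int8_t)b : (uint16_t)(uint8_t)b;
//   uint16_t r = shift_op(w, count);   // psllw / psraw / psrlw on the 16-bit lane
//   out_byte   = (uint8_t)r;           // the low byte is the byte-sized result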
 6350 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6351   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6352   match(Set dst ( LShiftVB src shift));
 6353   match(Set dst ( RShiftVB src shift));
 6354   match(Set dst (URShiftVB src shift));
 6355   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6356   format %{"vector_byte_shift $dst,$src,$shift" %}
 6357   ins_encode %{
 6358     assert(UseSSE > 3, "required");
 6359     int opcode = this->ideal_Opcode();
 6360     bool sign = (opcode != Op_URShiftVB);
 6361     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6362     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6363     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6364     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6365     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6366   %}
 6367   ins_pipe( pipe_slow );
 6368 %}
 6369 
 6370 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6371   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6372             UseAVX <= 1);
 6373   match(Set dst ( LShiftVB src shift));
 6374   match(Set dst ( RShiftVB src shift));
 6375   match(Set dst (URShiftVB src shift));
 6376   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6377   format %{"vector_byte_shift $dst,$src,$shift" %}
 6378   ins_encode %{
 6379     assert(UseSSE > 3, "required");
 6380     int opcode = this->ideal_Opcode();
 6381     bool sign = (opcode != Op_URShiftVB);
 6382     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6383     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6384     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6385     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6386     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6387     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6388     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6389     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6390     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6391   %}
 6392   ins_pipe( pipe_slow );
 6393 %}
 6394 
 6395 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6396   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6397             UseAVX > 1);
 6398   match(Set dst ( LShiftVB src shift));
 6399   match(Set dst ( RShiftVB src shift));
 6400   match(Set dst (URShiftVB src shift));
 6401   effect(TEMP dst, TEMP tmp);
 6402   format %{"vector_byte_shift $dst,$src,$shift" %}
 6403   ins_encode %{
 6404     int opcode = this->ideal_Opcode();
 6405     bool sign = (opcode != Op_URShiftVB);
 6406     int vlen_enc = Assembler::AVX_256bit;
 6407     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6408     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6409     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6410     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6411     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6412   %}
 6413   ins_pipe( pipe_slow );
 6414 %}
 6415 
 6416 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6417   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6418   match(Set dst ( LShiftVB src shift));
 6419   match(Set dst ( RShiftVB src shift));
 6420   match(Set dst (URShiftVB src shift));
 6421   effect(TEMP dst, TEMP tmp);
 6422   format %{"vector_byte_shift $dst,$src,$shift" %}
 6423   ins_encode %{
 6424     assert(UseAVX > 1, "required");
 6425     int opcode = this->ideal_Opcode();
 6426     bool sign = (opcode != Op_URShiftVB);
 6427     int vlen_enc = Assembler::AVX_256bit;
 6428     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6429     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6430     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6431     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6432     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6433     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6434     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6435     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
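          // vpackuswb packs within each 128-bit lane, leaving the qwords interleaved;
          // vpermq with 0xD8 (qword order 0,2,1,3) restores the original element order.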
 6436     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6437   %}
 6438   ins_pipe( pipe_slow );
 6439 %}
 6440 
 6441 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6442   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6443   match(Set dst ( LShiftVB src shift));
 6444   match(Set dst ( RShiftVB src shift));
 6445   match(Set dst (URShiftVB src shift));
 6446   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6447   format %{ "vector_byte_shift $dst,$src,$shift" %}
 6448   ins_encode %{
 6449     assert(UseAVX > 2, "required");
 6450     int opcode = this->ideal_Opcode();
 6451     bool sign = (opcode != Op_URShiftVB);
 6452     int vlen_enc = Assembler::AVX_512bit;
 6453     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6454     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6455     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6456     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6457     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6458     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6459     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6460     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6461     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6462     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
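          // The lane-wise pack leaves the qwords out of order; a full cross-lane qword
          // permutation (vector_byte_perm_mask) restores the original byte order.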
 6463     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6464     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6465   %}
 6466   ins_pipe( pipe_slow );
 6467 %}
 6468 
 6469 // A logical right shift of a short vector produces an incorrect Java result
 6470 // for negative data, because Java code converts the short value to an int with
 6471 // sign extension before shifting. Char vectors are fine, since chars are
 6472 // unsigned values.
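      // For example, with short s = -1 (0xFFFF), Java evaluates (short)(s >>> 2) as
      // (short)(0xFFFFFFFF >>> 2) = (short)0x3FFFFFFF, which narrows back to -1,
      // whereas a 16-bit lane shift would produce 0x3FFF.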
 6473 // Shorts/Chars vector shift
 6474 instruct vshiftS(vec dst, vec src, vec shift) %{
 6475   predicate(!n->as_ShiftV()->is_var_shift());
 6476   match(Set dst ( LShiftVS src shift));
 6477   match(Set dst ( RShiftVS src shift));
 6478   match(Set dst (URShiftVS src shift));
 6479   effect(TEMP dst, USE src, USE shift);
 6480   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6481   ins_encode %{
 6482     int opcode = this->ideal_Opcode();
 6483     if (UseAVX > 0) {
 6484       int vlen_enc = vector_length_encoding(this);
 6485       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6486     } else {
 6487       int vlen = Matcher::vector_length(this);
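            // Non-AVX shifts are destructive, so copy only the live part of src
            // (4, 8 or 16 bytes) into dst and shift in place.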
 6488       if (vlen == 2) {
 6489         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6490         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6491       } else if (vlen == 4) {
 6492         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6493         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6494       } else {
 6495         assert(vlen == 8, "sanity");
 6496         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6497         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6498       }
 6499     }
 6500   %}
 6501   ins_pipe( pipe_slow );
 6502 %}
 6503 
 6504 // Integers vector shift
 6505 instruct vshiftI(vec dst, vec src, vec shift) %{
 6506   predicate(!n->as_ShiftV()->is_var_shift());
 6507   match(Set dst ( LShiftVI src shift));
 6508   match(Set dst ( RShiftVI src shift));
 6509   match(Set dst (URShiftVI src shift));
 6510   effect(TEMP dst, USE src, USE shift);
 6511   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6512   ins_encode %{
 6513     int opcode = this->ideal_Opcode();
 6514     if (UseAVX > 0) {
 6515       int vlen_enc = vector_length_encoding(this);
 6516       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6517     } else {
 6518       int vlen = Matcher::vector_length(this);
 6519       if (vlen == 2) {
 6520         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6521         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6522       } else {
 6523         assert(vlen == 4, "sanity");
 6524         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6525         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6526       }
 6527     }
 6528   %}
 6529   ins_pipe( pipe_slow );
 6530 %}
 6531 
 6532 // Integers vector constant shift
 6533 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6534   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6535   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6536   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6537   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6538   ins_encode %{
 6539     int opcode = this->ideal_Opcode();
 6540     if (UseAVX > 0) {
 6541       int vector_len = vector_length_encoding(this);
 6542       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6543     } else {
 6544       int vlen = Matcher::vector_length(this);
 6545       if (vlen == 2) {
 6546         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6547         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6548       } else {
 6549         assert(vlen == 4, "sanity");
 6550         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6551         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6552       }
 6553     }
 6554   %}
 6555   ins_pipe( pipe_slow );
 6556 %}
 6557 
 6558 // Longs vector shift
 6559 instruct vshiftL(vec dst, vec src, vec shift) %{
 6560   predicate(!n->as_ShiftV()->is_var_shift());
 6561   match(Set dst ( LShiftVL src shift));
 6562   match(Set dst (URShiftVL src shift));
 6563   effect(TEMP dst, USE src, USE shift);
 6564   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6565   ins_encode %{
 6566     int opcode = this->ideal_Opcode();
 6567     if (UseAVX > 0) {
 6568       int vlen_enc = vector_length_encoding(this);
 6569       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6570     } else {
 6571       assert(Matcher::vector_length(this) == 2, "");
 6572       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6573       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6574     }
 6575   %}
 6576   ins_pipe( pipe_slow );
 6577 %}
 6578 
 6579 // Longs vector constant shift
 6580 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6581   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6582   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6583   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6584   ins_encode %{
 6585     int opcode = this->ideal_Opcode();
 6586     if (UseAVX > 0) {
 6587       int vector_len = vector_length_encoding(this);
 6588       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6589     } else {
 6590       assert(Matcher::vector_length(this) == 2, "");
 6591       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6592       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6593     }
 6594   %}
 6595   ins_pipe( pipe_slow );
 6596 %}
 6597 
 6598 // -------------------ArithmeticRightShift -----------------------------------
 6599 // Long vector arithmetic right shift
 6600 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6601   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6602   match(Set dst (RShiftVL src shift));
 6603   effect(TEMP dst, TEMP tmp);
 6604   format %{ "vshiftq $dst,$src,$shift" %}
 6605   ins_encode %{
 6606     uint vlen = Matcher::vector_length(this);
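          // Neither SSE nor AVX2 has a 64-bit arithmetic right shift; emulate it as
          // ((x >>> s) ^ m) - m, where m = (sign bit >>> s).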
 6607     if (vlen == 2) {
 6608       assert(UseSSE >= 2, "required");
 6609       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6610       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6611       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6612       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6613       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6614       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6615     } else {
 6616       assert(vlen == 4, "sanity");
 6617       assert(UseAVX > 1, "required");
 6618       int vlen_enc = Assembler::AVX_256bit;
 6619       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6620       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6621       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6622       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6623       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6624     }
 6625   %}
 6626   ins_pipe( pipe_slow );
 6627 %}
 6628 
 6629 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6630   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6631   match(Set dst (RShiftVL src shift));
 6632   format %{ "vshiftq $dst,$src,$shift" %}
 6633   ins_encode %{
 6634     int vlen_enc = vector_length_encoding(this);
 6635     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6636   %}
 6637   ins_pipe( pipe_slow );
 6638 %}
 6639 
 6640 // ------------------- Variable Shift -----------------------------
 6641 // Byte variable shift
 6642 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6643   predicate(Matcher::vector_length(n) <= 8 &&
 6644             n->as_ShiftV()->is_var_shift() &&
 6645             !VM_Version::supports_avx512bw());
 6646   match(Set dst ( LShiftVB src shift));
 6647   match(Set dst ( RShiftVB src shift));
 6648   match(Set dst (URShiftVB src shift));
 6649   effect(TEMP dst, TEMP vtmp);
 6650   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6651   ins_encode %{
 6652     assert(UseAVX >= 2, "required");
 6653 
 6654     int opcode = this->ideal_Opcode();
 6655     int vlen_enc = Assembler::AVX_128bit;
 6656     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6657     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6658   %}
 6659   ins_pipe( pipe_slow );
 6660 %}
 6661 
 6662 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6663   predicate(Matcher::vector_length(n) == 16 &&
 6664             n->as_ShiftV()->is_var_shift() &&
 6665             !VM_Version::supports_avx512bw());
 6666   match(Set dst ( LShiftVB src shift));
 6667   match(Set dst ( RShiftVB src shift));
 6668   match(Set dst (URShiftVB src shift));
 6669   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6670   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6671   ins_encode %{
 6672     assert(UseAVX >= 2, "required");
 6673 
 6674     int opcode = this->ideal_Opcode();
 6675     int vlen_enc = Assembler::AVX_128bit;
 6676     // Shift lower half and get word result in dst
 6677     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6678 
 6679     // Shift upper half and get word result in vtmp1
 6680     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6681     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6682     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6683 
 6684     // Merge and down convert the two word results to byte in dst
 6685     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6686   %}
 6687   ins_pipe( pipe_slow );
 6688 %}
 6689 
 6690 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6691   predicate(Matcher::vector_length(n) == 32 &&
 6692             n->as_ShiftV()->is_var_shift() &&
 6693             !VM_Version::supports_avx512bw());
 6694   match(Set dst ( LShiftVB src shift));
 6695   match(Set dst ( RShiftVB src shift));
 6696   match(Set dst (URShiftVB src shift));
 6697   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 6698   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6699   ins_encode %{
 6700     assert(UseAVX >= 2, "required");
 6701 
 6702     int opcode = this->ideal_Opcode();
 6703     int vlen_enc = Assembler::AVX_128bit;
 6704     // Process lower 128 bits and get result in dst
 6705     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6706     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6707     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6708     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6709     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6710 
 6711     // Process higher 128 bits and get result in vtmp3
 6712     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6713     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6714     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6715     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6716     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6717     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6718     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6719 
 6720     // Merge the two results in dst
 6721     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6722   %}
 6723   ins_pipe( pipe_slow );
 6724 %}
 6725 
 6726 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6727   predicate(Matcher::vector_length(n) <= 32 &&
 6728             n->as_ShiftV()->is_var_shift() &&
 6729             VM_Version::supports_avx512bw());
 6730   match(Set dst ( LShiftVB src shift));
 6731   match(Set dst ( RShiftVB src shift));
 6732   match(Set dst (URShiftVB src shift));
 6733   effect(TEMP dst, TEMP vtmp);
 6734   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6735   ins_encode %{
 6736     assert(UseAVX > 2, "required");
 6737 
 6738     int opcode = this->ideal_Opcode();
 6739     int vlen_enc = vector_length_encoding(this);
 6740     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6741   %}
 6742   ins_pipe( pipe_slow );
 6743 %}
 6744 
 6745 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6746   predicate(Matcher::vector_length(n) == 64 &&
 6747             n->as_ShiftV()->is_var_shift() &&
 6748             VM_Version::supports_avx512bw());
 6749   match(Set dst ( LShiftVB src shift));
 6750   match(Set dst ( RShiftVB src shift));
 6751   match(Set dst (URShiftVB src shift));
 6752   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6753   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6754   ins_encode %{
 6755     assert(UseAVX > 2, "required");
 6756 
 6757     int opcode = this->ideal_Opcode();
 6758     int vlen_enc = Assembler::AVX_256bit;
 6759     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6760     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6761     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6762     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6763     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6764   %}
 6765   ins_pipe( pipe_slow );
 6766 %}
 6767 
 6768 // Short variable shift
 6769 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6770   predicate(Matcher::vector_length(n) <= 8 &&
 6771             n->as_ShiftV()->is_var_shift() &&
 6772             !VM_Version::supports_avx512bw());
 6773   match(Set dst ( LShiftVS src shift));
 6774   match(Set dst ( RShiftVS src shift));
 6775   match(Set dst (URShiftVS src shift));
 6776   effect(TEMP dst, TEMP vtmp);
 6777   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6778   ins_encode %{
 6779     assert(UseAVX >= 2, "required");
 6780 
 6781     int opcode = this->ideal_Opcode();
 6782     bool sign = (opcode != Op_URShiftVS);
 6783     int vlen_enc = Assembler::AVX_256bit;
 6784     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6785     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6786     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6787     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6788     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6789     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6790   %}
 6791   ins_pipe( pipe_slow );
 6792 %}
 6793 
 6794 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6795   predicate(Matcher::vector_length(n) == 16 &&
 6796             n->as_ShiftV()->is_var_shift() &&
 6797             !VM_Version::supports_avx512bw());
 6798   match(Set dst ( LShiftVS src shift));
 6799   match(Set dst ( RShiftVS src shift));
 6800   match(Set dst (URShiftVS src shift));
 6801   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6802   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6803   ins_encode %{
 6804     assert(UseAVX >= 2, "required");
 6805 
 6806     int opcode = this->ideal_Opcode();
 6807     bool sign = (opcode != Op_URShiftVS);
 6808     int vlen_enc = Assembler::AVX_256bit;
 6809     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6810     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6811     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6812     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6813     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6814 
 6815     // Shift upper half, with result in dst using vtmp1 as TEMP
 6816     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6817     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6818     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6819     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6820     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6821     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6822 
 6823     // Merge lower and upper half result into dst
 6824     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6825     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6826   %}
 6827   ins_pipe( pipe_slow );
 6828 %}
 6829 
 6830 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6831   predicate(n->as_ShiftV()->is_var_shift() &&
 6832             VM_Version::supports_avx512bw());
 6833   match(Set dst ( LShiftVS src shift));
 6834   match(Set dst ( RShiftVS src shift));
 6835   match(Set dst (URShiftVS src shift));
 6836   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6837   ins_encode %{
 6838     assert(UseAVX > 2, "required");
 6839 
 6840     int opcode = this->ideal_Opcode();
 6841     int vlen_enc = vector_length_encoding(this);
 6842     if (!VM_Version::supports_avx512vl()) {
 6843       vlen_enc = Assembler::AVX_512bit;
 6844     }
 6845     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6846   %}
 6847   ins_pipe( pipe_slow );
 6848 %}
 6849 
 6850 // Integer variable shift
 6851 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6852   predicate(n->as_ShiftV()->is_var_shift());
 6853   match(Set dst ( LShiftVI src shift));
 6854   match(Set dst ( RShiftVI src shift));
 6855   match(Set dst (URShiftVI src shift));
 6856   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6857   ins_encode %{
 6858     assert(UseAVX >= 2, "required");
 6859 
 6860     int opcode = this->ideal_Opcode();
 6861     int vlen_enc = vector_length_encoding(this);
 6862     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6863   %}
 6864   ins_pipe( pipe_slow );
 6865 %}
 6866 
 6867 // Long variable shift
 6868 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6869   predicate(n->as_ShiftV()->is_var_shift());
 6870   match(Set dst ( LShiftVL src shift));
 6871   match(Set dst (URShiftVL src shift));
 6872   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6873   ins_encode %{
 6874     assert(UseAVX >= 2, "required");
 6875 
 6876     int opcode = this->ideal_Opcode();
 6877     int vlen_enc = vector_length_encoding(this);
 6878     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6879   %}
 6880   ins_pipe( pipe_slow );
 6881 %}
 6882 
 6883 // Long variable arithmetic right shift
 6884 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6885   predicate(Matcher::vector_length(n) <= 4 &&
 6886             n->as_ShiftV()->is_var_shift() &&
 6887             UseAVX == 2);
 6888   match(Set dst (RShiftVL src shift));
 6889   effect(TEMP dst, TEMP vtmp);
 6890   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6891   ins_encode %{
 6892     int opcode = this->ideal_Opcode();
 6893     int vlen_enc = vector_length_encoding(this);
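          // AVX2 has no variable arithmetic right shift for 64-bit lanes; the helper
          // emulates it, using $vtmp as scratch.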
 6894     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6895                  $vtmp$$XMMRegister);
 6896   %}
 6897   ins_pipe( pipe_slow );
 6898 %}
 6899 
 6900 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6901   predicate(n->as_ShiftV()->is_var_shift() &&
 6902             UseAVX > 2);
 6903   match(Set dst (RShiftVL src shift));
 6904   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6905   ins_encode %{
 6906     int opcode = this->ideal_Opcode();
 6907     int vlen_enc = vector_length_encoding(this);
 6908     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6909   %}
 6910   ins_pipe( pipe_slow );
 6911 %}
 6912 
 6913 // --------------------------------- AND --------------------------------------
 6914 
 6915 instruct vand(vec dst, vec src) %{
 6916   predicate(UseAVX == 0);
 6917   match(Set dst (AndV dst src));
 6918   format %{ "pand    $dst,$src\t! and vectors" %}
 6919   ins_encode %{
 6920     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6921   %}
 6922   ins_pipe( pipe_slow );
 6923 %}
 6924 
 6925 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6926   predicate(UseAVX > 0);
 6927   match(Set dst (AndV src1 src2));
 6928   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6929   ins_encode %{
 6930     int vlen_enc = vector_length_encoding(this);
 6931     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6932   %}
 6933   ins_pipe( pipe_slow );
 6934 %}
 6935 
 6936 instruct vand_mem(vec dst, vec src, memory mem) %{
 6937   predicate((UseAVX > 0) &&
 6938             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6939   match(Set dst (AndV src (LoadVector mem)));
 6940   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6941   ins_encode %{
 6942     int vlen_enc = vector_length_encoding(this);
 6943     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6944   %}
 6945   ins_pipe( pipe_slow );
 6946 %}
 6947 
 6948 // --------------------------------- OR ---------------------------------------
 6949 
 6950 instruct vor(vec dst, vec src) %{
 6951   predicate(UseAVX == 0);
 6952   match(Set dst (OrV dst src));
 6953   format %{ "por     $dst,$src\t! or vectors" %}
 6954   ins_encode %{
 6955     __ por($dst$$XMMRegister, $src$$XMMRegister);
 6956   %}
 6957   ins_pipe( pipe_slow );
 6958 %}
 6959 
 6960 instruct vor_reg(vec dst, vec src1, vec src2) %{
 6961   predicate(UseAVX > 0);
 6962   match(Set dst (OrV src1 src2));
 6963   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 6964   ins_encode %{
 6965     int vlen_enc = vector_length_encoding(this);
 6966     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6967   %}
 6968   ins_pipe( pipe_slow );
 6969 %}
 6970 
 6971 instruct vor_mem(vec dst, vec src, memory mem) %{
 6972   predicate((UseAVX > 0) &&
 6973             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6974   match(Set dst (OrV src (LoadVector mem)));
 6975   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 6976   ins_encode %{
 6977     int vlen_enc = vector_length_encoding(this);
 6978     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6979   %}
 6980   ins_pipe( pipe_slow );
 6981 %}
 6982 
 6983 // --------------------------------- XOR --------------------------------------
 6984 
 6985 instruct vxor(vec dst, vec src) %{
 6986   predicate(UseAVX == 0);
 6987   match(Set dst (XorV dst src));
 6988   format %{ "pxor    $dst,$src\t! xor vectors" %}
 6989   ins_encode %{
 6990     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 6991   %}
 6992   ins_pipe( pipe_slow );
 6993 %}
 6994 
 6995 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 6996   predicate(UseAVX > 0);
 6997   match(Set dst (XorV src1 src2));
 6998   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 6999   ins_encode %{
 7000     int vlen_enc = vector_length_encoding(this);
 7001     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7002   %}
 7003   ins_pipe( pipe_slow );
 7004 %}
 7005 
 7006 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7007   predicate((UseAVX > 0) &&
 7008             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7009   match(Set dst (XorV src (LoadVector mem)));
 7010   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7011   ins_encode %{
 7012     int vlen_enc = vector_length_encoding(this);
 7013     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7014   %}
 7015   ins_pipe( pipe_slow );
 7016 %}
 7017 
 7018 // --------------------------------- VectorCast --------------------------------------
 7019 
 7020 instruct vcastBtoX(vec dst, vec src) %{
 7021   match(Set dst (VectorCastB2X src));
 7022   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7023   ins_encode %{
 7024     assert(UseAVX > 0, "required");
 7025 
 7026     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7027     int vlen_enc = vector_length_encoding(this);
 7028     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7029   %}
 7030   ins_pipe( pipe_slow );
 7031 %}
 7032 
 7033 instruct castStoX(vec dst, vec src) %{
 7034   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7035             Matcher::vector_length(n->in(1)) <= 8 && // src
 7036             Matcher::vector_element_basic_type(n) == T_BYTE);
 7037   match(Set dst (VectorCastS2X src));
 7038   format %{ "vector_cast_s2x $dst,$src" %}
 7039   ins_encode %{
 7040     assert(UseAVX > 0, "required");
 7041 
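          // Mask off the high byte of each short, then pack the words down to bytes.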
 7042     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7043     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7044   %}
 7045   ins_pipe( pipe_slow );
 7046 %}
 7047 
 7048 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7049   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7050             Matcher::vector_length(n->in(1)) == 16 && // src
 7051             Matcher::vector_element_basic_type(n) == T_BYTE);
 7052   effect(TEMP dst, TEMP vtmp);
 7053   match(Set dst (VectorCastS2X src));
 7054   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7055   ins_encode %{
 7056     assert(UseAVX > 0, "required");
 7057 
 7058     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7059     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7060     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7061     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7062   %}
 7063   ins_pipe( pipe_slow );
 7064 %}
 7065 
 7066 instruct vcastStoX_evex(vec dst, vec src) %{
 7067   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7068             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7069   match(Set dst (VectorCastS2X src));
 7070   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7071   ins_encode %{
 7072     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7073     int src_vlen_enc = vector_length_encoding(this, $src);
 7074     int vlen_enc = vector_length_encoding(this);
 7075     switch (to_elem_bt) {
 7076       case T_BYTE:
 7077         if (!VM_Version::supports_avx512vl()) {
 7078           vlen_enc = Assembler::AVX_512bit;
 7079         }
 7080         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7081         break;
 7082       case T_INT:
 7083         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7084         break;
 7085       case T_FLOAT:
 7086         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7087         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7088         break;
 7089       case T_LONG:
 7090         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7091         break;
 7092       case T_DOUBLE: {
 7093         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7094         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7095         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7096         break;
 7097       }
 7098       default:
 7099         ShouldNotReachHere();
 7100     }
 7101   %}
 7102   ins_pipe( pipe_slow );
 7103 %}
 7104 
 7105 instruct castItoX(vec dst, vec src) %{
 7106   predicate(UseAVX <= 2 &&
 7107             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7108             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7109   match(Set dst (VectorCastI2X src));
 7110   format %{ "vector_cast_i2x $dst,$src" %}
 7111   ins_encode %{
 7112     assert(UseAVX > 0, "required");
 7113 
 7114     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7115     int vlen_enc = vector_length_encoding(this, $src);
 7116 
 7117     if (to_elem_bt == T_BYTE) {
 7118       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7119       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7120       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7121     } else {
 7122       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7123       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7124       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7125     }
 7126   %}
 7127   ins_pipe( pipe_slow );
 7128 %}
 7129 
 7130 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7131   predicate(UseAVX <= 2 &&
 7132             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7133             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7134   match(Set dst (VectorCastI2X src));
 7135   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7136   effect(TEMP dst, TEMP vtmp);
 7137   ins_encode %{
 7138     assert(UseAVX > 0, "required");
 7139 
 7140     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7141     int vlen_enc = vector_length_encoding(this, $src);
 7142 
 7143     if (to_elem_bt == T_BYTE) {
 7144       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7145       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7146       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7147       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7148     } else {
 7149       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7150       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7151       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7152       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7153     }
 7154   %}
 7155   ins_pipe( pipe_slow );
 7156 %}
 7157 
 7158 instruct vcastItoX_evex(vec dst, vec src) %{
 7159   predicate(UseAVX > 2 ||
 7160             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7161   match(Set dst (VectorCastI2X src));
 7162   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7163   ins_encode %{
 7164     assert(UseAVX > 0, "required");
 7165 
 7166     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7167     int src_vlen_enc = vector_length_encoding(this, $src);
 7168     int dst_vlen_enc = vector_length_encoding(this);
 7169     switch (dst_elem_bt) {
 7170       case T_BYTE:
 7171         if (!VM_Version::supports_avx512vl()) {
 7172           src_vlen_enc = Assembler::AVX_512bit;
 7173         }
 7174         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7175         break;
 7176       case T_SHORT:
 7177         if (!VM_Version::supports_avx512vl()) {
 7178           src_vlen_enc = Assembler::AVX_512bit;
 7179         }
 7180         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7181         break;
 7182       case T_FLOAT:
 7183         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7184         break;
 7185       case T_LONG:
 7186         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7187         break;
 7188       case T_DOUBLE:
 7189         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7190         break;
 7191       default:
 7192         ShouldNotReachHere();
 7193     }
 7194   %}
 7195   ins_pipe( pipe_slow );
 7196 %}
 7197 
 7198 instruct vcastLtoBS(vec dst, vec src) %{
 7199   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7200             UseAVX <= 2);
 7201   match(Set dst (VectorCastL2X src));
 7202   format %{ "vector_cast_l2x  $dst,$src" %}
 7203   ins_encode %{
 7204     assert(UseAVX > 0, "required");
 7205 
 7206     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7207     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7208     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7209                                                       : ExternalAddress(vector_int_to_short_mask());
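          // Gather the low dword of each long into the low lanes, mask it to the target
          // element width, then pack the dwords down to shorts (and further to bytes if needed).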
 7210     if (vlen <= 16) {
 7211       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7212       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7213       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7214     } else {
 7215       assert(vlen <= 32, "required");
 7216       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7217       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7218       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7219       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7220     }
 7221     if (to_elem_bt == T_BYTE) {
 7222       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7223     }
 7224   %}
 7225   ins_pipe( pipe_slow );
 7226 %}
 7227 
 7228 instruct vcastLtoX_evex(vec dst, vec src) %{
 7229   predicate(UseAVX > 2 ||
 7230             (Matcher::vector_element_basic_type(n) == T_INT ||
 7231              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7232              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7233   match(Set dst (VectorCastL2X src));
 7234   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7235   ins_encode %{
 7236     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7237     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7238     int vlen_enc = vector_length_encoding(this, $src);
 7239     switch (to_elem_bt) {
 7240       case T_BYTE:
 7241         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7242           vlen_enc = Assembler::AVX_512bit;
 7243         }
 7244         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7245         break;
 7246       case T_SHORT:
 7247         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7248           vlen_enc = Assembler::AVX_512bit;
 7249         }
 7250         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7251         break;
 7252       case T_INT:
 7253         if (vlen == 8) {
 7254           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7255             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7256           }
 7257         } else if (vlen == 16) {
 7258           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7259         } else if (vlen == 32) {
 7260           if (UseAVX > 2) {
 7261             if (!VM_Version::supports_avx512vl()) {
 7262               vlen_enc = Assembler::AVX_512bit;
 7263             }
 7264             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7265           } else {
 7266             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7267             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7268           }
 7269         } else { // vlen == 64
 7270           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7271         }
 7272         break;
 7273       case T_FLOAT:
 7274         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7275         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7276         break;
 7277       case T_DOUBLE:
 7278         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7279         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7280         break;
 7281 
 7282       default: assert(false, "%s", type2name(to_elem_bt));
 7283     }
 7284   %}
 7285   ins_pipe( pipe_slow );
 7286 %}
 7287 
 7288 instruct vcastFtoD_reg(vec dst, vec src) %{
 7289   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7290   match(Set dst (VectorCastF2X src));
 7291   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7292   ins_encode %{
 7293     int vlen_enc = vector_length_encoding(this);
 7294     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7295   %}
 7296   ins_pipe( pipe_slow );
 7297 %}
 7298
 7300 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7301   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7302             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7303   match(Set dst (VectorCastF2X src));
 7304   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7305   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7306   ins_encode %{
 7307     int vlen_enc = vector_length_encoding(this, $src);
 7308     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7309     // JDK-8292878 removed the need for an explicit scratch register to load addresses wider
 7310     // than 32 bits for register-indirect addressing, since stub constants live in the code
 7311     // cache and ReservedCodeCacheSize is currently capped at 2G. Targets are free to raise
 7312     // that limit, but a code cache larger than 2G looks unreasonable in practice; on the
 7313     // flip side, with the given cap we save a temporary register allocation, which in the
 7314     // limiting case can prevent spilling in blocks with high register pressure.
 7316     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7317                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7318                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7319   %}
 7320   ins_pipe( pipe_slow );
 7321 %}
 7322 
 7323 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7324   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7325             is_integral_type(Matcher::vector_element_basic_type(n)));
 7326   match(Set dst (VectorCastF2X src));
 7327   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7328   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7329   ins_encode %{
 7330     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7331     if (to_elem_bt == T_LONG) {
 7332       int vlen_enc = vector_length_encoding(this);
 7333       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7334                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7335                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7336     } else {
 7337       int vlen_enc = vector_length_encoding(this, $src);
 7338       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7339                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7340                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7341     }
 7342   %}
 7343   ins_pipe( pipe_slow );
 7344 %}
 7345 
 7346 instruct vcastDtoF_reg(vec dst, vec src) %{
 7347   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7348   match(Set dst (VectorCastD2X src));
 7349   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7350   ins_encode %{
 7351     int vlen_enc = vector_length_encoding(this, $src);
 7352     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7353   %}
 7354   ins_pipe( pipe_slow );
 7355 %}
 7356 
 7357 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7358   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7359             is_integral_type(Matcher::vector_element_basic_type(n)));
 7360   match(Set dst (VectorCastD2X src));
 7361   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7362   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7363   ins_encode %{
 7364     int vlen_enc = vector_length_encoding(this, $src);
 7365     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7366     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7367                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7368                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7369   %}
 7370   ins_pipe( pipe_slow );
 7371 %}
 7372 
 7373 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7374   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7375             is_integral_type(Matcher::vector_element_basic_type(n)));
 7376   match(Set dst (VectorCastD2X src));
 7377   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7378   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7379   ins_encode %{
 7380     int vlen_enc = vector_length_encoding(this, $src);
 7381     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7382     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7383                               ExternalAddress(vector_float_signflip());
 7384     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7385                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7386   %}
 7387   ins_pipe( pipe_slow );
 7388 %}
 7389 
 7390 instruct vucast(vec dst, vec src) %{
 7391   match(Set dst (VectorUCastB2X src));
 7392   match(Set dst (VectorUCastS2X src));
 7393   match(Set dst (VectorUCastI2X src));
 7394   format %{ "vector_ucast $dst,$src\t!" %}
 7395   ins_encode %{
 7396     assert(UseAVX > 0, "required");
 7397 
 7398     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7399     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7400     int vlen_enc = vector_length_encoding(this);
 7401     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7402   %}
 7403   ins_pipe( pipe_slow );
 7404 %}
 7405 
 7406 #ifdef _LP64
 7407 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7408   predicate(!VM_Version::supports_avx512vl() &&
 7409             Matcher::vector_length_in_bytes(n) < 64 &&
 7410             Matcher::vector_element_basic_type(n) == T_INT);
 7411   match(Set dst (RoundVF src));
 7412   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7413   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7414   ins_encode %{
 7415     int vlen_enc = vector_length_encoding(this);
 7416     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7417     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7418                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7419                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7420   %}
 7421   ins_pipe( pipe_slow );
 7422 %}
 7423 
 7424 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7425   predicate((VM_Version::supports_avx512vl() ||
 7426              Matcher::vector_length_in_bytes(n) == 64) &&
 7427              Matcher::vector_element_basic_type(n) == T_INT);
 7428   match(Set dst (RoundVF src));
 7429   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7430   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7431   ins_encode %{
 7432     int vlen_enc = vector_length_encoding(this);
 7433     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7434     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7435                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7436                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7437   %}
 7438   ins_pipe( pipe_slow );
 7439 %}
 7440 
 7441 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7442   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7443   match(Set dst (RoundVD src));
 7444   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7445   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7446   ins_encode %{
 7447     int vlen_enc = vector_length_encoding(this);
 7448     InternalAddress new_mxcsr = $constantaddress((jint)(EnableX86ECoreOpts ? 0x3FBF : 0x3F80));
 7449     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7450                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7451                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7452   %}
 7453   ins_pipe( pipe_slow );
 7454 %}
 7455 
 7456 #endif // _LP64
 7457 
 7458 // --------------------------------- VectorMaskCmp --------------------------------------
 7459 
 7460 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7461   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7462             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7463             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7464             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7465   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7466   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7467   ins_encode %{
 7468     int vlen_enc = vector_length_encoding(this, $src1);
 7469     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7470     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7471       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7472     } else {
 7473       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7474     }
 7475   %}
 7476   ins_pipe( pipe_slow );
 7477 %}
 7478 
 7479 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7480   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7481             n->bottom_type()->isa_vectmask() == NULL &&
 7482             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7483   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7484   effect(TEMP ktmp);
 7485   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7486   ins_encode %{
 7487     int vlen_enc = Assembler::AVX_512bit;
 7488     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7489     KRegister mask = k0; // The comparison itself is not being masked.
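          // Compare into $ktmp, then expand the mask into a vector: selected lanes get
          // all bits set, the remaining lanes are zeroed.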
 7490     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7491       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7492       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7493     } else {
 7494       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7495       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7496     }
 7497   %}
 7498   ins_pipe( pipe_slow );
 7499 %}
 7500 
 7501 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7502   predicate(n->bottom_type()->isa_vectmask() &&
 7503             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7504   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7505   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7506   ins_encode %{
 7507     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7508     int vlen_enc = vector_length_encoding(this, $src1);
 7509     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7510     KRegister mask = k0; // The comparison itself is not being masked.
 7511     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7512       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7513     } else {
 7514       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7515     }
 7516   %}
 7517   ins_pipe( pipe_slow );
 7518 %}
 7519 
 7520 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7521   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7522             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7523             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7524             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7525             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7526             (n->in(2)->get_int() == BoolTest::eq ||
 7527              n->in(2)->get_int() == BoolTest::lt ||
 7528              n->in(2)->get_int() == BoolTest::gt)); // cond
 7529   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7530   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7531   ins_encode %{
 7532     int vlen_enc = vector_length_encoding(this, $src1);
 7533     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7534     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7535     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7536   %}
 7537   ins_pipe( pipe_slow );
 7538 %}
 7539 
 7540 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7541   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7542             !Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7543             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7544             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7545             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7546             (n->in(2)->get_int() == BoolTest::ne ||
 7547              n->in(2)->get_int() == BoolTest::le ||
 7548              n->in(2)->get_int() == BoolTest::ge)); // cond
 7549   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7550   effect(TEMP dst, TEMP xtmp);
 7551   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7552   ins_encode %{
 7553     int vlen_enc = vector_length_encoding(this, $src1);
 7554     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7555     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
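          // ne/le/ge have no direct vector compare encoding; the helper emits the
          // complementary compare and inverts the result, using $xtmp as scratch.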
 7556     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7557   %}
 7558   ins_pipe( pipe_slow );
 7559 %}
 7560 
 7561 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7562   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7563             Matcher::is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7564             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7565             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7566             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7567   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7568   effect(TEMP dst, TEMP xtmp);
 7569   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7570   ins_encode %{
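    // AVX/AVX2 have no unsigned integer compares: flip the sign bit of both
    // operands (XOR with a broadcast of high_bit_set) so that the unsigned
    // order of the originals matches the signed order of the biased values,
    // then use the signed compare.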
 7571     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7572     int vlen_enc = vector_length_encoding(this, $src1);
 7573     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7574     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7575 
 7576     if (vlen_enc == Assembler::AVX_128bit) {
 7577       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7578     } else {
 7579       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7580     }
 7581     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7582     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7583     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7584   %}
 7585   ins_pipe( pipe_slow );
 7586 %}
 7587 
 7588 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7589   predicate((n->bottom_type()->isa_vectmask() == NULL &&
 7590              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7591              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7592   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7593   effect(TEMP ktmp);
 7594   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7595   ins_encode %{
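    // The EVEX compare writes its predicate into $ktmp; a zero-masking load
    // of the all-ones constant then expands it back into a vector of
    // -1 / 0 lanes in $dst.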
 7596     assert(UseAVX > 2, "required");
 7597 
 7598     int vlen_enc = vector_length_encoding(this, $src1);
 7599     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7600     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7601     KRegister mask = k0; // The comparison itself is not being masked.
 7602     bool merge = false;
 7603     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7604 
 7605     switch (src1_elem_bt) {
 7606       case T_INT: {
 7607         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7608         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7609         break;
 7610       }
 7611       case T_LONG: {
 7612         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7613         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7614         break;
 7615       }
 7616       default: assert(false, "%s", type2name(src1_elem_bt));
 7617     }
 7618   %}
 7619   ins_pipe( pipe_slow );
 7620 %}
 7621 
 7622 
 7623 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7624   predicate(n->bottom_type()->isa_vectmask() &&
 7625             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7626   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7627   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7628   ins_encode %{
 7629     assert(UseAVX > 2, "required");
 7630     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7631 
 7632     int vlen_enc = vector_length_encoding(this, $src1);
 7633     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7634     bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 7635     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7636 
    // Perform the comparison directly into the destination mask register.
 7638     switch (src1_elem_bt) {
 7639       case T_BYTE: {
 7640         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7641         break;
 7642       }
 7643       case T_SHORT: {
 7644         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7645         break;
 7646       }
 7647       case T_INT: {
 7648         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7649         break;
 7650       }
 7651       case T_LONG: {
 7652         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7653         break;
 7654       }
 7655       default: assert(false, "%s", type2name(src1_elem_bt));
 7656     }
 7657   %}
 7658   ins_pipe( pipe_slow );
 7659 %}
 7660 
 7661 // Extract
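//
// Element extraction.  For vectors up to 16 bytes the element is read
// directly from the source register (get_elem); for 32- and 64-byte vectors
// the 128-bit lane holding the element is first isolated into a temporary
// (get_lane) and the element is then read from that lane.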
 7662 
 7663 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7664   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7665   match(Set dst (ExtractI src idx));
 7666   match(Set dst (ExtractS src idx));
 7667 #ifdef _LP64
 7668   match(Set dst (ExtractB src idx));
 7669 #endif
 7670   format %{ "extractI $dst,$src,$idx\t!" %}
 7671   ins_encode %{
 7672     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7673 
 7674     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7675     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7676   %}
 7677   ins_pipe( pipe_slow );
 7678 %}
 7679 
 7680 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7681   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7682             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7683   match(Set dst (ExtractI src idx));
 7684   match(Set dst (ExtractS src idx));
 7685 #ifdef _LP64
 7686   match(Set dst (ExtractB src idx));
 7687 #endif
 7688   effect(TEMP vtmp);
 7689   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7690   ins_encode %{
 7691     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7692 
 7693     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7694     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7695     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7696   %}
 7697   ins_pipe( pipe_slow );
 7698 %}
 7699 
 7700 #ifdef _LP64
 7701 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7702   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7703   match(Set dst (ExtractL src idx));
 7704   format %{ "extractL $dst,$src,$idx\t!" %}
 7705   ins_encode %{
 7706     assert(UseSSE >= 4, "required");
 7707     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7708 
 7709     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7710   %}
 7711   ins_pipe( pipe_slow );
 7712 %}
 7713 
 7714 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7715   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7716             Matcher::vector_length(n->in(1)) == 8);  // src
 7717   match(Set dst (ExtractL src idx));
 7718   effect(TEMP vtmp);
 7719   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7720   ins_encode %{
 7721     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7722 
 7723     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7724     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7725   %}
 7726   ins_pipe( pipe_slow );
 7727 %}
 7728 #endif
 7729 
 7730 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7731   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7732   match(Set dst (ExtractF src idx));
 7733   effect(TEMP dst, TEMP vtmp);
 7734   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7735   ins_encode %{
 7736     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7737 
 7738     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7739   %}
 7740   ins_pipe( pipe_slow );
 7741 %}
 7742 
 7743 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7744   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7745             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7746   match(Set dst (ExtractF src idx));
 7747   effect(TEMP vtmp);
 7748   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7749   ins_encode %{
 7750     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7751 
 7752     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7753     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7754   %}
 7755   ins_pipe( pipe_slow );
 7756 %}
 7757 
 7758 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7759   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7760   match(Set dst (ExtractD src idx));
 7761   format %{ "extractD $dst,$src,$idx\t!" %}
 7762   ins_encode %{
 7763     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7764 
 7765     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7766   %}
 7767   ins_pipe( pipe_slow );
 7768 %}
 7769 
 7770 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7771   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7772             Matcher::vector_length(n->in(1)) == 8);  // src
 7773   match(Set dst (ExtractD src idx));
 7774   effect(TEMP vtmp);
 7775   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7776   ins_encode %{
 7777     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7778 
 7779     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7780     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7781   %}
 7782   ins_pipe( pipe_slow );
 7783 %}
 7784 
 7785 // --------------------------------- Vector Blend --------------------------------------
 7786 
 7787 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7788   predicate(UseAVX == 0);
 7789   match(Set dst (VectorBlend (Binary dst src) mask));
 7790   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7791   effect(TEMP tmp);
 7792   ins_encode %{
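    // SSE4.1 pblendvb reads its mask implicitly from xmm0, which is why $tmp
    // is constrained to xmm0 and the mask is copied there first.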
 7793     assert(UseSSE >= 4, "required");
 7794 
 7795     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7796       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7797     }
 7798     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7799   %}
 7800   ins_pipe( pipe_slow );
 7801 %}
 7802 
 7803 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7804   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7805             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7806             Matcher::vector_length_in_bytes(n) <= 32 &&
 7807             is_integral_type(Matcher::vector_element_basic_type(n)));
 7808   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7809   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7810   ins_encode %{
 7811     int vlen_enc = vector_length_encoding(this);
 7812     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7813   %}
 7814   ins_pipe( pipe_slow );
 7815 %}
 7816 
 7817 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7818   predicate(UseAVX > 0 && !EnableX86ECoreOpts &&
 7819             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7820             Matcher::vector_length_in_bytes(n) <= 32 &&
 7821             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7822   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7823   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7824   ins_encode %{
 7825     int vlen_enc = vector_length_encoding(this);
 7826     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7827   %}
 7828   ins_pipe( pipe_slow );
 7829 %}
 7830 
 7831 instruct vblendvp(legVec dst, legVec src1, legVec src2, legVec mask, legVec vtmp) %{
 7832   predicate(UseAVX > 0 && EnableX86ECoreOpts &&
 7833             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7834             Matcher::vector_length_in_bytes(n) <= 32);
 7835   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7836   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $vtmp as TEMP" %}
 7837   effect(TEMP vtmp, TEMP dst);
 7838   ins_encode %{
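    // Blend without vpblendvb (slow on E-cores): dst = (mask & src2) | (~mask & src1).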
 7839     int vlen_enc = vector_length_encoding(this);
 7840     __ vpandn($vtmp$$XMMRegister, $mask$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7841     __ vpand ($dst$$XMMRegister,  $mask$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7842     __ vpor  ($dst$$XMMRegister,  $dst$$XMMRegister,  $vtmp$$XMMRegister, vlen_enc);
 7843   %}
 7844   ins_pipe( pipe_slow );
 7845 %}
 7846 
 7847 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7848   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7849             n->in(2)->bottom_type()->isa_vectmask() == NULL);
 7850   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7851   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7852   effect(TEMP ktmp);
 7853   ins_encode %{
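    // Turn the boolean vector into a mask register by comparing it against
    // all-ones, then perform the blend as a single EVEX masked blend.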
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7856     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7857     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7858   %}
 7859   ins_pipe( pipe_slow );
 7860 %}
 7861 
 7862 
 7863 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7864   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7865             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7866              VM_Version::supports_avx512bw()));
 7867   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7868   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using k2 as TEMP" %}
 7869   ins_encode %{
 7870     int vlen_enc = vector_length_encoding(this);
 7871     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7872     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7873   %}
 7874   ins_pipe( pipe_slow );
 7875 %}
 7876 
 7877 // --------------------------------- ABS --------------------------------------
 7878 // a = |a|
 7879 instruct vabsB_reg(vec dst, vec src) %{
 7880   match(Set dst (AbsVB  src));
 7881   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7882   ins_encode %{
 7883     uint vlen = Matcher::vector_length(this);
 7884     if (vlen <= 16) {
 7885       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7886     } else {
 7887       int vlen_enc = vector_length_encoding(this);
 7888       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7889     }
 7890   %}
 7891   ins_pipe( pipe_slow );
 7892 %}
 7893 
 7894 instruct vabsS_reg(vec dst, vec src) %{
 7895   match(Set dst (AbsVS  src));
 7896   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7897   ins_encode %{
 7898     uint vlen = Matcher::vector_length(this);
 7899     if (vlen <= 8) {
 7900       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7901     } else {
 7902       int vlen_enc = vector_length_encoding(this);
 7903       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7904     }
 7905   %}
 7906   ins_pipe( pipe_slow );
 7907 %}
 7908 
 7909 instruct vabsI_reg(vec dst, vec src) %{
 7910   match(Set dst (AbsVI  src));
 7911   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7912   ins_encode %{
 7913     uint vlen = Matcher::vector_length(this);
 7914     if (vlen <= 4) {
 7915       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7916     } else {
 7917       int vlen_enc = vector_length_encoding(this);
 7918       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7919     }
 7920   %}
 7921   ins_pipe( pipe_slow );
 7922 %}
 7923 
 7924 instruct vabsL_reg(vec dst, vec src) %{
 7925   match(Set dst (AbsVL  src));
 7926   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7927   ins_encode %{
 7928     assert(UseAVX > 2, "required");
 7929     int vlen_enc = vector_length_encoding(this);
 7930     if (!VM_Version::supports_avx512vl()) {
 7931       vlen_enc = Assembler::AVX_512bit;
 7932     }
 7933     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7934   %}
 7935   ins_pipe( pipe_slow );
 7936 %}
 7937 
 7938 // --------------------------------- ABSNEG --------------------------------------
 7939 
 7940 instruct vabsnegF(vec dst, vec src) %{
 7941   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7942   match(Set dst (AbsVF src));
 7943   match(Set dst (NegVF src));
 7944   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7945   ins_cost(150);
 7946   ins_encode %{
 7947     int opcode = this->ideal_Opcode();
 7948     int vlen = Matcher::vector_length(this);
 7949     if (vlen == 2) {
 7950       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7951     } else {
 7952       assert(vlen == 8 || vlen == 16, "required");
 7953       int vlen_enc = vector_length_encoding(this);
 7954       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7955     }
 7956   %}
 7957   ins_pipe( pipe_slow );
 7958 %}
 7959 
 7960 instruct vabsneg4F(vec dst) %{
 7961   predicate(Matcher::vector_length(n) == 4);
 7962   match(Set dst (AbsVF dst));
 7963   match(Set dst (NegVF dst));
 7964   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7965   ins_cost(150);
 7966   ins_encode %{
 7967     int opcode = this->ideal_Opcode();
 7968     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 7969   %}
 7970   ins_pipe( pipe_slow );
 7971 %}
 7972 
 7973 instruct vabsnegD(vec dst, vec src) %{
 7974   match(Set dst (AbsVD  src));
 7975   match(Set dst (NegVD  src));
 7976   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 7977   ins_encode %{
 7978     int opcode = this->ideal_Opcode();
 7979     uint vlen = Matcher::vector_length(this);
 7980     if (vlen == 2) {
 7981       assert(UseSSE >= 2, "required");
 7982       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 7983     } else {
 7984       int vlen_enc = vector_length_encoding(this);
 7985       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7986     }
 7987   %}
 7988   ins_pipe( pipe_slow );
 7989 %}
 7990 
 7991 //------------------------------------- VectorTest --------------------------------------------
 7992 
 7993 #ifdef _LP64
 7994 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 7995   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 7996   match(Set cr (VectorTest src1 src2));
 7997   effect(TEMP vtmp);
 7998   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 7999   ins_encode %{
 8000     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8001     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8002     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8003   %}
 8004   ins_pipe( pipe_slow );
 8005 %}
 8006 
 8007 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8008   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8009   match(Set cr (VectorTest src1 src2));
 8010   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8011   ins_encode %{
 8012     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8013     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8014     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8015   %}
 8016   ins_pipe( pipe_slow );
 8017 %}
 8018 
 8019 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8020   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8021              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8022             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8023   match(Set cr (VectorTest src1 src2));
 8024   effect(TEMP tmp);
 8025   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8026   ins_encode %{
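    // All-true test: keep only the masklen live bits of the mask and compare
    // against a fully-set value; ZF is set iff every lane is true.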
 8027     uint masklen = Matcher::vector_length(this, $src1);
 8028     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8029     __ andl($tmp$$Register, (1 << masklen) - 1);
 8030     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8031   %}
 8032   ins_pipe( pipe_slow );
 8033 %}
 8034 
 8035 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8036   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8037              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8038             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8039   match(Set cr (VectorTest src1 src2));
 8040   effect(TEMP tmp);
 8041   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8042   ins_encode %{
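    // Any-true test: AND the mask with the live-lane bits; ZF ends up
    // cleared iff at least one lane is true.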
 8043     uint masklen = Matcher::vector_length(this, $src1);
 8044     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8045     __ andl($tmp$$Register, (1 << masklen) - 1);
 8046   %}
 8047   ins_pipe( pipe_slow );
 8048 %}
 8049 
 8050 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8051   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8052             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8053   match(Set cr (VectorTest src1 src2));
 8054   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8055   ins_encode %{
 8056     uint masklen = Matcher::vector_length(this, $src1);
 8057     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8058   %}
 8059   ins_pipe( pipe_slow );
 8060 %}
 8061 #endif
 8062 
 8063 //------------------------------------- LoadMask --------------------------------------------
 8064 
 8065 instruct loadMask(legVec dst, legVec src) %{
 8066   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
 8067   match(Set dst (VectorLoadMask src));
 8068   effect(TEMP dst);
 8069   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8070   ins_encode %{
 8071     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8072     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8073     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8074   %}
 8075   ins_pipe( pipe_slow );
 8076 %}
 8077 
 8078 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8079   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8080   match(Set dst (VectorLoadMask src));
 8081   effect(TEMP xtmp);
 8082   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8083   ins_encode %{
 8084     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8085                         true, Assembler::AVX_512bit);
 8086   %}
 8087   ins_pipe( pipe_slow );
 8088 %}
 8089 
 8090 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8091   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8092   match(Set dst (VectorLoadMask src));
 8093   effect(TEMP xtmp);
 8094   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8095   ins_encode %{
 8096     int vlen_enc = vector_length_encoding(in(1));
 8097     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8098                         false, vlen_enc);
 8099   %}
 8100   ins_pipe( pipe_slow );
 8101 %}
 8102 
 8103 //------------------------------------- StoreMask --------------------------------------------
 8104 
 8105 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8106   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8107   match(Set dst (VectorStoreMask src size));
 8108   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8109   ins_encode %{
 8110     int vlen = Matcher::vector_length(this);
 8111     if (vlen <= 16 && UseAVX <= 2) {
 8112       assert(UseSSE >= 3, "required");
 8113       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8114     } else {
 8115       assert(UseAVX > 0, "required");
 8116       int src_vlen_enc = vector_length_encoding(this, $src);
 8117       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8118     }
 8119   %}
 8120   ins_pipe( pipe_slow );
 8121 %}
 8122 
 8123 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8124   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8125   match(Set dst (VectorStoreMask src size));
 8126   effect(TEMP_DEF dst, TEMP xtmp);
 8127   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8128   ins_encode %{
 8129     int vlen_enc = Assembler::AVX_128bit;
 8130     int vlen = Matcher::vector_length(this);
 8131     if (vlen <= 8) {
 8132       assert(UseSSE >= 3, "required");
 8133       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8134       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8135       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8136     } else {
 8137       assert(UseAVX > 0, "required");
 8138       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8139       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8140       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8141     }
 8142   %}
 8143   ins_pipe( pipe_slow );
 8144 %}
 8145 
 8146 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8147   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8148   match(Set dst (VectorStoreMask src size));
 8149   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8150   effect(TEMP_DEF dst, TEMP xtmp);
 8151   ins_encode %{
 8152     int vlen_enc = Assembler::AVX_128bit;
 8153     int vlen = Matcher::vector_length(this);
 8154     if (vlen <= 4) {
 8155       assert(UseSSE >= 3, "required");
 8156       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8157       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8158       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8159       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8160     } else {
 8161       assert(UseAVX > 0, "required");
 8162       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8163       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8164       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8165       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8166       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8167     }
 8168   %}
 8169   ins_pipe( pipe_slow );
 8170 %}
 8171 
 8172 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8173   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8174   match(Set dst (VectorStoreMask src size));
 8175   effect(TEMP_DEF dst, TEMP xtmp);
 8176   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8177   ins_encode %{
 8178     assert(UseSSE >= 3, "required");
 8179     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8180     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8181     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8182     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8183     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8184   %}
 8185   ins_pipe( pipe_slow );
 8186 %}
 8187 
 8188 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8189   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8190   match(Set dst (VectorStoreMask src size));
 8191   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8192   effect(TEMP_DEF dst, TEMP vtmp);
 8193   ins_encode %{
 8194     int vlen_enc = Assembler::AVX_128bit;
 8195     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8196     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8197     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8198     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8199     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8200     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8201     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8202   %}
 8203   ins_pipe( pipe_slow );
 8204 %}
 8205 
 8206 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8207   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8208   match(Set dst (VectorStoreMask src size));
 8209   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8210   ins_encode %{
 8211     int src_vlen_enc = vector_length_encoding(this, $src);
 8212     int dst_vlen_enc = vector_length_encoding(this);
 8213     if (!VM_Version::supports_avx512vl()) {
 8214       src_vlen_enc = Assembler::AVX_512bit;
 8215     }
 8216     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8217     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8218   %}
 8219   ins_pipe( pipe_slow );
 8220 %}
 8221 
 8222 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8223   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8224   match(Set dst (VectorStoreMask src size));
 8225   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8226   ins_encode %{
 8227     int src_vlen_enc = vector_length_encoding(this, $src);
 8228     int dst_vlen_enc = vector_length_encoding(this);
 8229     if (!VM_Version::supports_avx512vl()) {
 8230       src_vlen_enc = Assembler::AVX_512bit;
 8231     }
 8232     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8233     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8234   %}
 8235   ins_pipe( pipe_slow );
 8236 %}
 8237 
 8238 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8239   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8240   match(Set dst (VectorStoreMask mask size));
 8241   effect(TEMP_DEF dst);
 8242   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8243   ins_encode %{
 8244     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8245     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8246                  false, Assembler::AVX_512bit, noreg);
 8247     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8248   %}
 8249   ins_pipe( pipe_slow );
 8250 %}
 8251 
 8252 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8253   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8254   match(Set dst (VectorStoreMask mask size));
 8255   effect(TEMP_DEF dst);
 8256   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8257   ins_encode %{
 8258     int dst_vlen_enc = vector_length_encoding(this);
 8259     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8260     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8261   %}
 8262   ins_pipe( pipe_slow );
 8263 %}
 8264 
 8265 instruct vmaskcast_evex(kReg dst) %{
 8266   match(Set dst (VectorMaskCast dst));
 8267   ins_cost(0);
 8268   format %{ "vector_mask_cast $dst" %}
 8269   ins_encode %{
 8270     // empty
 8271   %}
 8272   ins_pipe(empty);
 8273 %}
 8274 
 8275 instruct vmaskcast(vec dst) %{
 8276   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8277   match(Set dst (VectorMaskCast dst));
 8278   ins_cost(0);
 8279   format %{ "vector_mask_cast $dst" %}
 8280   ins_encode %{
 8281     // empty
 8282   %}
 8283   ins_pipe(empty);
 8284 %}
 8285 
 8286 instruct vmaskcast_avx(vec dst, vec src) %{
 8287   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8288   match(Set dst (VectorMaskCast src));
 8289   format %{ "vector_mask_cast $dst, $src" %}
 8290   ins_encode %{
 8291     int vlen = Matcher::vector_length(this);
 8292     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8293     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8294     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8295   %}
 8296   ins_pipe(pipe_slow);
 8297 %}
 8298 
 8299 //-------------------------------- Load Iota Indices ----------------------------------
 8300 
 8301 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8302   match(Set dst (VectorLoadConst src));
 8303   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8304   ins_encode %{
 8305      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8306      BasicType bt = Matcher::vector_element_basic_type(this);
 8307      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8308   %}
 8309   ins_pipe( pipe_slow );
 8310 %}
 8311 
 8312 #ifdef _LP64
 8313 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8314   match(Set dst (PopulateIndex src1 src2));
 8315   effect(TEMP dst, TEMP vtmp);
 8316   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8317   ins_encode %{
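     // PopulateIndex with stride 1: lane i receives $src1 + i, built by
     // broadcasting the start value and adding the iota constant.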
 8318      assert($src2$$constant == 1, "required");
 8319      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8320      int vlen_enc = vector_length_encoding(this);
 8321      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8322      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8323      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8324      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8325   %}
 8326   ins_pipe( pipe_slow );
 8327 %}
 8328 
 8329 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8330   match(Set dst (PopulateIndex src1 src2));
 8331   effect(TEMP dst, TEMP vtmp);
 8332   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8333   ins_encode %{
 8334      assert($src2$$constant == 1, "required");
 8335      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8336      int vlen_enc = vector_length_encoding(this);
 8337      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8338      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8339      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8340      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8341   %}
 8342   ins_pipe( pipe_slow );
 8343 %}
 8344 #endif
 8345 //-------------------------------- Rearrange ----------------------------------
 8346 
 8347 // LoadShuffle/Rearrange for Byte
 8348 
 8349 instruct loadShuffleB(vec dst) %{
 8350   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8351   match(Set dst (VectorLoadShuffle dst));
 8352   format %{ "vector_load_shuffle $dst, $dst" %}
 8353   ins_encode %{
 8354     // empty
 8355   %}
 8356   ins_pipe( pipe_slow );
 8357 %}
 8358 
 8359 instruct rearrangeB(vec dst, vec shuffle) %{
 8360   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8361             Matcher::vector_length(n) < 32);
 8362   match(Set dst (VectorRearrange dst shuffle));
 8363   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8364   ins_encode %{
 8365     assert(UseSSE >= 4, "required");
 8366     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8367   %}
 8368   ins_pipe( pipe_slow );
 8369 %}
 8370 
 8371 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8372   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8373             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8374   match(Set dst (VectorRearrange src shuffle));
 8375   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8376   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8377   ins_encode %{
 8378     assert(UseAVX >= 2, "required");
 8379     // Swap src into vtmp1
 8380     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8381     // Shuffle swapped src to get entries from other 128 bit lane
 8382     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8383     // Shuffle original src to get entries from self 128 bit lane
 8384     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8385     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8386     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8387     // Perform the blend
 8388     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8389   %}
 8390   ins_pipe( pipe_slow );
 8391 %}
 8392 
 8393 
 8394 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8395   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8396             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8397   match(Set dst (VectorRearrange src shuffle));
 8398   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8399   format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8400   ins_encode %{
 8401     int vlen_enc = vector_length_encoding(this);
 8402     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8403                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8404                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8405   %}
 8406   ins_pipe( pipe_slow );
 8407 %}
 8408 
 8409 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8410   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8411             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8412   match(Set dst (VectorRearrange src shuffle));
 8413   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8414   ins_encode %{
 8415     int vlen_enc = vector_length_encoding(this);
 8416     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8417   %}
 8418   ins_pipe( pipe_slow );
 8419 %}
 8420 
 8421 // LoadShuffle/Rearrange for Short
 8422 
 8423 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8424   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8425             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8426   match(Set dst (VectorLoadShuffle src));
 8427   effect(TEMP dst, TEMP vtmp);
 8428   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8429   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8432     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8433     if (UseAVX == 0) {
 8434       assert(vlen_in_bytes <= 16, "required");
 8435       // Multiply each shuffle by two to get byte index
 8436       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8437       __ psllw($vtmp$$XMMRegister, 1);
 8438 
 8439       // Duplicate to create 2 copies of byte index
 8440       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8441       __ psllw($dst$$XMMRegister, 8);
 8442       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8443 
 8444       // Add one to get alternate byte index
 8445       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8446       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8447     } else {
 8448       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8449       int vlen_enc = vector_length_encoding(this);
 8450       // Multiply each shuffle by two to get byte index
 8451       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8452       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8453 
 8454       // Duplicate to create 2 copies of byte index
 8455       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8456       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8457 
 8458       // Add one to get alternate byte index
 8459       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8460     }
 8461   %}
 8462   ins_pipe( pipe_slow );
 8463 %}
 8464 
 8465 instruct rearrangeS(vec dst, vec shuffle) %{
 8466   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8467             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8468   match(Set dst (VectorRearrange dst shuffle));
 8469   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8470   ins_encode %{
 8471     assert(UseSSE >= 4, "required");
 8472     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8473   %}
 8474   ins_pipe( pipe_slow );
 8475 %}
 8476 
 8477 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8478   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8479             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8480   match(Set dst (VectorRearrange src shuffle));
 8481   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8482   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8483   ins_encode %{
 8484     assert(UseAVX >= 2, "required");
 8485     // Swap src into vtmp1
 8486     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8487     // Shuffle swapped src to get entries from other 128 bit lane
 8488     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8489     // Shuffle original src to get entries from self 128 bit lane
 8490     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8491     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8492     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8493     // Perform the blend
 8494     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8495   %}
 8496   ins_pipe( pipe_slow );
 8497 %}
 8498 
 8499 instruct loadShuffleS_evex(vec dst, vec src) %{
 8500   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8501             VM_Version::supports_avx512bw());
 8502   match(Set dst (VectorLoadShuffle src));
 8503   format %{ "vector_load_shuffle $dst, $src" %}
 8504   ins_encode %{
 8505     int vlen_enc = vector_length_encoding(this);
 8506     if (!VM_Version::supports_avx512vl()) {
 8507       vlen_enc = Assembler::AVX_512bit;
 8508     }
 8509     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8510   %}
 8511   ins_pipe( pipe_slow );
 8512 %}
 8513 
 8514 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8515   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8516             VM_Version::supports_avx512bw());
 8517   match(Set dst (VectorRearrange src shuffle));
 8518   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8519   ins_encode %{
 8520     int vlen_enc = vector_length_encoding(this);
 8521     if (!VM_Version::supports_avx512vl()) {
 8522       vlen_enc = Assembler::AVX_512bit;
 8523     }
 8524     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8525   %}
 8526   ins_pipe( pipe_slow );
 8527 %}
 8528 
 8529 // LoadShuffle/Rearrange for Integer and Float
 8530 
 8531 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8532   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8533             Matcher::vector_length(n) == 4 && UseAVX == 0);
 8534   match(Set dst (VectorLoadShuffle src));
 8535   effect(TEMP dst, TEMP vtmp);
 8536   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8537   ins_encode %{
 8538     assert(UseSSE >= 4, "required");
 8539 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
 8542 
 8543     // Duplicate and multiply each shuffle by 4
 8544     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8545     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8546     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8547     __ psllw($vtmp$$XMMRegister, 2);
 8548 
 8549     // Duplicate again to create 4 copies of byte index
 8550     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8551     __ psllw($dst$$XMMRegister, 8);
 8552     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8553 
 8554     // Add 3,2,1,0 to get alternate byte index
 8555     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8556     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8557   %}
 8558   ins_pipe( pipe_slow );
 8559 %}
 8560 
 8561 instruct rearrangeI(vec dst, vec shuffle) %{
 8562   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8563             UseAVX == 0);
 8564   match(Set dst (VectorRearrange dst shuffle));
 8565   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8566   ins_encode %{
 8567     assert(UseSSE >= 4, "required");
 8568     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8569   %}
 8570   ins_pipe( pipe_slow );
 8571 %}
 8572 
 8573 instruct loadShuffleI_avx(vec dst, vec src) %{
 8574   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8575             UseAVX > 0);
 8576   match(Set dst (VectorLoadShuffle src));
 8577   format %{ "vector_load_shuffle $dst, $src" %}
 8578   ins_encode %{
 8579     int vlen_enc = vector_length_encoding(this);
 8580     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8581   %}
 8582   ins_pipe( pipe_slow );
 8583 %}
 8584 
 8585 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8586   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8587             UseAVX > 0);
 8588   match(Set dst (VectorRearrange src shuffle));
 8589   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8590   ins_encode %{
 8591     int vlen_enc = vector_length_encoding(this);
 8592     BasicType bt = Matcher::vector_element_basic_type(this);
 8593     __ vector_rearrange_int_float(bt, $dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8594   %}
 8595   ins_pipe( pipe_slow );
 8596 %}
 8597 
 8598 // LoadShuffle/Rearrange for Long and Double
 8599 
 8600 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8601   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8602             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8603   match(Set dst (VectorLoadShuffle src));
 8604   effect(TEMP dst, TEMP vtmp);
 8605   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8606   ins_encode %{
 8607     assert(UseAVX >= 2, "required");
 8608 
 8609     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
 8612 
 8613     // Multiply each shuffle by two to get double word index
 8614     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8615     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8616 
 8617     // Duplicate each double word shuffle
 8618     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8619     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8620 
 8621     // Add one to get alternate double word index
 8622     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8623   %}
 8624   ins_pipe( pipe_slow );
 8625 %}
 8626 
 8627 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8628   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8629             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8630   match(Set dst (VectorRearrange src shuffle));
 8631   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8632   ins_encode %{
 8633     assert(UseAVX >= 2, "required");
 8634 
 8635     int vlen_enc = vector_length_encoding(this);
 8636     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8637   %}
 8638   ins_pipe( pipe_slow );
 8639 %}
 8640 
 8641 instruct loadShuffleL_evex(vec dst, vec src) %{
 8642   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8643             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8644   match(Set dst (VectorLoadShuffle src));
 8645   format %{ "vector_load_shuffle $dst, $src" %}
 8646   ins_encode %{
 8647     assert(UseAVX > 2, "required");
 8648 
 8649     int vlen_enc = vector_length_encoding(this);
 8650     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8651   %}
 8652   ins_pipe( pipe_slow );
 8653 %}
 8654 
 8655 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8656   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8657             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8658   match(Set dst (VectorRearrange src shuffle));
 8659   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8660   ins_encode %{
 8661     assert(UseAVX > 2, "required");
 8662 
 8663     int vlen_enc = vector_length_encoding(this);
 8664     if (vlen_enc == Assembler::AVX_128bit) {
 8665       vlen_enc = Assembler::AVX_256bit;
 8666     }
 8667     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8668   %}
 8669   ins_pipe( pipe_slow );
 8670 %}
 8671 
 8672 // --------------------------------- FMA --------------------------------------
 8673 // a * b + c
 8674 
 8675 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8676   match(Set c (FmaVF  c (Binary a b)));
 8677   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8678   ins_cost(150);
 8679   ins_encode %{
 8680     assert(UseFMA, "not enabled");
 8681     int vlen_enc = vector_length_encoding(this);
 8682     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8683   %}
 8684   ins_pipe( pipe_slow );
 8685 %}
 8686 
 8687 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8688   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8689   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8690   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8691   ins_cost(150);
 8692   ins_encode %{
 8693     assert(UseFMA, "not enabled");
 8694     int vlen_enc = vector_length_encoding(this);
 8695     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8696   %}
 8697   ins_pipe( pipe_slow );
 8698 %}
 8699 
 8700 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8701   match(Set c (FmaVD  c (Binary a b)));
 8702   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8703   ins_cost(150);
 8704   ins_encode %{
 8705     assert(UseFMA, "not enabled");
 8706     int vlen_enc = vector_length_encoding(this);
 8707     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8708   %}
 8709   ins_pipe( pipe_slow );
 8710 %}
 8711 
 8712 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8713   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8714   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8715   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8716   ins_cost(150);
 8717   ins_encode %{
 8718     assert(UseFMA, "not enabled");
 8719     int vlen_enc = vector_length_encoding(this);
 8720     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8721   %}
 8722   ins_pipe( pipe_slow );
 8723 %}
 8724 
 8725 // --------------------------------- Vector Multiply Add --------------------------------------
 8726 
 8727 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8728   predicate(UseAVX == 0);
 8729   match(Set dst (MulAddVS2VI dst src1));
 8730   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8731   ins_encode %{
 8732     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8733   %}
 8734   ins_pipe( pipe_slow );
 8735 %}
 8736 
 8737 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8738   predicate(UseAVX > 0);
 8739   match(Set dst (MulAddVS2VI src1 src2));
 8740   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8741   ins_encode %{
 8742     int vlen_enc = vector_length_encoding(this);
 8743     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8744   %}
 8745   ins_pipe( pipe_slow );
 8746 %}
 8747 
 8748 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8749 
 8750 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8751   predicate(VM_Version::supports_avx512_vnni());
 8752   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8753   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8754   ins_encode %{
 8755     assert(UseAVX > 2, "required");
 8756     int vlen_enc = vector_length_encoding(this);
 8757     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8758   %}
 8759   ins_pipe( pipe_slow );
 8760   ins_cost(10);
 8761 %}
 8762 
 8763 // --------------------------------- PopCount --------------------------------------
 8764 
 8765 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8766   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8767   match(Set dst (PopCountVI src));
 8768   match(Set dst (PopCountVL src));
 8769   format %{ "vector_popcount_integral $dst, $src" %}
 8770   ins_encode %{
 8771     int opcode = this->ideal_Opcode();
 8772     int vlen_enc = vector_length_encoding(this, $src);
 8773     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8774     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8775   %}
 8776   ins_pipe( pipe_slow );
 8777 %}
 8778 
 8779 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8780   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8781   match(Set dst (PopCountVI src mask));
 8782   match(Set dst (PopCountVL src mask));
 8783   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8784   ins_encode %{
 8785     int vlen_enc = vector_length_encoding(this, $src);
 8786     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8787     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8788     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8789   %}
 8790   ins_pipe( pipe_slow );
 8791 %}
 8792 
 8793 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8794   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8795   match(Set dst (PopCountVI src));
 8796   match(Set dst (PopCountVL src));
 8797   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8798   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8799   ins_encode %{
 8801     int vlen_enc = vector_length_encoding(this, $src);
 8802     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8803     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8804                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8805   %}
 8806   ins_pipe( pipe_slow );
 8807 %}
 8808 
 8809 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8810 
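      // There is no direct vector tzcnt instruction; the macro assembler helpers derive the
      // trailing-zero count from the leading-zero machinery (roughly: isolate the lowest set
      // bit and count from the other end), which is why the first rule reuses
      // is_clz_non_subword_predicate_evex and these rules need temporaries similar to the
      // CountLeadingZerosV rules further down.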
 8811 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8812   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8813                                               Matcher::vector_length_in_bytes(n->in(1))));
 8814   match(Set dst (CountTrailingZerosV src));
 8815   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8816   ins_cost(400);
 8817   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8818   ins_encode %{
 8819     int vlen_enc = vector_length_encoding(this, $src);
 8820     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8821     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8822                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8823   %}
 8824   ins_pipe( pipe_slow );
 8825 %}
 8826 
 8827 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8828   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8829             VM_Version::supports_avx512cd() &&
 8830             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8831   match(Set dst (CountTrailingZerosV src));
 8832   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8833   ins_cost(400);
 8834   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8835   ins_encode %{
 8836     int vlen_enc = vector_length_encoding(this, $src);
 8837     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8838     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8839                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8840   %}
 8841   ins_pipe( pipe_slow );
 8842 %}
 8843 
 8844 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8845   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8846   match(Set dst (CountTrailingZerosV src));
 8847   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8848   ins_cost(400);
 8849   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8850   ins_encode %{
 8851     int vlen_enc = vector_length_encoding(this, $src);
 8852     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8853     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8854                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8855                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8856   %}
 8857   ins_pipe( pipe_slow );
 8858 %}
 8859 
 8860 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8861   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8862   match(Set dst (CountTrailingZerosV src));
 8863   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8864   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8865   ins_encode %{
 8866     int vlen_enc = vector_length_encoding(this, $src);
 8867     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8868     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8869                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8870   %}
 8871   ins_pipe( pipe_slow );
 8872 %}
 8873 
 8874 
 8875 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8876 
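      // MacroLogicV is emitted as vpternlogd: the 8-bit immediate $func is a truth table
      // giving the result bit for every combination of the three input bits (dst, src2,
      // src3), so any three-input boolean function costs a single instruction.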
 8877 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8878   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8879   effect(TEMP dst);
 8880   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8881   ins_encode %{
 8882     int vector_len = vector_length_encoding(this);
 8883     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8884   %}
 8885   ins_pipe( pipe_slow );
 8886 %}
 8887 
 8888 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8889   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8890   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8891   effect(TEMP dst);
 8892   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8893   ins_encode %{
 8894     int vector_len = vector_length_encoding(this);
 8895     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8896   %}
 8897   ins_pipe( pipe_slow );
 8898 %}
 8899 
 8900 // --------------------------------- Rotation Operations ----------------------------------
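      // The vprotate_imm/vprotate_var helpers in the macro assembler use the native
      // VPROL/VPROR forms where the target supports them and otherwise compose the rotate
      // from shifts and an OR.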
 8901 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8902   match(Set dst (RotateLeftV src shift));
 8903   match(Set dst (RotateRightV src shift));
 8904   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8905   ins_encode %{
 8906     int opcode      = this->ideal_Opcode();
 8907     int vector_len  = vector_length_encoding(this);
 8908     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8909     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8910   %}
 8911   ins_pipe( pipe_slow );
 8912 %}
 8913 
 8914 instruct vprotate_var(vec dst, vec src, vec shift) %{
 8915   match(Set dst (RotateLeftV src shift));
 8916   match(Set dst (RotateRightV src shift));
 8917   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8918   ins_encode %{
 8919     int opcode      = this->ideal_Opcode();
 8920     int vector_len  = vector_length_encoding(this);
 8921     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8922     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8923   %}
 8924   ins_pipe( pipe_slow );
 8925 %}
 8926 
 8927 // ---------------------------------- Masked Operations ------------------------------------
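      // Masked loads/stores come in two flavors: the *_avx rules take the mask as an
      // ordinary vector of booleans (isa_vectmask() == NULL) and go through the vmovmask
      // helper, while the *_evex rules take a kReg opmask and use EVEX masked moves.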
 8928 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 8929   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 8930   match(Set dst (LoadVectorMasked mem mask));
 8931   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 8932   ins_encode %{
 8933     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8934     int vlen_enc = vector_length_encoding(this);
 8935     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 8936   %}
 8937   ins_pipe( pipe_slow );
 8938 %}
 8939 
 8940 
 8941 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 8942   predicate(n->in(3)->bottom_type()->isa_vectmask());
 8943   match(Set dst (LoadVectorMasked mem mask));
 8944   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 8945   ins_encode %{
 8946     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8947     int vector_len = vector_length_encoding(this);
 8948     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 8949   %}
 8950   ins_pipe( pipe_slow );
 8951 %}
 8952 
 8953 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 8954   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8955   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8956   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8957   ins_encode %{
 8958     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8959     int vlen_enc = vector_length_encoding(src_node);
 8960     BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 8961     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 8962   %}
 8963   ins_pipe( pipe_slow );
 8964 %}
 8965 
 8966 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 8967   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 8968   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8969   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8970   ins_encode %{
 8971     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 8972     BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 8973     int vlen_enc = vector_length_encoding(src_node);
 8974     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 8975   %}
 8976   ins_pipe( pipe_slow );
 8977 %}
 8978 
 8979 #ifdef _LP64
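      // VectorCmpMasked compares the active lanes of src1 and src2 for equality: kortest
      // sets CF when ~mask | eq_mask is all ones (every active lane matched), leaving -1 in
      // dst; otherwise dst becomes the index of the lowest lane whose equality bit is clear
      // (with a prefix mask that is the first mismatching element).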
 8980 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 8981   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 8982   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 8983   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 8984   ins_encode %{
 8985     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 8986     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 8987 
 8988     Label DONE;
 8989     int vlen_enc = vector_length_encoding(this, $src1);
 8990     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8991 
 8992     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 8993     __ mov64($dst$$Register, -1L);
 8994     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 8995     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 8996     __ jccb(Assembler::carrySet, DONE);
 8997     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 8998     __ notq($dst$$Register);
 8999     __ tzcntq($dst$$Register, $dst$$Register);
 9000     __ bind(DONE);
 9001   %}
 9002   ins_pipe( pipe_slow );
 9003 %}
 9004 
 9005 
 9006 instruct vmask_gen(kReg dst, rRegL len, rRegL temp, rFlagsReg cr) %{
 9007   match(Set dst (VectorMaskGen len));
 9008   effect(TEMP temp, KILL cr);
 9009   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9010   ins_encode %{
 9011     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9012   %}
 9013   ins_pipe( pipe_slow );
 9014 %}
 9015 
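      // Constant-length variant of VectorMaskGen: materialize the prefix mask (2^len - 1)
      // in a GPR and move it into the opmask register.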
 9016 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9017   match(Set dst (VectorMaskGen len));
 9018   format %{ "vector_mask_gen $dst, $len \t! vector mask generator" %}
 9019   effect(TEMP temp);
 9020   ins_encode %{
 9021     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9022     __ kmovql($dst$$KRegister, $temp$$Register);
 9023   %}
 9024   ins_pipe( pipe_slow );
 9025 %}
 9026 
 9027 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9028   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9029   match(Set dst (VectorMaskToLong mask));
 9030   effect(TEMP dst, KILL cr);
 9031   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9032   ins_encode %{
 9033     int opcode = this->ideal_Opcode();
 9034     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9035     int mask_len = Matcher::vector_length(this, $mask);
 9036     int mask_size = mask_len * type2aelembytes(mbt);
 9037     int vlen_enc = vector_length_encoding(this, $mask);
 9038     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9039                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9040   %}
 9041   ins_pipe( pipe_slow );
 9042 %}
 9043 
 9044 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9045   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9046   match(Set dst (VectorMaskToLong mask));
 9047   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9048   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9049   ins_encode %{
 9050     int opcode = this->ideal_Opcode();
 9051     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9052     int mask_len = Matcher::vector_length(this, $mask);
 9053     int vlen_enc = vector_length_encoding(this, $mask);
 9054     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9055                              $dst$$Register, mask_len, mbt, vlen_enc);
 9056   %}
 9057   ins_pipe( pipe_slow );
 9058 %}
 9059 
 9060 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9061   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9062   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9063   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9064   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9065   ins_encode %{
 9066     int opcode = this->ideal_Opcode();
 9067     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9068     int mask_len = Matcher::vector_length(this, $mask);
 9069     int vlen_enc = vector_length_encoding(this, $mask);
 9070     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9071                              $dst$$Register, mask_len, mbt, vlen_enc);
 9072   %}
 9073   ins_pipe( pipe_slow );
 9074 %}
 9075 
 9076 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9077   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9078   match(Set dst (VectorMaskTrueCount mask));
 9079   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9080   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9081   ins_encode %{
 9082     int opcode = this->ideal_Opcode();
 9083     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9084     int mask_len = Matcher::vector_length(this, $mask);
 9085     int mask_size = mask_len * type2aelembytes(mbt);
 9086     int vlen_enc = vector_length_encoding(this, $mask);
 9087     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9088                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9089   %}
 9090   ins_pipe( pipe_slow );
 9091 %}
 9092 
 9093 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9094   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9095   match(Set dst (VectorMaskTrueCount mask));
 9096   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9097   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9098   ins_encode %{
 9099     int opcode = this->ideal_Opcode();
 9100     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9101     int mask_len = Matcher::vector_length(this, $mask);
 9102     int vlen_enc = vector_length_encoding(this, $mask);
 9103     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9104                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9105   %}
 9106   ins_pipe( pipe_slow );
 9107 %}
 9108 
 9109 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9110   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9111   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9112   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9113   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9114   ins_encode %{
 9115     int opcode = this->ideal_Opcode();
 9116     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9117     int mask_len = Matcher::vector_length(this, $mask);
 9118     int vlen_enc = vector_length_encoding(this, $mask);
 9119     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9120                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9121   %}
 9122   ins_pipe( pipe_slow );
 9123 %}
 9124 
 9125 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9126   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9127   match(Set dst (VectorMaskFirstTrue mask));
 9128   match(Set dst (VectorMaskLastTrue mask));
 9129   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9130   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9131   ins_encode %{
 9132     int opcode = this->ideal_Opcode();
 9133     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9134     int mask_len = Matcher::vector_length(this, $mask);
 9135     int mask_size = mask_len * type2aelembytes(mbt);
 9136     int vlen_enc = vector_length_encoding(this, $mask);
 9137     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9138                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9139   %}
 9140   ins_pipe( pipe_slow );
 9141 %}
 9142 
 9143 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9144   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9145   match(Set dst (VectorMaskFirstTrue mask));
 9146   match(Set dst (VectorMaskLastTrue mask));
 9147   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9148   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9149   ins_encode %{
 9150     int opcode = this->ideal_Opcode();
 9151     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9152     int mask_len = Matcher::vector_length(this, $mask);
 9153     int vlen_enc = vector_length_encoding(this, $mask);
 9154     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9155                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9156   %}
 9157   ins_pipe( pipe_slow );
 9158 %}
 9159 
 9160 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9161   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9162   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9163   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9164   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9165   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9166   ins_encode %{
 9167     int opcode = this->ideal_Opcode();
 9168     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9169     int mask_len = Matcher::vector_length(this, $mask);
 9170     int vlen_enc = vector_length_encoding(this, $mask);
 9171     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9172                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9173   %}
 9174   ins_pipe( pipe_slow );
 9175 %}
 9176 
 9177 // --------------------------------- Compress/Expand Operations ---------------------------
 9178 
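      // CompressV/ExpandV lower to the EVEX compress/expand instructions through the
      // vector_compress_expand helper; subword element types additionally need
      // AVX512_VBMI2, which is gated by the matcher's supported-rule checks rather than
      // by a predicate on this rule.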
 9179 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9180   match(Set dst (CompressV src mask));
 9181   match(Set dst (ExpandV src mask));
 9182   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9183   ins_encode %{
 9184     int opcode = this->ideal_Opcode();
 9185     int vector_len = vector_length_encoding(this);
 9186     BasicType bt  = Matcher::vector_element_basic_type(this);
 9187     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9188   %}
 9189   ins_pipe( pipe_slow );
 9190 %}
 9191 
 9192 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9193   match(Set dst (CompressM mask));
 9194   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9195   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9196   ins_encode %{
 9197     assert(this->in(1)->bottom_type()->isa_vectmask(), "mask input expected");
 9198     int mask_len = Matcher::vector_length(this);
 9199     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9200   %}
 9201   ins_pipe( pipe_slow );
 9202 %}
 9203 
 9204 #endif // _LP64
 9205 
 9206 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9207 
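      // ReverseV reverses the bit order inside each element. With GFNI, vgf2p8affineqb
      // against the 0x8040201008040201 matrix (loaded below) reverses the bits of every
      // byte in one instruction, leaving only a per-element byte swap; without GFNI the
      // first rule falls back to a longer bit-manipulation sequence in the macro assembler,
      // hence the extra temporaries.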
 9208 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9209   predicate(!VM_Version::supports_gfni());
 9210   match(Set dst (ReverseV src));
 9211   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9212   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9213   ins_encode %{
 9214     int vec_enc = vector_length_encoding(this);
 9215     BasicType bt = Matcher::vector_element_basic_type(this);
 9216     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9217                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9218   %}
 9219   ins_pipe( pipe_slow );
 9220 %}
 9221 
 9222 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9223   predicate(VM_Version::supports_gfni());
 9224   match(Set dst (ReverseV src));
 9225   effect(TEMP dst, TEMP xtmp);
 9226   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9227   ins_encode %{
 9228     int vec_enc = vector_length_encoding(this);
 9229     BasicType bt  = Matcher::vector_element_basic_type(this);
 9230     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9231     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9232                                $xtmp$$XMMRegister);
 9233   %}
 9234   ins_pipe( pipe_slow );
 9235 %}
 9236 
 9237 instruct vreverse_byte_reg(vec dst, vec src) %{
 9238   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9239   match(Set dst (ReverseBytesV src));
 9240   effect(TEMP dst);
 9241   format %{ "vector_reverse_byte $dst, $src" %}
 9242   ins_encode %{
 9243     int vec_enc = vector_length_encoding(this);
 9244     BasicType bt = Matcher::vector_element_basic_type(this);
 9245     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9246   %}
 9247   ins_pipe( pipe_slow );
 9248 %}
 9249 
 9250 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9251   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9252   match(Set dst (ReverseBytesV src));
 9253   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9254   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9255   ins_encode %{
 9256     int vec_enc = vector_length_encoding(this);
 9257     BasicType bt = Matcher::vector_element_basic_type(this);
 9258     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9259                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9260   %}
 9261   ins_pipe( pipe_slow );
 9262 %}
 9263 
 9264 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9265 
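      // The EVEX rules rely on VPLZCNT{D,Q} (AVX512CD); the short and byte variants build on
      // the dword count with extra fix-up steps, and the AVX rules emulate the count entirely
      // in the macro assembler.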
 9266 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9267   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9268                                               Matcher::vector_length_in_bytes(n->in(1))));
 9269   match(Set dst (CountLeadingZerosV src));
 9270   format %{ "vector_count_leading_zeros $dst, $src" %}
 9271   ins_encode %{
 9272      int vlen_enc = vector_length_encoding(this, $src);
 9273      BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9274      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9275                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9276   %}
 9277   ins_pipe( pipe_slow );
 9278 %}
 9279 
 9280 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9281   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9282                                               Matcher::vector_length_in_bytes(n->in(1))));
 9283   match(Set dst (CountLeadingZerosV src mask));
 9284   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9285   ins_encode %{
 9286     int vlen_enc = vector_length_encoding(this, $src);
 9287     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9288     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9289     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9290                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9291   %}
 9292   ins_pipe( pipe_slow );
 9293 %}
 9294 
 9295 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9296   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9297             VM_Version::supports_avx512cd() &&
 9298             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9299   match(Set dst (CountLeadingZerosV src));
 9300   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9301   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9302   ins_encode %{
 9303     int vlen_enc = vector_length_encoding(this, $src);
 9304     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9305     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9306                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9307   %}
 9308   ins_pipe( pipe_slow );
 9309 %}
 9310 
 9311 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9312   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9313   match(Set dst (CountLeadingZerosV src));
 9314   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9315   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9316   ins_encode %{
 9317     int vlen_enc = vector_length_encoding(this, $src);
 9318     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9319     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9320                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9321                                        $rtmp$$Register, true, vlen_enc);
 9322   %}
 9323   ins_pipe( pipe_slow );
 9324 %}
 9325 
 9326 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9327   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9328             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9329   match(Set dst (CountLeadingZerosV src));
 9330   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9331   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9332   ins_encode %{
 9333     int vlen_enc = vector_length_encoding(this, $src);
 9334     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9335     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9336                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9337   %}
 9338   ins_pipe( pipe_slow );
 9339 %}
 9340 
 9341 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9342   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9343             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9344   match(Set dst (CountLeadingZerosV src));
 9345   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9346   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9347   ins_encode %{
 9348     int vlen_enc = vector_length_encoding(this, $src);
 9349     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9350     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9351                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9352   %}
 9353   ins_pipe( pipe_slow );
 9354 %}
 9355 
 9356 // ---------------------------------- Vector Masked Operations ------------------------------------
 9357 
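      // All of the masked arithmetic rules below funnel into evmasked_op, which emits the
      // EVEX merge-masking form of the operation: lanes whose mask bit is clear keep the
      // existing destination value (the 'true' argument selects merge rather than zero
      // masking; the rearrange rule passes 'false' to get zero masking instead).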
 9358 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9359   match(Set dst (AddVB (Binary dst src2) mask));
 9360   match(Set dst (AddVS (Binary dst src2) mask));
 9361   match(Set dst (AddVI (Binary dst src2) mask));
 9362   match(Set dst (AddVL (Binary dst src2) mask));
 9363   match(Set dst (AddVF (Binary dst src2) mask));
 9364   match(Set dst (AddVD (Binary dst src2) mask));
 9365   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9366   ins_encode %{
 9367     int vlen_enc = vector_length_encoding(this);
 9368     BasicType bt = Matcher::vector_element_basic_type(this);
 9369     int opc = this->ideal_Opcode();
 9370     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9371                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9372   %}
 9373   ins_pipe( pipe_slow );
 9374 %}
 9375 
 9376 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9377   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9378   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9379   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9380   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9381   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9382   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9383   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9384   ins_encode %{
 9385     int vlen_enc = vector_length_encoding(this);
 9386     BasicType bt = Matcher::vector_element_basic_type(this);
 9387     int opc = this->ideal_Opcode();
 9388     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9389                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9390   %}
 9391   ins_pipe( pipe_slow );
 9392 %}
 9393 
 9394 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9395   match(Set dst (XorV (Binary dst src2) mask));
 9396   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9397   ins_encode %{
 9398     int vlen_enc = vector_length_encoding(this);
 9399     BasicType bt = Matcher::vector_element_basic_type(this);
 9400     int opc = this->ideal_Opcode();
 9401     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9402                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9403   %}
 9404   ins_pipe( pipe_slow );
 9405 %}
 9406 
 9407 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9408   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9409   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9410   ins_encode %{
 9411     int vlen_enc = vector_length_encoding(this);
 9412     BasicType bt = Matcher::vector_element_basic_type(this);
 9413     int opc = this->ideal_Opcode();
 9414     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9415                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9416   %}
 9417   ins_pipe( pipe_slow );
 9418 %}
 9419 
 9420 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9421   match(Set dst (OrV (Binary dst src2) mask));
 9422   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9423   ins_encode %{
 9424     int vlen_enc = vector_length_encoding(this);
 9425     BasicType bt = Matcher::vector_element_basic_type(this);
 9426     int opc = this->ideal_Opcode();
 9427     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9428                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9429   %}
 9430   ins_pipe( pipe_slow );
 9431 %}
 9432 
 9433 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9434   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9435   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9436   ins_encode %{
 9437     int vlen_enc = vector_length_encoding(this);
 9438     BasicType bt = Matcher::vector_element_basic_type(this);
 9439     int opc = this->ideal_Opcode();
 9440     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9441                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9442   %}
 9443   ins_pipe( pipe_slow );
 9444 %}
 9445 
 9446 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9447   match(Set dst (AndV (Binary dst src2) mask));
 9448   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9449   ins_encode %{
 9450     int vlen_enc = vector_length_encoding(this);
 9451     BasicType bt = Matcher::vector_element_basic_type(this);
 9452     int opc = this->ideal_Opcode();
 9453     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9454                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9455   %}
 9456   ins_pipe( pipe_slow );
 9457 %}
 9458 
 9459 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9460   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9461   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9462   ins_encode %{
 9463     int vlen_enc = vector_length_encoding(this);
 9464     BasicType bt = Matcher::vector_element_basic_type(this);
 9465     int opc = this->ideal_Opcode();
 9466     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9467                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9468   %}
 9469   ins_pipe( pipe_slow );
 9470 %}
 9471 
 9472 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9473   match(Set dst (SubVB (Binary dst src2) mask));
 9474   match(Set dst (SubVS (Binary dst src2) mask));
 9475   match(Set dst (SubVI (Binary dst src2) mask));
 9476   match(Set dst (SubVL (Binary dst src2) mask));
 9477   match(Set dst (SubVF (Binary dst src2) mask));
 9478   match(Set dst (SubVD (Binary dst src2) mask));
 9479   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9480   ins_encode %{
 9481     int vlen_enc = vector_length_encoding(this);
 9482     BasicType bt = Matcher::vector_element_basic_type(this);
 9483     int opc = this->ideal_Opcode();
 9484     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9485                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9486   %}
 9487   ins_pipe( pipe_slow );
 9488 %}
 9489 
 9490 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9491   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9492   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9493   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9494   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9495   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9496   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9497   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9498   ins_encode %{
 9499     int vlen_enc = vector_length_encoding(this);
 9500     BasicType bt = Matcher::vector_element_basic_type(this);
 9501     int opc = this->ideal_Opcode();
 9502     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9503                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9504   %}
 9505   ins_pipe( pipe_slow );
 9506 %}
 9507 
 9508 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9509   match(Set dst (MulVS (Binary dst src2) mask));
 9510   match(Set dst (MulVI (Binary dst src2) mask));
 9511   match(Set dst (MulVL (Binary dst src2) mask));
 9512   match(Set dst (MulVF (Binary dst src2) mask));
 9513   match(Set dst (MulVD (Binary dst src2) mask));
 9514   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9515   ins_encode %{
 9516     int vlen_enc = vector_length_encoding(this);
 9517     BasicType bt = Matcher::vector_element_basic_type(this);
 9518     int opc = this->ideal_Opcode();
 9519     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9520                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9521   %}
 9522   ins_pipe( pipe_slow );
 9523 %}
 9524 
 9525 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9526   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9527   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9528   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9529   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9530   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9531   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9532   ins_encode %{
 9533     int vlen_enc = vector_length_encoding(this);
 9534     BasicType bt = Matcher::vector_element_basic_type(this);
 9535     int opc = this->ideal_Opcode();
 9536     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9537                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9538   %}
 9539   ins_pipe( pipe_slow );
 9540 %}
 9541 
 9542 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9543   match(Set dst (SqrtVF dst mask));
 9544   match(Set dst (SqrtVD dst mask));
 9545   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9546   ins_encode %{
 9547     int vlen_enc = vector_length_encoding(this);
 9548     BasicType bt = Matcher::vector_element_basic_type(this);
 9549     int opc = this->ideal_Opcode();
 9550     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9551                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9552   %}
 9553   ins_pipe( pipe_slow );
 9554 %}
 9555 
 9556 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9557   match(Set dst (DivVF (Binary dst src2) mask));
 9558   match(Set dst (DivVD (Binary dst src2) mask));
 9559   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9560   ins_encode %{
 9561     int vlen_enc = vector_length_encoding(this);
 9562     BasicType bt = Matcher::vector_element_basic_type(this);
 9563     int opc = this->ideal_Opcode();
 9564     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9565                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9566   %}
 9567   ins_pipe( pipe_slow );
 9568 %}
 9569 
 9570 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9571   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9572   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9573   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9574   ins_encode %{
 9575     int vlen_enc = vector_length_encoding(this);
 9576     BasicType bt = Matcher::vector_element_basic_type(this);
 9577     int opc = this->ideal_Opcode();
 9578     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9579                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9580   %}
 9581   ins_pipe( pipe_slow );
 9582 %}
 9583 
 9584 
 9585 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9586   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9587   match(Set dst (RotateRightV (Binary dst shift) mask));
 9588   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9589   ins_encode %{
 9590     int vlen_enc = vector_length_encoding(this);
 9591     BasicType bt = Matcher::vector_element_basic_type(this);
 9592     int opc = this->ideal_Opcode();
 9593     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9594                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9595   %}
 9596   ins_pipe( pipe_slow );
 9597 %}
 9598 
 9599 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9600   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9601   match(Set dst (RotateRightV (Binary dst src2) mask));
 9602   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9603   ins_encode %{
 9604     int vlen_enc = vector_length_encoding(this);
 9605     BasicType bt = Matcher::vector_element_basic_type(this);
 9606     int opc = this->ideal_Opcode();
 9607     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9608                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9609   %}
 9610   ins_pipe( pipe_slow );
 9611 %}
 9612 
 9613 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9614   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9615   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9616   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9617   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9618   ins_encode %{
 9619     int vlen_enc = vector_length_encoding(this);
 9620     BasicType bt = Matcher::vector_element_basic_type(this);
 9621     int opc = this->ideal_Opcode();
 9622     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9623                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9624   %}
 9625   ins_pipe( pipe_slow );
 9626 %}
 9627 
 9628 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9629   predicate(!n->as_ShiftV()->is_var_shift());
 9630   match(Set dst (LShiftVS (Binary dst src2) mask));
 9631   match(Set dst (LShiftVI (Binary dst src2) mask));
 9632   match(Set dst (LShiftVL (Binary dst src2) mask));
 9633   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9634   ins_encode %{
 9635     int vlen_enc = vector_length_encoding(this);
 9636     BasicType bt = Matcher::vector_element_basic_type(this);
 9637     int opc = this->ideal_Opcode();
 9638     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9639                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9640   %}
 9641   ins_pipe( pipe_slow );
 9642 %}
 9643 
 9644 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9645   predicate(n->as_ShiftV()->is_var_shift());
 9646   match(Set dst (LShiftVS (Binary dst src2) mask));
 9647   match(Set dst (LShiftVI (Binary dst src2) mask));
 9648   match(Set dst (LShiftVL (Binary dst src2) mask));
 9649   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9650   ins_encode %{
 9651     int vlen_enc = vector_length_encoding(this);
 9652     BasicType bt = Matcher::vector_element_basic_type(this);
 9653     int opc = this->ideal_Opcode();
 9654     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9655                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9656   %}
 9657   ins_pipe( pipe_slow );
 9658 %}
 9659 
 9660 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9661   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9662   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9663   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9664   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9665   ins_encode %{
 9666     int vlen_enc = vector_length_encoding(this);
 9667     BasicType bt = Matcher::vector_element_basic_type(this);
 9668     int opc = this->ideal_Opcode();
 9669     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9670                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9671   %}
 9672   ins_pipe( pipe_slow );
 9673 %}
 9674 
 9675 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9676   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9677   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9678   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9679   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9680   ins_encode %{
 9681     int vlen_enc = vector_length_encoding(this);
 9682     BasicType bt = Matcher::vector_element_basic_type(this);
 9683     int opc = this->ideal_Opcode();
 9684     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9685                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9686   %}
 9687   ins_pipe( pipe_slow );
 9688 %}
 9689 
 9690 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9691   predicate(!n->as_ShiftV()->is_var_shift());
 9692   match(Set dst (RShiftVS (Binary dst src2) mask));
 9693   match(Set dst (RShiftVI (Binary dst src2) mask));
 9694   match(Set dst (RShiftVL (Binary dst src2) mask));
 9695   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9696   ins_encode %{
 9697     int vlen_enc = vector_length_encoding(this);
 9698     BasicType bt = Matcher::vector_element_basic_type(this);
 9699     int opc = this->ideal_Opcode();
 9700     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9701                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9702   %}
 9703   ins_pipe( pipe_slow );
 9704 %}
 9705 
 9706 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9707   predicate(n->as_ShiftV()->is_var_shift());
 9708   match(Set dst (RShiftVS (Binary dst src2) mask));
 9709   match(Set dst (RShiftVI (Binary dst src2) mask));
 9710   match(Set dst (RShiftVL (Binary dst src2) mask));
 9711   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9712   ins_encode %{
 9713     int vlen_enc = vector_length_encoding(this);
 9714     BasicType bt = Matcher::vector_element_basic_type(this);
 9715     int opc = this->ideal_Opcode();
 9716     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9717                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9718   %}
 9719   ins_pipe( pipe_slow );
 9720 %}
 9721 
 9722 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9723   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9724   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9725   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9726   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9727   ins_encode %{
 9728     int vlen_enc = vector_length_encoding(this);
 9729     BasicType bt = Matcher::vector_element_basic_type(this);
 9730     int opc = this->ideal_Opcode();
 9731     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9732                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9733   %}
 9734   ins_pipe( pipe_slow );
 9735 %}
 9736 
 9737 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9738   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9739   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9740   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9741   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9742   ins_encode %{
 9743     int vlen_enc = vector_length_encoding(this);
 9744     BasicType bt = Matcher::vector_element_basic_type(this);
 9745     int opc = this->ideal_Opcode();
 9746     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9747                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9748   %}
 9749   ins_pipe( pipe_slow );
 9750 %}
 9751 
 9752 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9753   predicate(!n->as_ShiftV()->is_var_shift());
 9754   match(Set dst (URShiftVS (Binary dst src2) mask));
 9755   match(Set dst (URShiftVI (Binary dst src2) mask));
 9756   match(Set dst (URShiftVL (Binary dst src2) mask));
 9757   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9758   ins_encode %{
 9759     int vlen_enc = vector_length_encoding(this);
 9760     BasicType bt = Matcher::vector_element_basic_type(this);
 9761     int opc = this->ideal_Opcode();
 9762     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9763                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9764   %}
 9765   ins_pipe( pipe_slow );
 9766 %}
 9767 
 9768 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9769   predicate(n->as_ShiftV()->is_var_shift());
 9770   match(Set dst (URShiftVS (Binary dst src2) mask));
 9771   match(Set dst (URShiftVI (Binary dst src2) mask));
 9772   match(Set dst (URShiftVL (Binary dst src2) mask));
 9773   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9774   ins_encode %{
 9775     int vlen_enc = vector_length_encoding(this);
 9776     BasicType bt = Matcher::vector_element_basic_type(this);
 9777     int opc = this->ideal_Opcode();
 9778     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9779                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9780   %}
 9781   ins_pipe( pipe_slow );
 9782 %}
 9783 
 9784 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9785   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9786   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9787   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9788   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9789   ins_encode %{
 9790     int vlen_enc = vector_length_encoding(this);
 9791     BasicType bt = Matcher::vector_element_basic_type(this);
 9792     int opc = this->ideal_Opcode();
 9793     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9794                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9795   %}
 9796   ins_pipe( pipe_slow );
 9797 %}
 9798 
 9799 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
 9800   match(Set dst (MaxV (Binary dst src2) mask));
 9801   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9802   ins_encode %{
 9803     int vlen_enc = vector_length_encoding(this);
 9804     BasicType bt = Matcher::vector_element_basic_type(this);
 9805     int opc = this->ideal_Opcode();
 9806     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9807                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9808   %}
 9809   ins_pipe( pipe_slow );
 9810 %}
 9811 
 9812 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
 9813   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
 9814   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
 9815   ins_encode %{
 9816     int vlen_enc = vector_length_encoding(this);
 9817     BasicType bt = Matcher::vector_element_basic_type(this);
 9818     int opc = this->ideal_Opcode();
 9819     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9820                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9821   %}
 9822   ins_pipe( pipe_slow );
 9823 %}
 9824 
 9825 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
 9826   match(Set dst (MinV (Binary dst src2) mask));
 9827   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9828   ins_encode %{
 9829     int vlen_enc = vector_length_encoding(this);
 9830     BasicType bt = Matcher::vector_element_basic_type(this);
 9831     int opc = this->ideal_Opcode();
 9832     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9833                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9834   %}
 9835   ins_pipe( pipe_slow );
 9836 %}
 9837 
 9838 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
 9839   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
 9840   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
 9841   ins_encode %{
 9842     int vlen_enc = vector_length_encoding(this);
 9843     BasicType bt = Matcher::vector_element_basic_type(this);
 9844     int opc = this->ideal_Opcode();
 9845     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9846                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9847   %}
 9848   ins_pipe( pipe_slow );
 9849 %}
 9850 
 9851 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
 9852   match(Set dst (VectorRearrange (Binary dst src2) mask));
 9853   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
 9854   ins_encode %{
 9855     int vlen_enc = vector_length_encoding(this);
 9856     BasicType bt = Matcher::vector_element_basic_type(this);
 9857     int opc = this->ideal_Opcode();
 9858     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9859                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
 9860   %}
 9861   ins_pipe( pipe_slow );
 9862 %}
 9863 
 9864 instruct vabs_masked(vec dst, kReg mask) %{
 9865   match(Set dst (AbsVB dst mask));
 9866   match(Set dst (AbsVS dst mask));
 9867   match(Set dst (AbsVI dst mask));
 9868   match(Set dst (AbsVL dst mask));
 9869   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
 9870   ins_encode %{
 9871     int vlen_enc = vector_length_encoding(this);
 9872     BasicType bt = Matcher::vector_element_basic_type(this);
 9873     int opc = this->ideal_Opcode();
 9874     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9875                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9876   %}
 9877   ins_pipe( pipe_slow );
 9878 %}
 9879 
 9880 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
 9881   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
 9882   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
 9883   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9884   ins_encode %{
 9885     assert(UseFMA, "Needs FMA instruction support.");
 9886     int vlen_enc = vector_length_encoding(this);
 9887     BasicType bt = Matcher::vector_element_basic_type(this);
 9888     int opc = this->ideal_Opcode();
 9889     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9890                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
 9891   %}
 9892   ins_pipe( pipe_slow );
 9893 %}
 9894 
 9895 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
 9896   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
 9897   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
 9898   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
 9899   ins_encode %{
 9900     assert(UseFMA, "Needs FMA instruction support.");
 9901     int vlen_enc = vector_length_encoding(this);
 9902     BasicType bt = Matcher::vector_element_basic_type(this);
 9903     int opc = this->ideal_Opcode();
 9904     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9905                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
 9906   %}
 9907   ins_pipe( pipe_slow );
 9908 %}
 9909 
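// Masked vector comparison into an opmask register. The element type of src1
// selects the instruction: evpcmp{b,w,d,q} (signed or unsigned form, chosen
// from the bool-test predicate) for integral types, evcmp{ps,pd} for floating
// point. Lanes excluded by $mask produce a clear bit in $dst.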
 9910 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
 9911   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
 9912   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
 9913   ins_encode %{
 9914     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 9915     int vlen_enc = vector_length_encoding(this, $src1);
 9916     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9917 
 9918     // Select the comparison instruction based on the element type of src1.
 9919     switch (src1_elem_bt) {
 9920       case T_BYTE: {
 9921         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9922         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9923         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9924         break;
 9925       }
 9926       case T_SHORT: {
 9927         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9928         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9929         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9930         break;
 9931       }
 9932       case T_INT: {
 9933         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9934         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9935         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9936         break;
 9937       }
 9938       case T_LONG: {
 9939         bool is_unsigned = Matcher::is_unsigned_booltest_pred($cond$$constant);
 9940         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 9941         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 9942         break;
 9943       }
 9944       case T_FLOAT: {
 9945         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9946         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9947         break;
 9948       }
 9949       case T_DOUBLE: {
 9950         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 9951         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 9952         break;
 9953       }
 9954       default: assert(false, "%s", type2name(src1_elem_bt)); break;
 9955     }
 9956   %}
 9957   ins_pipe( pipe_slow );
 9958 %}
 9959 
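// MaskAll: replicate a scalar boolean condition into every lane of an opmask
// register. This rule covers mask lengths of at most 32 lanes.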
 9960 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
 9961   predicate(Matcher::vector_length(n) <= 32);
 9962   match(Set dst (MaskAll src));
 9963   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
 9964   ins_encode %{
 9965     int mask_len = Matcher::vector_length(this);
 9966     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
 9967   %}
 9968   ins_pipe( pipe_slow );
 9969 %}
 9970 
 9971 #ifdef _LP64
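// Mask negation, matched as XorVMask with an all-ones MaskAll. Lengths below
// 8 go through a variant that needs temporary registers; the other supported
// lengths map directly onto a knot of the appropriate width.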
 9972 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
 9973   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
 9974   match(Set dst (XorVMask src (MaskAll cnt)));
 9975   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
 9976   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
 9977   ins_encode %{
 9978     uint masklen = Matcher::vector_length(this);
 9979     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
 9980   %}
 9981   ins_pipe( pipe_slow );
 9982 %}
 9983 
 9984 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
 9985   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
 9986             (Matcher::vector_length(n) == 16) ||
 9987             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
 9988   match(Set dst (XorVMask src (MaskAll cnt)));
 9989   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
 9990   ins_encode %{
 9991     uint masklen = Matcher::vector_length(this);
 9992     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
 9993   %}
 9994   ins_pipe( pipe_slow );
 9995 %}
 9996 
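// VectorLongToMask: turn a long bit mask into either a boolean vector (AVX
// targets without a predicate type, the two rules below) or directly into an
// opmask register (EVEX targets, see long_to_mask_evex).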
 9997 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
 9998   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
 9999   match(Set dst (VectorLongToMask src));
10000   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10001   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10002   ins_encode %{
10003     int mask_len = Matcher::vector_length(this);
10004     int vec_enc  = vector_length_encoding(mask_len);
10005     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10006                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10007   %}
10008   ins_pipe( pipe_slow );
10009 %}
10010
10012 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10013   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
10014   match(Set dst (VectorLongToMask src));
10015   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10016   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10017   ins_encode %{
10018     int mask_len = Matcher::vector_length(this);
10019     assert(mask_len <= 32, "invalid mask length");
10020     int vec_enc  = vector_length_encoding(mask_len);
10021     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10022                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10023   %}
10024   ins_pipe( pipe_slow );
10025 %}
10026 
10027 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10028   predicate(n->bottom_type()->isa_vectmask());
10029   match(Set dst (VectorLongToMask src));
10030   format %{ "long_to_mask_evex $dst, $src\t!" %}
10031   ins_encode %{
10032     __ kmov($dst$$KRegister, $src$$Register);
10033   %}
10034   ins_pipe( pipe_slow );
10035 %}
10036 #endif
10037 
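// Bitwise AND/OR/XOR of opmask registers. Mask lengths below 16 are widened
// to 16 when AVX512DQ is unavailable, since the byte-granularity forms
// (kandb/korb/kxorb) require that extension.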
10038 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10039   match(Set dst (AndVMask src1 src2));
10040   match(Set dst (OrVMask src1 src2));
10041   match(Set dst (XorVMask src1 src2));
10042   effect(TEMP kscratch);
10043   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10044   ins_encode %{
10045     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10046     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10047     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "mask operands must have the same type");
10048     uint masklen = Matcher::vector_length(this);
10049     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10050     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10051   %}
10052   ins_pipe( pipe_slow );
10053 %}
10054 
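// Masked three-input bit logic (vpternlog). $func is the 8-bit truth-table
// immediate; merging is enabled, so lanes excluded by $mask keep the old
// destination value.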
10055 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10056   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10057   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10058   ins_encode %{
10059     int vlen_enc = vector_length_encoding(this);
10060     BasicType bt = Matcher::vector_element_basic_type(this);
10061     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10062                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10063   %}
10064   ins_pipe( pipe_slow );
10065 %}
10066 
10067 instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10068   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10069   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10070   ins_encode %{
10071     int vlen_enc = vector_length_encoding(this);
10072     BasicType bt = Matcher::vector_element_basic_type(this);
10073     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10074                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10075   %}
10076   ins_pipe( pipe_slow );
10077 %}
10078 
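// CastVV only adjusts the compiler's type information for a vector or mask
// value; no code is emitted (size 0, cost 0, empty encoding).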
10079 instruct castMM(kReg dst)
10080 %{
10081   match(Set dst (CastVV dst));
10082 
10083   size(0);
10084   format %{ "# castVV of $dst" %}
10085   ins_encode(/* empty encoding */);
10086   ins_cost(0);
10087   ins_pipe(empty);
10088 %}
10089 
10090 instruct castVV(vec dst)
10091 %{
10092   match(Set dst (CastVV dst));
10093 
10094   size(0);
10095   format %{ "# castVV of $dst" %}
10096   ins_encode(/* empty encoding */);
10097   ins_cost(0);
10098   ins_pipe(empty);
10099 %}
10100 
10101 instruct castVVLeg(legVec dst)
10102 %{
10103   match(Set dst (CastVV dst));
10104 
10105   size(0);
10106   format %{ "# castVV of $dst" %}
10107   ins_encode(/* empty encoding */);
10108   ins_cost(0);
10109   ins_pipe(empty);
10110 %}
10111 
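// IsInfinite checks via VFPCLASS. The 0x18 immediate selects the positive and
// negative infinity classes; the resulting mask bit is copied into a GPR.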
10112 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10113 %{
10114   match(Set dst (IsInfiniteF src));
10115   effect(TEMP ktmp, KILL cr);
10116   format %{ "float_class_check $dst, $src" %}
10117   ins_encode %{
10118     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10119     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10120   %}
10121   ins_pipe(pipe_slow);
10122 %}
10123 
10124 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10125 %{
10126   match(Set dst (IsInfiniteD src));
10127   effect(TEMP ktmp, KILL cr);
10128   format %{ "double_class_check $dst, $src" %}
10129   ins_encode %{
10130     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10131     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10132   %}
10133   ins_pipe(pipe_slow);
10134 %}