//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

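// As a worked reading of this grammar, the first XMM definition below,
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// declares a slot that is Save-On-Call for both the allocator and the C
// calling convention, has ideal register type Op_RegF, uses encoding 0 in
// opcodes, and is backed by the first 32-bit VMReg slot of xmm0.
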
// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No XMM register is preserved across function calls;
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile); XMM0-XMM3 might hold parameters
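//
// For example, a Float allocated to xmm0 occupies slot XMM0 below, a
// Double occupies the pair (XMM0, XMM0b), and a full 512-bit vector spans
// all sixteen slots XMM0 through XMM0p.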

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64
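
// Note: the flags register is given the encoding that follows the
// general-purpose registers (16 in 64-bit mode, 8 in 32-bit mode), and
// VMRegImpl::Bad() marks it as having no corresponding VMReg slot to be
// saved or restored through.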

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
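
// Each 64-bit AVX-512 opmask register is modeled as a pair of 32-bit
// halves (Kn, Kn_H), matching the VMReg slot granularity used for the XMM
// definitions above. K0 is omitted, presumably because opmask encoding 0
// means "no masking" in EVEX instructions, so k0 cannot serve as an
// allocatable predicate.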


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// The flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
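
// A reg_class_dynamic resolves to one of its two underlying classes by
// evaluating the trailing C++ predicate: float_reg above becomes
// float_reg_evex when the VM reports EVEX support and float_reg_legacy
// otherwise. The double and vector classes below follow the same pattern.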

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
  964 
  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
  986 // Class for all 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
 1047 // Class for all 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
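// A reg_class_dynamic resolves to its EVEX variant only when the guarding
// predicate holds at runtime; otherwise the register allocator is restricted
// to the legacy class (XMM0-XMM15).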
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
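// For illustration: vector_length_encoding(32) yields Assembler::AVX_256bit,
// while 4- and 8-byte requests map to AVX_128bit, since XMM is the narrowest
// SIMD width encoded here.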
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_unsigned_booltest_pred(int bt) {
 1250   return  ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
 1251 }
 1252 
 1253 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1254   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1255            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1256 }
 1257 
 1258 class Node::PD {
 1259 public:
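  // Platform-dependent node flags are allocated directly above the shared
  // Node flags: Flag_intel_jcc_erratum marks machnodes that may need extra
  // padding to mitigate the Intel JCC erratum (see c2_intelJccErratum_x86.hpp).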
 1260   enum NodeFlags {
 1261     Flag_intel_jcc_erratum = Node::_last_flag << 1,
 1262     _last_flag             = Flag_intel_jcc_erratum
 1263   };
 1264 };
 1265 
 1266 %} // end source_hpp
 1267 
 1268 source %{
 1269 
 1270 #include "opto/addnode.hpp"
 1271 #include "c2_intelJccErratum_x86.hpp"
 1272 
 1273 void PhaseOutput::pd_perform_mach_node_analysis() {
 1274   if (VM_Version::has_intel_jcc_erratum()) {
 1275     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1276     _buf_sizes._code += extra_padding;
 1277   }
 1278 }
 1279 
 1280 int MachNode::pd_alignment_required() const {
 1281   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1282     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1283     return IntelJccErratum::largest_jcc_size() + 1;
 1284   } else {
 1285     return 1;
 1286   }
 1287 }
 1288 
 1289 int MachNode::compute_padding(int current_offset) const {
 1290   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1291     Compile* C = Compile::current();
 1292     PhaseOutput* output = C->output();
 1293     Block* block = output->block();
 1294     int index = output->index();
 1295     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1296   } else {
 1297     return 0;
 1298   }
 1299 }
 1300 
 1301 // Emit exception handler code.
// The handler simply jumps to the runtime's exception blob entry point.
 1303 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1304 
 1305   // Note that the code buffer's insts_mark is always relative to insts.
 1306   // That's why we must use the macroassembler to generate a handler.
 1307   C2_MacroAssembler _masm(&cbuf);
 1308   address base = __ start_a_stub(size_exception_handler());
 1309   if (base == NULL) {
 1310     ciEnv::current()->record_failure("CodeCache is full");
 1311     return 0;  // CodeBuffer::expand failed
 1312   }
 1313   int offset = __ offset();
 1314   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1315   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1316   __ end_a_stub();
 1317   return offset;
 1318 }
 1319 
 1320 // Emit deopt handler code.
 1321 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1322 
 1323   // Note that the code buffer's insts_mark is always relative to insts.
 1324   // That's why we must use the macroassembler to generate a handler.
 1325   C2_MacroAssembler _masm(&cbuf);
 1326   address base = __ start_a_stub(size_deopt_handler());
 1327   if (base == NULL) {
 1328     ciEnv::current()->record_failure("CodeCache is full");
 1329     return 0;  // CodeBuffer::expand failed
 1330   }
 1331   int offset = __ offset();
 1332 
 1333 #ifdef _LP64
 1334   address the_pc = (address) __ pc();
 1335   Label next;
  // Push "the_pc" on the stack without destroying any registers,
  // as they may all be live.
 1338 
 1339   // push address of "next"
 1340   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1341   __ bind(next);
 1342   // adjust it so it matches "the_pc"
 1343   __ subptr(Address(rsp, 0), __ offset() - offset);
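  // The stack slot now holds "the_pc": the call pushed the address of "next",
  // and the subtraction rewinds it by the bytes emitted since the handler
  // start (i.e. the size of the call instruction itself).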
 1344 #else
 1345   InternalAddress here(__ pc());
 1346   __ pushptr(here.addr(), noreg);
 1347 #endif
 1348 
 1349   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1350   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1351   __ end_a_stub();
 1352   return offset;
 1353 }
 1354 
 1355 Assembler::Width widthForType(BasicType bt) {
 1356   if (bt == T_BYTE) {
 1357     return Assembler::B;
 1358   } else if (bt == T_SHORT) {
 1359     return Assembler::W;
 1360   } else if (bt == T_INT) {
 1361     return Assembler::D;
 1362   } else {
 1363     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1364     return Assembler::Q;
 1365   }
 1366 }
 1367 
 1368 //=============================================================================
 1369 
 1370   // Float masks come from different places depending on platform.
 1371 #ifdef _LP64
 1372   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1373   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1374   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1375   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1376 #else
 1377   static address float_signmask()  { return (address)float_signmask_pool; }
 1378   static address float_signflip()  { return (address)float_signflip_pool; }
 1379   static address double_signmask() { return (address)double_signmask_pool; }
 1380   static address double_signflip() { return (address)double_signflip_pool; }
 1381 #endif
 1382   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1383   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1384   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1385   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1386   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1387   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1388   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1389   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1390   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1391   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1392   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1393   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1394   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1395   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1396   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1397 
 1398 //=============================================================================
 1399 const bool Matcher::match_rule_supported(int opcode) {
 1400   if (!has_match_rule(opcode)) {
 1401     return false; // no match rule present
 1402   }
 1403   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1404   switch (opcode) {
 1405     case Op_AbsVL:
 1406     case Op_StoreVectorScatter:
 1407       if (UseAVX < 3) {
 1408         return false;
 1409       }
 1410       break;
 1411     case Op_PopCountI:
 1412     case Op_PopCountL:
 1413       if (!UsePopCountInstruction) {
 1414         return false;
 1415       }
 1416       break;
 1417     case Op_PopCountVI:
 1418       if (UseAVX < 2) {
 1419         return false;
 1420       }
 1421       break;
 1422     case Op_PopCountVL:
 1423       if (UseAVX < 2) {
 1424         return false;
 1425       }
 1426       break;
 1427     case Op_MulVI:
 1428       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1429         return false;
 1430       }
 1431       break;
 1432     case Op_MulVL:
 1433       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1434         return false;
 1435       }
 1436       break;
 1437     case Op_MulReductionVL:
      if (!VM_Version::supports_avx512dq()) {
 1439         return false;
 1440       }
 1441       break;
 1442     case Op_AddReductionVL:
 1443       if (UseSSE < 2) { // requires at least SSE2
 1444         return false;
 1445       }
 1446       break;
 1447     case Op_AbsVB:
 1448     case Op_AbsVS:
 1449     case Op_AbsVI:
 1450     case Op_AddReductionVI:
 1451     case Op_AndReductionV:
 1452     case Op_OrReductionV:
 1453     case Op_XorReductionV:
 1454       if (UseSSE < 3) { // requires at least SSSE3
 1455         return false;
 1456       }
 1457       break;
 1458     case Op_VectorLoadShuffle:
 1459     case Op_VectorRearrange:
 1460     case Op_MulReductionVI:
 1461       if (UseSSE < 4) { // requires at least SSE4
 1462         return false;
 1463       }
 1464       break;
 1465     case Op_IsInfiniteF:
 1466     case Op_IsInfiniteD:
 1467       if (!VM_Version::supports_avx512dq()) {
 1468         return false;
 1469       }
 1470       break;
 1471     case Op_SqrtVD:
 1472     case Op_SqrtVF:
 1473     case Op_VectorMaskCmp:
 1474     case Op_VectorCastB2X:
 1475     case Op_VectorCastS2X:
 1476     case Op_VectorCastI2X:
 1477     case Op_VectorCastL2X:
 1478     case Op_VectorCastF2X:
 1479     case Op_VectorCastD2X:
 1480     case Op_VectorUCastB2X:
 1481     case Op_VectorUCastS2X:
 1482     case Op_VectorUCastI2X:
 1483     case Op_VectorMaskCast:
 1484       if (UseAVX < 1) { // enabled for AVX only
 1485         return false;
 1486       }
 1487       break;
 1488     case Op_PopulateIndex:
 1489       if (!is_LP64 || (UseAVX < 2)) {
 1490         return false;
 1491       }
 1492       break;
 1493     case Op_RoundVF:
 1494       if (UseAVX < 2) { // enabled for AVX2 only
 1495         return false;
 1496       }
 1497       break;
 1498     case Op_RoundVD:
 1499       if (UseAVX < 3) {
 1500         return false;  // enabled for AVX3 only
 1501       }
 1502       break;
 1503     case Op_CompareAndSwapL:
 1504 #ifdef _LP64
 1505     case Op_CompareAndSwapP:
 1506 #endif
 1507       if (!VM_Version::supports_cx8()) {
 1508         return false;
 1509       }
 1510       break;
 1511     case Op_CMoveVF:
 1512     case Op_CMoveVD:
 1513       if (UseAVX < 1) { // enabled for AVX only
 1514         return false;
 1515       }
 1516       break;
 1517     case Op_StrIndexOf:
 1518       if (!UseSSE42Intrinsics) {
 1519         return false;
 1520       }
 1521       break;
 1522     case Op_StrIndexOfChar:
 1523       if (!UseSSE42Intrinsics) {
 1524         return false;
 1525       }
 1526       break;
 1527     case Op_OnSpinWait:
      if (!VM_Version::supports_on_spin_wait()) {
 1529         return false;
 1530       }
 1531       break;
 1532     case Op_MulVB:
 1533     case Op_LShiftVB:
 1534     case Op_RShiftVB:
 1535     case Op_URShiftVB:
 1536     case Op_VectorInsert:
 1537     case Op_VectorLoadMask:
 1538     case Op_VectorStoreMask:
 1539     case Op_VectorBlend:
 1540       if (UseSSE < 4) {
 1541         return false;
 1542       }
 1543       break;
 1544 #ifdef _LP64
 1545     case Op_MaxD:
 1546     case Op_MaxF:
 1547     case Op_MinD:
 1548     case Op_MinF:
 1549       if (UseAVX < 1) { // enabled for AVX only
 1550         return false;
 1551       }
 1552       break;
 1553 #endif
 1554     case Op_CacheWB:
 1555     case Op_CacheWBPreSync:
 1556     case Op_CacheWBPostSync:
 1557       if (!VM_Version::supports_data_cache_line_flush()) {
 1558         return false;
 1559       }
 1560       break;
 1561     case Op_ExtractB:
 1562     case Op_ExtractL:
 1563     case Op_ExtractI:
 1564     case Op_RoundDoubleMode:
 1565       if (UseSSE < 4) {
 1566         return false;
 1567       }
 1568       break;
 1569     case Op_RoundDoubleModeV:
      if (!VM_Version::supports_avx()) {
 1571         return false; // 128bit vroundpd is not available
 1572       }
 1573       break;
 1574     case Op_LoadVectorGather:
 1575       if (UseAVX < 2) {
 1576         return false;
 1577       }
 1578       break;
 1579     case Op_FmaVD:
 1580     case Op_FmaVF:
 1581       if (!UseFMA) {
 1582         return false;
 1583       }
 1584       break;
 1585     case Op_MacroLogicV:
 1586       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1587         return false;
 1588       }
 1589       break;
 1590 
 1591     case Op_VectorCmpMasked:
 1592     case Op_VectorMaskGen:
 1593       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1594         return false;
 1595       }
 1596       break;
 1597     case Op_VectorMaskFirstTrue:
 1598     case Op_VectorMaskLastTrue:
 1599     case Op_VectorMaskTrueCount:
 1600     case Op_VectorMaskToLong:
 1601       if (!is_LP64 || UseAVX < 1) {
 1602          return false;
 1603       }
 1604       break;
 1605     case Op_RoundF:
 1606     case Op_RoundD:
 1607       if (!is_LP64) {
 1608         return false;
 1609       }
 1610       break;
 1611     case Op_CopySignD:
 1612     case Op_CopySignF:
 1613       if (UseAVX < 3 || !is_LP64)  {
 1614         return false;
 1615       }
 1616       if (!VM_Version::supports_avx512vl()) {
 1617         return false;
 1618       }
 1619       break;
 1620 #ifndef _LP64
 1621     case Op_AddReductionVF:
 1622     case Op_AddReductionVD:
 1623     case Op_MulReductionVF:
 1624     case Op_MulReductionVD:
 1625       if (UseSSE < 1) { // requires at least SSE
 1626         return false;
 1627       }
 1628       break;
 1629     case Op_MulAddVS2VI:
 1630     case Op_RShiftVL:
 1631     case Op_AbsVD:
 1632     case Op_NegVD:
 1633       if (UseSSE < 2) {
 1634         return false;
 1635       }
 1636       break;
 1637 #endif // !LP64
 1638     case Op_CompressBits:
 1639       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_ExpandBits:
 1644       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1645         return false;
 1646       }
 1647       break;
 1648     case Op_SignumF:
 1649       if (UseSSE < 1) {
 1650         return false;
 1651       }
 1652       break;
 1653     case Op_SignumD:
 1654       if (UseSSE < 2) {
 1655         return false;
 1656       }
 1657       break;
 1658     case Op_CompressM:
 1659       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1660         return false;
 1661       }
 1662       break;
 1663     case Op_CompressV:
 1664     case Op_ExpandV:
 1665       if (!VM_Version::supports_avx512vl()) {
 1666         return false;
 1667       }
 1668       break;
 1669     case Op_SqrtF:
 1670       if (UseSSE < 1) {
 1671         return false;
 1672       }
 1673       break;
 1674     case Op_SqrtD:
 1675 #ifdef _LP64
 1676       if (UseSSE < 2) {
 1677         return false;
 1678       }
 1679 #else
 1680       // x86_32.ad has a special match rule for SqrtD.
 1681       // Together with common x86 rules, this handles all UseSSE cases.
 1682 #endif
 1683       break;
 1684     case Op_ConvF2HF:
 1685     case Op_ConvHF2F:
 1686       if (!VM_Version::supports_f16c() && !VM_Version::supports_avx512vl()) {
 1687         return false;
 1688       }
 1689       break;
 1690     case Op_VectorCastF2HF:
 1691     case Op_VectorCastHF2F:
 1692       if (!VM_Version::supports_f16c() && !VM_Version::supports_evex()) {
 1693         return false;
 1694       }
 1695       break;
 1696   }
 1697   return true;  // Match rules are supported by default.
 1698 }
 1699 
 1700 //------------------------------------------------------------------------
 1701 
 1702 static inline bool is_pop_count_instr_target(BasicType bt) {
 1703   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1704          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1705 }
 1706 
 1707 const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1708   return match_rule_supported_vector(opcode, vlen, bt);
 1709 }
 1710 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics guarded with vector length (vlen) and element type (bt).
 1713 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1714   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1715   if (!match_rule_supported(opcode)) {
 1716     return false;
 1717   }
 1718   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1719   //   * SSE2 supports 128bit vectors for all types;
 1720   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1721   //   * AVX2 supports 256bit vectors for all types;
 1722   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1723   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1724   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1725   // And MaxVectorSize is taken into account as well.
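  // For example, a 64-element BYTE vector (512 bits) passes this check only
  // when AVX512BW is available and MaxVectorSize admits 64-byte vectors.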
 1726   if (!vector_size_supported(bt, vlen)) {
 1727     return false;
 1728   }
 1729   // Special cases which require vector length follow:
 1730   //   * implementation limitations
 1731   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1732   //   * 128bit vroundpd instruction is present only in AVX1
 1733   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
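  // For illustration: vlen == 8 with bt == T_INT gives 8 * 4 * 8 == 256 bits.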
 1734   switch (opcode) {
 1735     case Op_AbsVF:
 1736     case Op_NegVF:
      if (vlen == 16 && !VM_Version::supports_avx512dq()) {
 1738         return false; // 512bit vandps and vxorps are not available
 1739       }
 1740       break;
 1741     case Op_AbsVD:
 1742     case Op_NegVD:
      if (vlen == 8 && !VM_Version::supports_avx512dq()) {
 1744         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1745       }
 1746       break;
 1747     case Op_CMoveVF:
 1748       if (vlen != 8) {
 1749         return false; // implementation limitation (only vcmov8F_reg is present)
 1750       }
 1751       break;
 1752     case Op_RotateRightV:
 1753     case Op_RotateLeftV:
 1754       if (bt != T_INT && bt != T_LONG) {
 1755         return false;
 1756       } // fallthrough
 1757     case Op_MacroLogicV:
 1758       if (!VM_Version::supports_evex() ||
 1759           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1760         return false;
 1761       }
 1762       break;
 1763     case Op_ClearArray:
 1764     case Op_VectorMaskGen:
 1765     case Op_VectorCmpMasked:
 1766       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1767         return false;
 1768       }
 1769       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1770         return false;
 1771       }
 1772       break;
 1773     case Op_LoadVectorMasked:
 1774     case Op_StoreVectorMasked:
 1775       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1776         return false;
 1777       }
 1778       break;
 1779     case Op_CMoveVD:
 1780       if (vlen != 4) {
 1781         return false; // implementation limitation (only vcmov4D_reg is present)
 1782       }
 1783       break;
 1784     case Op_MaxV:
 1785     case Op_MinV:
 1786       if (UseSSE < 4 && is_integral_type(bt)) {
 1787         return false;
 1788       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for the AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        // 512-bit Float/Double intrinsics need AVX512DQ.
        if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) {
          return false;
        }
      }
 1798       break;
 1799     case Op_CallLeafVector:
 1800       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1801         return false;
 1802       }
 1803       break;
 1804     case Op_AddReductionVI:
 1805       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1806         return false;
 1807       }
 1808       // fallthrough
 1809     case Op_AndReductionV:
 1810     case Op_OrReductionV:
 1811     case Op_XorReductionV:
 1812       if (is_subword_type(bt) && (UseSSE < 4)) {
 1813         return false;
 1814       }
 1815 #ifndef _LP64
 1816       if (bt == T_BYTE || bt == T_LONG) {
 1817         return false;
 1818       }
 1819 #endif
 1820       break;
 1821 #ifndef _LP64
 1822     case Op_VectorInsert:
 1823       if (bt == T_LONG || bt == T_DOUBLE) {
 1824         return false;
 1825       }
 1826       break;
 1827 #endif
 1828     case Op_MinReductionV:
 1829     case Op_MaxReductionV:
 1830       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1831         return false;
 1832       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1833         return false;
 1834       }
 1835       // Float/Double intrinsics enabled for AVX family.
 1836       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1837         return false;
 1838       }
 1839       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1840         return false;
 1841       }
 1842 #ifndef _LP64
 1843       if (bt == T_BYTE || bt == T_LONG) {
 1844         return false;
 1845       }
 1846 #endif
 1847       break;
 1848     case Op_VectorTest:
 1849       if (UseSSE < 4) {
 1850         return false; // Implementation limitation
 1851       } else if (size_in_bits < 32) {
 1852         return false; // Implementation limitation
 1853       }
 1854       break;
 1855     case Op_VectorLoadShuffle:
 1856     case Op_VectorRearrange:
      if (vlen == 2) {
 1858         return false; // Implementation limitation due to how shuffle is loaded
 1859       } else if (size_in_bits == 256 && UseAVX < 2) {
 1860         return false; // Implementation limitation
 1861       }
 1862       break;
 1863     case Op_VectorLoadMask:
 1864     case Op_VectorMaskCast:
 1865       if (size_in_bits == 256 && UseAVX < 2) {
 1866         return false; // Implementation limitation
 1867       }
 1868       // fallthrough
 1869     case Op_VectorStoreMask:
 1870       if (vlen == 2) {
 1871         return false; // Implementation limitation
 1872       }
 1873       break;
 1874     case Op_PopulateIndex:
 1875       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1876         return false;
 1877       }
 1878       break;
 1879     case Op_VectorCastB2X:
 1880     case Op_VectorCastS2X:
 1881     case Op_VectorCastI2X:
 1882       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1883         return false;
 1884       }
 1885       break;
 1886     case Op_VectorCastL2X:
 1887       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1888         return false;
 1889       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1890         return false;
 1891       }
 1892       break;
 1893     case Op_VectorCastF2X: {
        // As per JLS section 5.1.3, narrowing conversions to sub-word types
        // happen after an intermediate conversion to integer, and the special
        // handling code needs the AVX2 vpcmpeqd instruction for 256-bit vectors.
 1897         int src_size_in_bits = type2aelembytes(T_FLOAT) * vlen * BitsPerByte;
 1898         if (is_integral_type(bt) && src_size_in_bits == 256 && UseAVX < 2) {
 1899           return false;
 1900         }
 1901       }
 1902       // fallthrough
 1903     case Op_VectorCastD2X:
 1904       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1905         return false;
 1906       }
 1907       break;
 1908     case Op_VectorCastF2HF:
 1909     case Op_VectorCastHF2F:
      if (!VM_Version::supports_f16c() &&
          (!VM_Version::supports_evex() ||
           (size_in_bits != 512 && !VM_Version::supports_avx512vl()))) {
 1913         return false;
 1914       }
 1915       break;
 1916     case Op_RoundVD:
 1917       if (!VM_Version::supports_avx512dq()) {
 1918         return false;
 1919       }
 1920       break;
 1921     case Op_MulReductionVI:
 1922       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1923         return false;
 1924       }
 1925       break;
 1926     case Op_LoadVectorGatherMasked:
 1927     case Op_StoreVectorScatterMasked:
 1928     case Op_StoreVectorScatter:
 1929       if (is_subword_type(bt)) {
 1930         return false;
 1931       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1932         return false;
 1933       }
 1934       // fallthrough
 1935     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1937         return false;
 1938       }
 1939       break;
 1940     case Op_MaskAll:
 1941       if (!VM_Version::supports_evex()) {
 1942         return false;
 1943       }
 1944       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1945         return false;
 1946       }
 1947       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1948         return false;
 1949       }
 1950       break;
 1951     case Op_VectorMaskCmp:
 1952       if (vlen < 2 || size_in_bits < 32) {
 1953         return false;
 1954       }
 1955       break;
 1956     case Op_CompressM:
 1957       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1958         return false;
 1959       }
 1960       break;
 1961     case Op_CompressV:
 1962     case Op_ExpandV:
 1963       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1964         return false;
 1965       }
      if (size_in_bits < 128) {
 1967         return false;
 1968       }
 1969       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1970         return false;
 1971       }
 1972       break;
 1973     case Op_VectorLongToMask:
 1974       if (UseAVX < 1 || !is_LP64) {
 1975         return false;
 1976       }
 1977       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1978         return false;
 1979       }
 1980       break;
 1981     case Op_SignumVD:
 1982     case Op_SignumVF:
 1983       if (UseAVX < 1) {
 1984         return false;
 1985       }
 1986       break;
 1987     case Op_PopCountVI:
 1988     case Op_PopCountVL: {
 1989         if (!is_pop_count_instr_target(bt) &&
 1990             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1991           return false;
 1992         }
 1993       }
 1994       break;
 1995     case Op_ReverseV:
 1996     case Op_ReverseBytesV:
 1997       if (UseAVX < 2) {
 1998         return false;
 1999       }
 2000       break;
 2001     case Op_CountTrailingZerosV:
 2002     case Op_CountLeadingZerosV:
 2003       if (UseAVX < 2) {
 2004         return false;
 2005       }
 2006       break;
 2007   }
  return true;  // Match rules are supported by default.
 2009 }
 2010 
 2011 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a
  // pattern based on the IR opcode. Most of the unary/binary/ternary masked
  // operations share the IR nodes of their non-masked counterparts, with the
  // mask edge being the differentiator. This routine does a strict check on
  // the existence of masked operation patterns by returning false for all
  // opcodes apart from the ones whose masked instruction patterns are defined
  // in this file.
 2018   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2019     return false;
 2020   }
 2021 
 2022   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2023   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2024   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2025     return false;
 2026   }
 2027   switch(opcode) {
 2028     // Unary masked operations
 2029     case Op_AbsVB:
 2030     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
 2034     case Op_AbsVI:
 2035     case Op_AbsVL:
 2036       return true;
 2037 
 2038     // Ternary masked operations
 2039     case Op_FmaVF:
 2040     case Op_FmaVD:
 2041       return true;
 2042 
 2043     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2045         return false;
 2046       }
 2047       return true;
 2048 
 2049     // Binary masked operations
 2050     case Op_AddVB:
 2051     case Op_AddVS:
 2052     case Op_SubVB:
 2053     case Op_SubVS:
 2054     case Op_MulVS:
 2055     case Op_LShiftVS:
 2056     case Op_RShiftVS:
 2057     case Op_URShiftVS:
 2058       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2059       if (!VM_Version::supports_avx512bw()) {
 2060         return false;  // Implementation limitation
 2061       }
 2062       return true;
 2063 
 2064     case Op_MulVL:
 2065       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2066       if (!VM_Version::supports_avx512dq()) {
 2067         return false;  // Implementation limitation
 2068       }
 2069       return true;
 2070 
 2071     case Op_AndV:
 2072     case Op_OrV:
 2073     case Op_XorV:
 2074     case Op_RotateRightV:
 2075     case Op_RotateLeftV:
 2076       if (bt != T_INT && bt != T_LONG) {
 2077         return false; // Implementation limitation
 2078       }
 2079       return true;
 2080 
 2081     case Op_VectorLoadMask:
 2082       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2083       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2084         return false;
 2085       }
 2086       return true;
 2087 
 2088     case Op_AddVI:
 2089     case Op_AddVL:
 2090     case Op_AddVF:
 2091     case Op_AddVD:
 2092     case Op_SubVI:
 2093     case Op_SubVL:
 2094     case Op_SubVF:
 2095     case Op_SubVD:
 2096     case Op_MulVI:
 2097     case Op_MulVF:
 2098     case Op_MulVD:
 2099     case Op_DivVF:
 2100     case Op_DivVD:
 2101     case Op_SqrtVF:
 2102     case Op_SqrtVD:
 2103     case Op_LShiftVI:
 2104     case Op_LShiftVL:
 2105     case Op_RShiftVI:
 2106     case Op_RShiftVL:
 2107     case Op_URShiftVI:
 2108     case Op_URShiftVL:
 2109     case Op_LoadVectorMasked:
 2110     case Op_StoreVectorMasked:
 2111     case Op_LoadVectorGatherMasked:
 2112     case Op_StoreVectorScatterMasked:
 2113       return true;
 2114 
 2115     case Op_MaxV:
 2116     case Op_MinV:
 2117       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2118         return false; // Implementation limitation
 2119       }
 2120       if (is_floating_point_type(bt)) {
 2121         return false; // Implementation limitation
 2122       }
 2123       return true;
 2124 
 2125     case Op_VectorMaskCmp:
 2126       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2127         return false; // Implementation limitation
 2128       }
 2129       return true;
 2130 
 2131     case Op_VectorRearrange:
 2132       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2133         return false; // Implementation limitation
 2134       }
 2135       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2136         return false; // Implementation limitation
 2137       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2138         return false; // Implementation limitation
 2139       }
 2140       return true;
 2141 
 2142     // Binary Logical operations
 2143     case Op_AndVMask:
 2144     case Op_OrVMask:
 2145     case Op_XorVMask:
 2146       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2147         return false; // Implementation limitation
 2148       }
 2149       return true;
 2150 
 2151     case Op_PopCountVI:
 2152     case Op_PopCountVL:
 2153       if (!is_pop_count_instr_target(bt)) {
 2154         return false;
 2155       }
 2156       return true;
 2157 
 2158     case Op_MaskAll:
 2159       return true;
 2160 
 2161     case Op_CountLeadingZerosV:
 2162       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2163         return true;
      }
      // fallthrough
 2165     default:
 2166       return false;
 2167   }
 2168 }
 2169 
 2170 const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2171   return false;
 2172 }
 2173 
 2174 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2175   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2176   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2177   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2178       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2179     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2180     return new legVecZOper();
 2181   }
 2182   if (legacy) {
 2183     switch (ideal_reg) {
 2184       case Op_VecS: return new legVecSOper();
 2185       case Op_VecD: return new legVecDOper();
 2186       case Op_VecX: return new legVecXOper();
 2187       case Op_VecY: return new legVecYOper();
 2188       case Op_VecZ: return new legVecZOper();
 2189     }
 2190   } else {
 2191     switch (ideal_reg) {
 2192       case Op_VecS: return new vecSOper();
 2193       case Op_VecD: return new vecDOper();
 2194       case Op_VecX: return new vecXOper();
 2195       case Op_VecY: return new vecYOper();
 2196       case Op_VecZ: return new vecZOper();
 2197     }
 2198   }
 2199   ShouldNotReachHere();
 2200   return NULL;
 2201 }
 2202 
 2203 bool Matcher::is_reg2reg_move(MachNode* m) {
 2204   switch (m->rule()) {
 2205     case MoveVec2Leg_rule:
 2206     case MoveLeg2Vec_rule:
 2207     case MoveF2VL_rule:
 2208     case MoveF2LEG_rule:
 2209     case MoveVL2F_rule:
 2210     case MoveLEG2F_rule:
 2211     case MoveD2VL_rule:
 2212     case MoveD2LEG_rule:
 2213     case MoveVL2D_rule:
 2214     case MoveLEG2D_rule:
 2215       return true;
 2216     default:
 2217       return false;
 2218   }
 2219 }
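// Each of these rules copies bits between register classes (e.g. vec <-> legVec,
// or scalar float/double <-> vector views) without transforming the data, so
// the node behaves as a plain register-to-register move.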
 2220 
 2221 bool Matcher::is_generic_vector(MachOper* opnd) {
 2222   switch (opnd->opcode()) {
 2223     case VEC:
 2224     case LEGVEC:
 2225       return true;
 2226     default:
 2227       return false;
 2228   }
 2229 }
 2230 
 2231 //------------------------------------------------------------------------
 2232 
 2233 const RegMask* Matcher::predicate_reg_mask(void) {
 2234   return &_VECTMASK_REG_mask;
 2235 }
 2236 
 2237 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2238   return new TypeVectMask(elemTy, length);
 2239 }
 2240 
 2241 // Max vector size in bytes. 0 if not supported.
 2242 const int Matcher::vector_width_in_bytes(BasicType bt) {
 2243   assert(is_java_primitive(bt), "only primitive type vectors");
 2244   if (UseSSE < 2) return 0;
 2245   // SSE2 supports 128bit vectors for all types.
 2246   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX512) supports 512bit vectors for all types.
 2248   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
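  // For illustration: UseAVX == 2 gives (1 << 2) * 8 == 32 bytes (256 bits),
  // UseAVX == 3 gives 64 bytes (512 bits); pre-AVX2 targets stay at 16 bytes.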
 2249   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2250   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2251     size = (UseAVX > 2) ? 64 : 32;
 2252   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2253     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2254   // Use flag to limit vector size.
  size = MIN2(size, (int)MaxVectorSize);
 2256   // Minimum 2 values in vector (or 4 for bytes).
 2257   switch (bt) {
 2258   case T_DOUBLE:
 2259   case T_LONG:
 2260     if (size < 16) return 0;
 2261     break;
 2262   case T_FLOAT:
 2263   case T_INT:
 2264     if (size < 8) return 0;
 2265     break;
 2266   case T_BOOLEAN:
 2267     if (size < 4) return 0;
 2268     break;
 2269   case T_CHAR:
 2270     if (size < 4) return 0;
 2271     break;
 2272   case T_BYTE:
 2273     if (size < 4) return 0;
 2274     break;
 2275   case T_SHORT:
 2276     if (size < 4) return 0;
 2277     break;
 2278   default:
 2279     ShouldNotReachHere();
 2280   }
 2281   return size;
 2282 }
 2283 
  2284 // Limits on vector size (number of elements) that can be loaded into a vector.
 2285 const int Matcher::max_vector_size(const BasicType bt) {
 2286   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2287 }
 2288 const int Matcher::min_vector_size(const BasicType bt) {
 2289   int max_size = max_vector_size(bt);
  2290   // At least 4 bytes must be loadable into a vector: 4 elements for
         // byte-sized types, 2 elements for everything wider.
  2291   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  2292   // Allow single-element double vectors to support SVML (Double64) calls.
 2293   if (bt == T_DOUBLE) {
 2294     size = 1;
 2295   }
 2296   return MIN2(size,max_size);
 2297 }
 2298 
 2299 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2300   return -1;
 2301 }
 2302 
 2303 // Vector ideal reg corresponding to specified size in bytes
 2304 const uint Matcher::vector_ideal_reg(int size) {
 2305   assert(MaxVectorSize >= size, "");
 2306   switch(size) {
 2307     case  4: return Op_VecS;
 2308     case  8: return Op_VecD;
 2309     case 16: return Op_VecX;
 2310     case 32: return Op_VecY;
 2311     case 64: return Op_VecZ;
 2312   }
 2313   ShouldNotReachHere();
 2314   return 0;
 2315 }
 2316 
 2317 // Check for shift by small constant as well
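       // A shift count of at most 3 matches the *1/*2/*4/*8 scale factors of x86 addressing modes.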
 2318 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2319   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2320       shift->in(2)->get_int() <= 3 &&
 2321       // Are there other uses besides address expressions?
 2322       !matcher->is_visited(shift)) {
 2323     address_visited.set(shift->_idx); // Flag as address_visited
 2324     mstack.push(shift->in(2), Matcher::Visit);
 2325     Node *conv = shift->in(1);
 2326 #ifdef _LP64
  2327     // Allow the Matcher to match the rule which bypasses the
  2328     // ConvI2L operation for an array index on LP64
  2329     // if the index value is positive.
 2330     if (conv->Opcode() == Op_ConvI2L &&
 2331         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2332         // Are there other uses besides address expressions?
 2333         !matcher->is_visited(conv)) {
 2334       address_visited.set(conv->_idx); // Flag as address_visited
 2335       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2336     } else
 2337 #endif
 2338       mstack.push(conv, Matcher::Pre_Visit);
 2339     return true;
 2340   }
 2341   return false;
 2342 }
 2343 
  2344 // This function identifies sub-graphs in which a 'load' node is
  2345 // an input to two different nodes, such that the sub-graph can be
  2346 // matched with BMI instructions like blsi, blsr, etc.
  2347 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
 2348 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
 2349 // refers to the same node.
 2350 //
 2351 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2352 // This is a temporary solution until we make DAGs expressible in ADL.
 2353 template<typename ConType>
 2354 class FusedPatternMatcher {
 2355   Node* _op1_node;
 2356   Node* _mop_node;
 2357   int _con_op;
 2358 
 2359   static int match_next(Node* n, int next_op, int next_op_idx) {
 2360     if (n->in(1) == NULL || n->in(2) == NULL) {
 2361       return -1;
 2362     }
 2363 
  2364     if (next_op_idx == -1) { // n is commutative, try both input positions
 2365       if (n->in(1)->Opcode() == next_op) {
 2366         return 1;
 2367       } else if (n->in(2)->Opcode() == next_op) {
 2368         return 2;
 2369       }
 2370     } else {
 2371       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2372       if (n->in(next_op_idx)->Opcode() == next_op) {
 2373         return next_op_idx;
 2374       }
 2375     }
 2376     return -1;
 2377   }
 2378 
 2379  public:
 2380   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2381     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2382 
 2383   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2384              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2385              typename ConType::NativeType con_value) {
 2386     if (_op1_node->Opcode() != op1) {
 2387       return false;
 2388     }
 2389     if (_mop_node->outcnt() > 2) {
 2390       return false;
 2391     }
 2392     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2393     if (op1_op2_idx == -1) {
 2394       return false;
 2395     }
 2396     // Memory operation must be the other edge
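           // ((1 & 1) + 1) == 2 and ((2 & 1) + 1) == 1, i.e. select the opposite input.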
 2397     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2398 
 2399     // Check that the mop node is really what we want
 2400     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2401       Node* op2_node = _op1_node->in(op1_op2_idx);
 2402       if (op2_node->outcnt() > 1) {
 2403         return false;
 2404       }
 2405       assert(op2_node->Opcode() == op2, "Should be");
 2406       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2407       if (op2_con_idx == -1) {
 2408         return false;
 2409       }
 2410       // Memory operation must be the other edge
 2411       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2412       // Check that the memory operation is the same node
 2413       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2414         // Now check the constant
 2415         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2416         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2417           return true;
 2418         }
 2419       }
 2420     }
 2421     return false;
 2422   }
 2423 };
 2424 
 2425 static bool is_bmi_pattern(Node* n, Node* m) {
 2426   assert(UseBMI1Instructions, "sanity");
 2427   if (n != NULL && m != NULL) {
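           // BMI1 semantics: blsi = -x & x, blsr = (x - 1) & x, blsmsk = (x - 1) ^ x.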
 2428     if (m->Opcode() == Op_LoadI) {
 2429       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2430       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2431              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2432              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2433     } else if (m->Opcode() == Op_LoadL) {
 2434       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2435       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2436              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2437              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2438     }
 2439   }
 2440   return false;
 2441 }
 2442 
 2443 // Should the matcher clone input 'm' of node 'n'?
 2444 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2445   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2446   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2447     mstack.push(m, Visit);
 2448     return true;
 2449   }
 2450   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2451     mstack.push(m, Visit);           // m = ShiftCntV
 2452     return true;
 2453   }
 2454   return false;
 2455 }
 2456 
 2457 // Should the Matcher clone shifts on addressing modes, expecting them
 2458 // to be subsumed into complex addressing expressions or compute them
 2459 // into registers?
 2460 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2461   Node *off = m->in(AddPNode::Offset);
 2462   if (off->is_Con()) {
 2463     address_visited.test_set(m->_idx); // Flag as address_visited
 2464     Node *adr = m->in(AddPNode::Address);
 2465 
 2466     // Intel can handle 2 adds in addressing mode
 2467     // AtomicAdd is not an addressing expression.
 2468     // Cheap to find it by looking for screwy base.
 2469     if (adr->is_AddP() &&
 2470         !adr->in(AddPNode::Base)->is_top() &&
 2471         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2472         // Are there other uses besides address expressions?
 2473         !is_visited(adr)) {
 2474       address_visited.set(adr->_idx); // Flag as address_visited
 2475       Node *shift = adr->in(AddPNode::Offset);
 2476       if (!clone_shift(shift, this, mstack, address_visited)) {
 2477         mstack.push(shift, Pre_Visit);
 2478       }
 2479       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2480       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2481     } else {
 2482       mstack.push(adr, Pre_Visit);
 2483     }
 2484 
 2485     // Clone X+offset as it also folds into most addressing expressions
 2486     mstack.push(off, Visit);
 2487     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2488     return true;
 2489   } else if (clone_shift(off, this, mstack, address_visited)) {
 2490     address_visited.test_set(m->_idx); // Flag as address_visited
 2491     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2492     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2493     return true;
 2494   }
 2495   return false;
 2496 }
 2497 
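       // Map a BoolTest condition to an AVX-512 integer comparison predicate.
       // The unsigned variants share encodings with the signed ones; signedness
       // is expressed by the choice of compare instruction instead.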
 2498 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2499   switch (bt) {
 2500     case BoolTest::eq:
 2501       return Assembler::eq;
 2502     case BoolTest::ne:
 2503       return Assembler::neq;
 2504     case BoolTest::le:
 2505     case BoolTest::ule:
 2506       return Assembler::le;
 2507     case BoolTest::ge:
 2508     case BoolTest::uge:
 2509       return Assembler::nlt;
 2510     case BoolTest::lt:
 2511     case BoolTest::ult:
 2512       return Assembler::lt;
 2513     case BoolTest::gt:
 2514     case BoolTest::ugt:
 2515       return Assembler::nle;
  2516     default: ShouldNotReachHere(); return Assembler::_false;
 2517   }
 2518 }
 2519 
 2520 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2521   switch (bt) {
 2522   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2523   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2524   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2525   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2526   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2527   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2528   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2529   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2530   }
 2531 }
 2532 
 2533 // Helper methods for MachSpillCopyNode::implementation().
 2534 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2535                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  2536   assert(ireg == Op_VecS || // 32bit vector
  2537          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
  2538           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
  2539          "no non-adjacent vector moves");
 2540   if (cbuf) {
 2541     C2_MacroAssembler _masm(cbuf);
 2542     switch (ireg) {
 2543     case Op_VecS: // copy whole register
 2544     case Op_VecD:
 2545     case Op_VecX:
 2546 #ifndef _LP64
 2547       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2548 #else
 2549       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2550         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2551       } else {
 2552         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
  2553       }
 2554 #endif
 2555       break;
 2556     case Op_VecY:
 2557 #ifndef _LP64
 2558       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2559 #else
 2560       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2561         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2562       } else {
 2563         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
  2564       }
 2565 #endif
 2566       break;
 2567     case Op_VecZ:
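             // vector_len == 2 denotes Assembler::AVX_512bit.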
 2568       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2569       break;
 2570     default:
 2571       ShouldNotReachHere();
 2572     }
 2573 #ifndef PRODUCT
 2574   } else {
 2575     switch (ireg) {
 2576     case Op_VecS:
 2577     case Op_VecD:
 2578     case Op_VecX:
 2579       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2580       break;
 2581     case Op_VecY:
 2582     case Op_VecZ:
 2583       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2584       break;
 2585     default:
 2586       ShouldNotReachHere();
 2587     }
 2588 #endif
 2589   }
 2590 }
 2591 
 2592 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2593                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2594   if (cbuf) {
 2595     C2_MacroAssembler _masm(cbuf);
 2596     if (is_load) {
 2597       switch (ireg) {
 2598       case Op_VecS:
 2599         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2600         break;
 2601       case Op_VecD:
 2602         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2603         break;
 2604       case Op_VecX:
 2605 #ifndef _LP64
 2606         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2607 #else
 2608         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2609           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2610         } else {
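                 // Without AVX512VL a plain 128-bit movdqu cannot encode xmm16-31,
                 // so zero the register and insert the 128-bit lane via AVX512F ops.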
 2611           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
  2612           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2613         }
 2614 #endif
 2615         break;
 2616       case Op_VecY:
 2617 #ifndef _LP64
 2618         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2619 #else
 2620         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2621           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2622         } else {
 2623           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
  2624           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2625         }
 2626 #endif
 2627         break;
 2628       case Op_VecZ:
 2629         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2630         break;
 2631       default:
 2632         ShouldNotReachHere();
 2633       }
 2634     } else { // store
 2635       switch (ireg) {
 2636       case Op_VecS:
 2637         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2638         break;
 2639       case Op_VecD:
 2640         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2641         break;
 2642       case Op_VecX:
 2643 #ifndef _LP64
 2644         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2645 #else
 2646         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2647           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
  2648         } else {
 2650           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2651         }
 2652 #endif
 2653         break;
 2654       case Op_VecY:
 2655 #ifndef _LP64
 2656         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2657 #else
 2658         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2659           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
  2660         } else {
 2662           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2663         }
 2664 #endif
 2665         break;
 2666       case Op_VecZ:
 2667         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2668         break;
 2669       default:
 2670         ShouldNotReachHere();
 2671       }
 2672     }
 2673 #ifndef PRODUCT
 2674   } else {
 2675     if (is_load) {
 2676       switch (ireg) {
 2677       case Op_VecS:
 2678         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2679         break;
 2680       case Op_VecD:
 2681         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2682         break;
  2683       case Op_VecX:
 2684         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2685         break;
 2686       case Op_VecY:
 2687       case Op_VecZ:
 2688         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2689         break;
 2690       default:
 2691         ShouldNotReachHere();
 2692       }
 2693     } else { // store
 2694       switch (ireg) {
 2695       case Op_VecS:
 2696         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2697         break;
 2698       case Op_VecD:
 2699         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2700         break;
  2701       case Op_VecX:
 2702         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2703         break;
 2704       case Op_VecY:
 2705       case Op_VecZ:
 2706         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2707         break;
 2708       default:
 2709         ShouldNotReachHere();
 2710       }
 2711     }
 2712 #endif
 2713   }
 2714 }
 2715 
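       // Build a GrowableArray holding 'len' copies of the scalar constant 'con',
       // interpreted according to the element type 'bt'.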
 2716 template <class T>
 2717 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2718   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2719   jvalue ele;
 2720   switch (bt) {
 2721     case T_BYTE:   ele.b = con; break;
 2722     case T_SHORT:  ele.s = con; break;
 2723     case T_INT:    ele.i = con; break;
 2724     case T_LONG:   ele.j = con; break;
 2725     case T_FLOAT:  ele.f = con; break;
 2726     case T_DOUBLE: ele.d = con; break;
 2727     default: ShouldNotReachHere();
 2728   }
 2729   for (int i = 0; i < len; i++) {
 2730     val->append(ele);
 2731   }
 2732   return val;
 2733 }
 2734 
 2735 static inline jlong high_bit_set(BasicType bt) {
 2736   switch (bt) {
 2737     case T_BYTE:  return 0x8080808080808080;
 2738     case T_SHORT: return 0x8000800080008000;
 2739     case T_INT:   return 0x8000000080000000;
 2740     case T_LONG:  return 0x8000000000000000;
 2741     default:
 2742       ShouldNotReachHere();
 2743       return 0;
 2744   }
 2745 }
 2746 
 2747 #ifndef PRODUCT
 2748   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2749     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2750   }
 2751 #endif
 2752 
 2753   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2754     C2_MacroAssembler _masm(&cbuf);
 2755     __ nop(_count);
 2756   }
 2757 
 2758   uint MachNopNode::size(PhaseRegAlloc*) const {
 2759     return _count;
 2760   }
 2761 
 2762 #ifndef PRODUCT
 2763   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2764     st->print("# breakpoint");
 2765   }
 2766 #endif
 2767 
 2768   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2769     C2_MacroAssembler _masm(&cbuf);
 2770     __ int3();
 2771   }
 2772 
 2773   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2774     return MachNode::size(ra_);
 2775   }
 2776 
 2777 %}
 2778 
 2779 encode %{
 2780 
 2781   enc_class call_epilog %{
 2782     C2_MacroAssembler _masm(&cbuf);
 2783     if (VerifyStackAtCalls) {
 2784       // Check that stack depth is unchanged: find majik cookie on stack
 2785       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2786       Label L;
 2787       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2788       __ jccb(Assembler::equal, L);
 2789       // Die if stack mismatch
 2790       __ int3();
 2791       __ bind(L);
 2792     }
 2793     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2794       C2_MacroAssembler _masm(&cbuf);
 2795       if (!_method->signature()->returns_null_free_inline_type()) {
  2796         // The last return value is not set by the callee but is used to pass IsInit information to compiled code.
  2797         // Search for the corresponding projection, get the register and emit code that initializes it.
 2798         uint con = (tf()->range_cc()->cnt() - 1);
 2799         for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2800           ProjNode* proj = fast_out(i)->as_Proj();
 2801           if (proj->_con == con) {
 2802             // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2803             OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2804             VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2805             Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2806             __ testq(rax, rax);
 2807             __ set_byte_if_not_zero(toReg);
 2808             __ movzbl(toReg, toReg);
 2809             if (reg->is_stack()) {
 2810               int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2811               __ movq(Address(rsp, st_off), toReg);
 2812             }
 2813             break;
 2814           }
 2815         }
 2816       }
 2817       if (return_value_is_used()) {
 2818         // An inline type is returned as fields in multiple registers.
  2819         // Rax contains either an oop (if the inline type is buffered) or a pointer
  2820         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
  2821         // if the lowest bit is set, so that C2 can use the oop after null checking.
  2822         // rax &= (rax & 1) - 1
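               // If the low bit is set, (rax & 1) - 1 == 0 and the AND clears rax;
               // otherwise the mask is all ones and rax is left unchanged.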
 2823         __ movptr(rscratch1, rax);
 2824         __ andptr(rscratch1, 0x1);
 2825         __ subptr(rscratch1, 0x1);
 2826         __ andptr(rax, rscratch1);
 2827       }
 2828     }
 2829   %}
 2830 
 2831 %}
 2832 
  2833 // Operands for bound floating-point register arguments
 2834 operand rxmm0() %{
 2835   constraint(ALLOC_IN_RC(xmm0_reg));
 2836   match(VecX);
  2837   format %{ %}
 2838   interface(REG_INTER);
 2839 %}
 2840 
 2841 //----------OPERANDS-----------------------------------------------------------
 2842 // Operand definitions must precede instruction definitions for correct parsing
 2843 // in the ADLC because operands constitute user defined types which are used in
 2844 // instruction definitions.
 2845 
 2846 // Vectors
 2847 
 2848 // Dummy generic vector class. Should be used for all vector operands.
  2849 // Replaced with vec[SDXYZ] during post-selection cleanup.
 2850 operand vec() %{
 2851   constraint(ALLOC_IN_RC(dynamic));
 2852   match(VecX);
 2853   match(VecY);
 2854   match(VecZ);
 2855   match(VecS);
 2856   match(VecD);
 2857 
 2858   format %{ %}
 2859   interface(REG_INTER);
 2860 %}
 2861 
 2862 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2863 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2864 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2865 // runtime code generation via reg_class_dynamic.
 2866 operand legVec() %{
 2867   constraint(ALLOC_IN_RC(dynamic));
 2868   match(VecX);
 2869   match(VecY);
 2870   match(VecZ);
 2871   match(VecS);
 2872   match(VecD);
 2873 
 2874   format %{ %}
 2875   interface(REG_INTER);
 2876 %}
 2877 
 2878 // Replaces vec during post-selection cleanup. See above.
 2879 operand vecS() %{
 2880   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2881   match(VecS);
 2882 
 2883   format %{ %}
 2884   interface(REG_INTER);
 2885 %}
 2886 
 2887 // Replaces legVec during post-selection cleanup. See above.
 2888 operand legVecS() %{
 2889   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2890   match(VecS);
 2891 
 2892   format %{ %}
 2893   interface(REG_INTER);
 2894 %}
 2895 
 2896 // Replaces vec during post-selection cleanup. See above.
 2897 operand vecD() %{
 2898   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2899   match(VecD);
 2900 
 2901   format %{ %}
 2902   interface(REG_INTER);
 2903 %}
 2904 
 2905 // Replaces legVec during post-selection cleanup. See above.
 2906 operand legVecD() %{
 2907   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2908   match(VecD);
 2909 
 2910   format %{ %}
 2911   interface(REG_INTER);
 2912 %}
 2913 
 2914 // Replaces vec during post-selection cleanup. See above.
 2915 operand vecX() %{
 2916   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2917   match(VecX);
 2918 
 2919   format %{ %}
 2920   interface(REG_INTER);
 2921 %}
 2922 
 2923 // Replaces legVec during post-selection cleanup. See above.
 2924 operand legVecX() %{
 2925   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2926   match(VecX);
 2927 
 2928   format %{ %}
 2929   interface(REG_INTER);
 2930 %}
 2931 
 2932 // Replaces vec during post-selection cleanup. See above.
 2933 operand vecY() %{
 2934   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2935   match(VecY);
 2936 
 2937   format %{ %}
 2938   interface(REG_INTER);
 2939 %}
 2940 
 2941 // Replaces legVec during post-selection cleanup. See above.
 2942 operand legVecY() %{
 2943   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2944   match(VecY);
 2945 
 2946   format %{ %}
 2947   interface(REG_INTER);
 2948 %}
 2949 
 2950 // Replaces vec during post-selection cleanup. See above.
 2951 operand vecZ() %{
 2952   constraint(ALLOC_IN_RC(vectorz_reg));
 2953   match(VecZ);
 2954 
 2955   format %{ %}
 2956   interface(REG_INTER);
 2957 %}
 2958 
 2959 // Replaces legVec during post-selection cleanup. See above.
 2960 operand legVecZ() %{
 2961   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2962   match(VecZ);
 2963 
 2964   format %{ %}
 2965   interface(REG_INTER);
 2966 %}
 2967 
 2968 // Comparison Code for FP conditional move
 2969 operand cmpOp_vcmppd() %{
 2970   match(Bool);
 2971 
 2972   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
 2973             n->as_Bool()->_test._test != BoolTest::no_overflow);
 2974   format %{ "" %}
 2975   interface(COND_INTER) %{
 2976     equal        (0x0, "eq");
 2977     less         (0x1, "lt");
 2978     less_equal   (0x2, "le");
 2979     not_equal    (0xC, "ne");
 2980     greater_equal(0xD, "ge");
 2981     greater      (0xE, "gt");
  2982     // TODO: adlc fails to compile without the next two lines, reporting:
 2983     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
 2984     // equal' for overflow.
 2985     overflow     (0x20, "o");  // not really supported by the instruction
 2986     no_overflow  (0x21, "no"); // not really supported by the instruction
 2987   %}
 2988 %}
 2989 
 2990 
 2991 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2992 
 2993 // ============================================================================
 2994 
 2995 instruct ShouldNotReachHere() %{
 2996   match(Halt);
 2997   format %{ "stop\t# ShouldNotReachHere" %}
 2998   ins_encode %{
 2999     if (is_reachable()) {
 3000       __ stop(_halt_reason);
 3001     }
 3002   %}
 3003   ins_pipe(pipe_slow);
 3004 %}
 3005 
 3006 // ============================================================================
 3007 
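       // Scalar FP add/sub/mul/div: the (UseAVX == 0) rules use two-operand SSE
       // encodings, while the (UseAVX > 0) rules use three-operand VEX encodings
       // that take separate source registers.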
 3008 instruct addF_reg(regF dst, regF src) %{
 3009   predicate((UseSSE>=1) && (UseAVX == 0));
 3010   match(Set dst (AddF dst src));
 3011 
 3012   format %{ "addss   $dst, $src" %}
 3013   ins_cost(150);
 3014   ins_encode %{
 3015     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3016   %}
 3017   ins_pipe(pipe_slow);
 3018 %}
 3019 
 3020 instruct addF_mem(regF dst, memory src) %{
 3021   predicate((UseSSE>=1) && (UseAVX == 0));
 3022   match(Set dst (AddF dst (LoadF src)));
 3023 
 3024   format %{ "addss   $dst, $src" %}
 3025   ins_cost(150);
 3026   ins_encode %{
 3027     __ addss($dst$$XMMRegister, $src$$Address);
 3028   %}
 3029   ins_pipe(pipe_slow);
 3030 %}
 3031 
 3032 instruct addF_imm(regF dst, immF con) %{
 3033   predicate((UseSSE>=1) && (UseAVX == 0));
 3034   match(Set dst (AddF dst con));
 3035   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3036   ins_cost(150);
 3037   ins_encode %{
 3038     __ addss($dst$$XMMRegister, $constantaddress($con));
 3039   %}
 3040   ins_pipe(pipe_slow);
 3041 %}
 3042 
 3043 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3044   predicate(UseAVX > 0);
 3045   match(Set dst (AddF src1 src2));
 3046 
 3047   format %{ "vaddss  $dst, $src1, $src2" %}
 3048   ins_cost(150);
 3049   ins_encode %{
 3050     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3051   %}
 3052   ins_pipe(pipe_slow);
 3053 %}
 3054 
 3055 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3056   predicate(UseAVX > 0);
 3057   match(Set dst (AddF src1 (LoadF src2)));
 3058 
 3059   format %{ "vaddss  $dst, $src1, $src2" %}
 3060   ins_cost(150);
 3061   ins_encode %{
 3062     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3063   %}
 3064   ins_pipe(pipe_slow);
 3065 %}
 3066 
 3067 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3068   predicate(UseAVX > 0);
 3069   match(Set dst (AddF src con));
 3070 
 3071   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3072   ins_cost(150);
 3073   ins_encode %{
 3074     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3075   %}
 3076   ins_pipe(pipe_slow);
 3077 %}
 3078 
 3079 instruct addD_reg(regD dst, regD src) %{
 3080   predicate((UseSSE>=2) && (UseAVX == 0));
 3081   match(Set dst (AddD dst src));
 3082 
 3083   format %{ "addsd   $dst, $src" %}
 3084   ins_cost(150);
 3085   ins_encode %{
 3086     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3087   %}
 3088   ins_pipe(pipe_slow);
 3089 %}
 3090 
 3091 instruct addD_mem(regD dst, memory src) %{
 3092   predicate((UseSSE>=2) && (UseAVX == 0));
 3093   match(Set dst (AddD dst (LoadD src)));
 3094 
 3095   format %{ "addsd   $dst, $src" %}
 3096   ins_cost(150);
 3097   ins_encode %{
 3098     __ addsd($dst$$XMMRegister, $src$$Address);
 3099   %}
 3100   ins_pipe(pipe_slow);
 3101 %}
 3102 
 3103 instruct addD_imm(regD dst, immD con) %{
 3104   predicate((UseSSE>=2) && (UseAVX == 0));
 3105   match(Set dst (AddD dst con));
 3106   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3107   ins_cost(150);
 3108   ins_encode %{
 3109     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3110   %}
 3111   ins_pipe(pipe_slow);
 3112 %}
 3113 
 3114 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3115   predicate(UseAVX > 0);
 3116   match(Set dst (AddD src1 src2));
 3117 
 3118   format %{ "vaddsd  $dst, $src1, $src2" %}
 3119   ins_cost(150);
 3120   ins_encode %{
 3121     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3122   %}
 3123   ins_pipe(pipe_slow);
 3124 %}
 3125 
 3126 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3127   predicate(UseAVX > 0);
 3128   match(Set dst (AddD src1 (LoadD src2)));
 3129 
 3130   format %{ "vaddsd  $dst, $src1, $src2" %}
 3131   ins_cost(150);
 3132   ins_encode %{
 3133     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3134   %}
 3135   ins_pipe(pipe_slow);
 3136 %}
 3137 
 3138 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3139   predicate(UseAVX > 0);
 3140   match(Set dst (AddD src con));
 3141 
 3142   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3143   ins_cost(150);
 3144   ins_encode %{
 3145     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3146   %}
 3147   ins_pipe(pipe_slow);
 3148 %}
 3149 
 3150 instruct subF_reg(regF dst, regF src) %{
 3151   predicate((UseSSE>=1) && (UseAVX == 0));
 3152   match(Set dst (SubF dst src));
 3153 
 3154   format %{ "subss   $dst, $src" %}
 3155   ins_cost(150);
 3156   ins_encode %{
 3157     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3158   %}
 3159   ins_pipe(pipe_slow);
 3160 %}
 3161 
 3162 instruct subF_mem(regF dst, memory src) %{
 3163   predicate((UseSSE>=1) && (UseAVX == 0));
 3164   match(Set dst (SubF dst (LoadF src)));
 3165 
 3166   format %{ "subss   $dst, $src" %}
 3167   ins_cost(150);
 3168   ins_encode %{
 3169     __ subss($dst$$XMMRegister, $src$$Address);
 3170   %}
 3171   ins_pipe(pipe_slow);
 3172 %}
 3173 
 3174 instruct subF_imm(regF dst, immF con) %{
 3175   predicate((UseSSE>=1) && (UseAVX == 0));
 3176   match(Set dst (SubF dst con));
 3177   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3178   ins_cost(150);
 3179   ins_encode %{
 3180     __ subss($dst$$XMMRegister, $constantaddress($con));
 3181   %}
 3182   ins_pipe(pipe_slow);
 3183 %}
 3184 
 3185 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3186   predicate(UseAVX > 0);
 3187   match(Set dst (SubF src1 src2));
 3188 
 3189   format %{ "vsubss  $dst, $src1, $src2" %}
 3190   ins_cost(150);
 3191   ins_encode %{
 3192     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3193   %}
 3194   ins_pipe(pipe_slow);
 3195 %}
 3196 
 3197 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3198   predicate(UseAVX > 0);
 3199   match(Set dst (SubF src1 (LoadF src2)));
 3200 
 3201   format %{ "vsubss  $dst, $src1, $src2" %}
 3202   ins_cost(150);
 3203   ins_encode %{
 3204     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3205   %}
 3206   ins_pipe(pipe_slow);
 3207 %}
 3208 
 3209 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3210   predicate(UseAVX > 0);
 3211   match(Set dst (SubF src con));
 3212 
 3213   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3214   ins_cost(150);
 3215   ins_encode %{
 3216     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3217   %}
 3218   ins_pipe(pipe_slow);
 3219 %}
 3220 
 3221 instruct subD_reg(regD dst, regD src) %{
 3222   predicate((UseSSE>=2) && (UseAVX == 0));
 3223   match(Set dst (SubD dst src));
 3224 
 3225   format %{ "subsd   $dst, $src" %}
 3226   ins_cost(150);
 3227   ins_encode %{
 3228     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3229   %}
 3230   ins_pipe(pipe_slow);
 3231 %}
 3232 
 3233 instruct subD_mem(regD dst, memory src) %{
 3234   predicate((UseSSE>=2) && (UseAVX == 0));
 3235   match(Set dst (SubD dst (LoadD src)));
 3236 
 3237   format %{ "subsd   $dst, $src" %}
 3238   ins_cost(150);
 3239   ins_encode %{
 3240     __ subsd($dst$$XMMRegister, $src$$Address);
 3241   %}
 3242   ins_pipe(pipe_slow);
 3243 %}
 3244 
 3245 instruct subD_imm(regD dst, immD con) %{
 3246   predicate((UseSSE>=2) && (UseAVX == 0));
 3247   match(Set dst (SubD dst con));
 3248   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3249   ins_cost(150);
 3250   ins_encode %{
 3251     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3252   %}
 3253   ins_pipe(pipe_slow);
 3254 %}
 3255 
 3256 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3257   predicate(UseAVX > 0);
 3258   match(Set dst (SubD src1 src2));
 3259 
 3260   format %{ "vsubsd  $dst, $src1, $src2" %}
 3261   ins_cost(150);
 3262   ins_encode %{
 3263     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3264   %}
 3265   ins_pipe(pipe_slow);
 3266 %}
 3267 
 3268 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3269   predicate(UseAVX > 0);
 3270   match(Set dst (SubD src1 (LoadD src2)));
 3271 
 3272   format %{ "vsubsd  $dst, $src1, $src2" %}
 3273   ins_cost(150);
 3274   ins_encode %{
 3275     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3276   %}
 3277   ins_pipe(pipe_slow);
 3278 %}
 3279 
 3280 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3281   predicate(UseAVX > 0);
 3282   match(Set dst (SubD src con));
 3283 
 3284   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3285   ins_cost(150);
 3286   ins_encode %{
 3287     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3288   %}
 3289   ins_pipe(pipe_slow);
 3290 %}
 3291 
 3292 instruct mulF_reg(regF dst, regF src) %{
 3293   predicate((UseSSE>=1) && (UseAVX == 0));
 3294   match(Set dst (MulF dst src));
 3295 
 3296   format %{ "mulss   $dst, $src" %}
 3297   ins_cost(150);
 3298   ins_encode %{
 3299     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3300   %}
 3301   ins_pipe(pipe_slow);
 3302 %}
 3303 
 3304 instruct mulF_mem(regF dst, memory src) %{
 3305   predicate((UseSSE>=1) && (UseAVX == 0));
 3306   match(Set dst (MulF dst (LoadF src)));
 3307 
 3308   format %{ "mulss   $dst, $src" %}
 3309   ins_cost(150);
 3310   ins_encode %{
 3311     __ mulss($dst$$XMMRegister, $src$$Address);
 3312   %}
 3313   ins_pipe(pipe_slow);
 3314 %}
 3315 
 3316 instruct mulF_imm(regF dst, immF con) %{
 3317   predicate((UseSSE>=1) && (UseAVX == 0));
 3318   match(Set dst (MulF dst con));
 3319   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3320   ins_cost(150);
 3321   ins_encode %{
 3322     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3323   %}
 3324   ins_pipe(pipe_slow);
 3325 %}
 3326 
 3327 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3328   predicate(UseAVX > 0);
 3329   match(Set dst (MulF src1 src2));
 3330 
 3331   format %{ "vmulss  $dst, $src1, $src2" %}
 3332   ins_cost(150);
 3333   ins_encode %{
 3334     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3335   %}
 3336   ins_pipe(pipe_slow);
 3337 %}
 3338 
 3339 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3340   predicate(UseAVX > 0);
 3341   match(Set dst (MulF src1 (LoadF src2)));
 3342 
 3343   format %{ "vmulss  $dst, $src1, $src2" %}
 3344   ins_cost(150);
 3345   ins_encode %{
 3346     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3347   %}
 3348   ins_pipe(pipe_slow);
 3349 %}
 3350 
 3351 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3352   predicate(UseAVX > 0);
 3353   match(Set dst (MulF src con));
 3354 
 3355   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3356   ins_cost(150);
 3357   ins_encode %{
 3358     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3359   %}
 3360   ins_pipe(pipe_slow);
 3361 %}
 3362 
 3363 instruct mulD_reg(regD dst, regD src) %{
 3364   predicate((UseSSE>=2) && (UseAVX == 0));
 3365   match(Set dst (MulD dst src));
 3366 
 3367   format %{ "mulsd   $dst, $src" %}
 3368   ins_cost(150);
 3369   ins_encode %{
 3370     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3371   %}
 3372   ins_pipe(pipe_slow);
 3373 %}
 3374 
 3375 instruct mulD_mem(regD dst, memory src) %{
 3376   predicate((UseSSE>=2) && (UseAVX == 0));
 3377   match(Set dst (MulD dst (LoadD src)));
 3378 
 3379   format %{ "mulsd   $dst, $src" %}
 3380   ins_cost(150);
 3381   ins_encode %{
 3382     __ mulsd($dst$$XMMRegister, $src$$Address);
 3383   %}
 3384   ins_pipe(pipe_slow);
 3385 %}
 3386 
 3387 instruct mulD_imm(regD dst, immD con) %{
 3388   predicate((UseSSE>=2) && (UseAVX == 0));
 3389   match(Set dst (MulD dst con));
 3390   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3391   ins_cost(150);
 3392   ins_encode %{
 3393     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3394   %}
 3395   ins_pipe(pipe_slow);
 3396 %}
 3397 
 3398 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3399   predicate(UseAVX > 0);
 3400   match(Set dst (MulD src1 src2));
 3401 
 3402   format %{ "vmulsd  $dst, $src1, $src2" %}
 3403   ins_cost(150);
 3404   ins_encode %{
 3405     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3406   %}
 3407   ins_pipe(pipe_slow);
 3408 %}
 3409 
 3410 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3411   predicate(UseAVX > 0);
 3412   match(Set dst (MulD src1 (LoadD src2)));
 3413 
 3414   format %{ "vmulsd  $dst, $src1, $src2" %}
 3415   ins_cost(150);
 3416   ins_encode %{
 3417     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3418   %}
 3419   ins_pipe(pipe_slow);
 3420 %}
 3421 
 3422 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3423   predicate(UseAVX > 0);
 3424   match(Set dst (MulD src con));
 3425 
 3426   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3427   ins_cost(150);
 3428   ins_encode %{
 3429     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3430   %}
 3431   ins_pipe(pipe_slow);
 3432 %}
 3433 
 3434 instruct divF_reg(regF dst, regF src) %{
 3435   predicate((UseSSE>=1) && (UseAVX == 0));
 3436   match(Set dst (DivF dst src));
 3437 
 3438   format %{ "divss   $dst, $src" %}
 3439   ins_cost(150);
 3440   ins_encode %{
 3441     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3442   %}
 3443   ins_pipe(pipe_slow);
 3444 %}
 3445 
 3446 instruct divF_mem(regF dst, memory src) %{
 3447   predicate((UseSSE>=1) && (UseAVX == 0));
 3448   match(Set dst (DivF dst (LoadF src)));
 3449 
 3450   format %{ "divss   $dst, $src" %}
 3451   ins_cost(150);
 3452   ins_encode %{
 3453     __ divss($dst$$XMMRegister, $src$$Address);
 3454   %}
 3455   ins_pipe(pipe_slow);
 3456 %}
 3457 
 3458 instruct divF_imm(regF dst, immF con) %{
 3459   predicate((UseSSE>=1) && (UseAVX == 0));
 3460   match(Set dst (DivF dst con));
 3461   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3462   ins_cost(150);
 3463   ins_encode %{
 3464     __ divss($dst$$XMMRegister, $constantaddress($con));
 3465   %}
 3466   ins_pipe(pipe_slow);
 3467 %}
 3468 
 3469 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3470   predicate(UseAVX > 0);
 3471   match(Set dst (DivF src1 src2));
 3472 
 3473   format %{ "vdivss  $dst, $src1, $src2" %}
 3474   ins_cost(150);
 3475   ins_encode %{
 3476     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3477   %}
 3478   ins_pipe(pipe_slow);
 3479 %}
 3480 
 3481 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3482   predicate(UseAVX > 0);
 3483   match(Set dst (DivF src1 (LoadF src2)));
 3484 
 3485   format %{ "vdivss  $dst, $src1, $src2" %}
 3486   ins_cost(150);
 3487   ins_encode %{
 3488     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3489   %}
 3490   ins_pipe(pipe_slow);
 3491 %}
 3492 
 3493 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3494   predicate(UseAVX > 0);
 3495   match(Set dst (DivF src con));
 3496 
 3497   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3498   ins_cost(150);
 3499   ins_encode %{
 3500     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3501   %}
 3502   ins_pipe(pipe_slow);
 3503 %}
 3504 
 3505 instruct divD_reg(regD dst, regD src) %{
 3506   predicate((UseSSE>=2) && (UseAVX == 0));
 3507   match(Set dst (DivD dst src));
 3508 
 3509   format %{ "divsd   $dst, $src" %}
 3510   ins_cost(150);
 3511   ins_encode %{
 3512     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3513   %}
 3514   ins_pipe(pipe_slow);
 3515 %}
 3516 
 3517 instruct divD_mem(regD dst, memory src) %{
 3518   predicate((UseSSE>=2) && (UseAVX == 0));
 3519   match(Set dst (DivD dst (LoadD src)));
 3520 
 3521   format %{ "divsd   $dst, $src" %}
 3522   ins_cost(150);
 3523   ins_encode %{
 3524     __ divsd($dst$$XMMRegister, $src$$Address);
 3525   %}
 3526   ins_pipe(pipe_slow);
 3527 %}
 3528 
 3529 instruct divD_imm(regD dst, immD con) %{
 3530   predicate((UseSSE>=2) && (UseAVX == 0));
 3531   match(Set dst (DivD dst con));
 3532   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3533   ins_cost(150);
 3534   ins_encode %{
 3535     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3536   %}
 3537   ins_pipe(pipe_slow);
 3538 %}
 3539 
 3540 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3541   predicate(UseAVX > 0);
 3542   match(Set dst (DivD src1 src2));
 3543 
 3544   format %{ "vdivsd  $dst, $src1, $src2" %}
 3545   ins_cost(150);
 3546   ins_encode %{
 3547     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3548   %}
 3549   ins_pipe(pipe_slow);
 3550 %}
 3551 
 3552 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3553   predicate(UseAVX > 0);
 3554   match(Set dst (DivD src1 (LoadD src2)));
 3555 
 3556   format %{ "vdivsd  $dst, $src1, $src2" %}
 3557   ins_cost(150);
 3558   ins_encode %{
 3559     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3560   %}
 3561   ins_pipe(pipe_slow);
 3562 %}
 3563 
 3564 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3565   predicate(UseAVX > 0);
 3566   match(Set dst (DivD src con));
 3567 
 3568   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3569   ins_cost(150);
 3570   ins_encode %{
 3571     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3572   %}
 3573   ins_pipe(pipe_slow);
 3574 %}
 3575 
 3576 instruct absF_reg(regF dst) %{
 3577   predicate((UseSSE>=1) && (UseAVX == 0));
 3578   match(Set dst (AbsF dst));
 3579   ins_cost(150);
 3580   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3581   ins_encode %{
 3582     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3583   %}
 3584   ins_pipe(pipe_slow);
 3585 %}
 3586 
 3587 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3588   predicate(UseAVX > 0);
 3589   match(Set dst (AbsF src));
 3590   ins_cost(150);
 3591   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3592   ins_encode %{
 3593     int vlen_enc = Assembler::AVX_128bit;
 3594     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3595               ExternalAddress(float_signmask()), vlen_enc);
 3596   %}
 3597   ins_pipe(pipe_slow);
 3598 %}
 3599 
 3600 instruct absD_reg(regD dst) %{
 3601   predicate((UseSSE>=2) && (UseAVX == 0));
 3602   match(Set dst (AbsD dst));
 3603   ins_cost(150);
 3604   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3605             "# abs double by sign masking" %}
 3606   ins_encode %{
 3607     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3608   %}
 3609   ins_pipe(pipe_slow);
 3610 %}
 3611 
 3612 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3613   predicate(UseAVX > 0);
 3614   match(Set dst (AbsD src));
 3615   ins_cost(150);
 3616   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3617             "# abs double by sign masking" %}
 3618   ins_encode %{
 3619     int vlen_enc = Assembler::AVX_128bit;
 3620     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3621               ExternalAddress(double_signmask()), vlen_enc);
 3622   %}
 3623   ins_pipe(pipe_slow);
 3624 %}
 3625 
 3626 instruct negF_reg(regF dst) %{
 3627   predicate((UseSSE>=1) && (UseAVX == 0));
 3628   match(Set dst (NegF dst));
 3629   ins_cost(150);
 3630   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3631   ins_encode %{
 3632     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3633   %}
 3634   ins_pipe(pipe_slow);
 3635 %}
 3636 
 3637 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3638   predicate(UseAVX > 0);
 3639   match(Set dst (NegF src));
 3640   ins_cost(150);
 3641   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3642   ins_encode %{
 3643     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3644                  ExternalAddress(float_signflip()));
 3645   %}
 3646   ins_pipe(pipe_slow);
 3647 %}
 3648 
 3649 instruct negD_reg(regD dst) %{
 3650   predicate((UseSSE>=2) && (UseAVX == 0));
 3651   match(Set dst (NegD dst));
 3652   ins_cost(150);
 3653   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3654             "# neg double by sign flipping" %}
 3655   ins_encode %{
 3656     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3657   %}
 3658   ins_pipe(pipe_slow);
 3659 %}
 3660 
 3661 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3662   predicate(UseAVX > 0);
 3663   match(Set dst (NegD src));
 3664   ins_cost(150);
 3665   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3666             "# neg double by sign flipping" %}
 3667   ins_encode %{
 3668     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3669                  ExternalAddress(double_signflip()));
 3670   %}
 3671   ins_pipe(pipe_slow);
 3672 %}
 3673 
  3674 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
  3675 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3676 instruct sqrtF_reg(regF dst) %{
 3677   predicate(UseSSE>=1);
 3678   match(Set dst (SqrtF dst));
 3679   format %{ "sqrtss  $dst, $dst" %}
 3680   ins_encode %{
 3681     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3682   %}
 3683   ins_pipe(pipe_slow);
 3684 %}
 3685 
  3686 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
  3687 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3688 instruct sqrtD_reg(regD dst) %{
 3689   predicate(UseSSE>=2);
 3690   match(Set dst (SqrtD dst));
 3691   format %{ "sqrtsd  $dst, $dst" %}
 3692   ins_encode %{
 3693     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3694   %}
 3695   ins_pipe(pipe_slow);
 3696 %}
 3697 
 3698 instruct convF2HF_reg_reg(rRegI dst, regF src, regF tmp) %{
 3699   effect(TEMP tmp);
 3700   match(Set dst (ConvF2HF src));
 3701   ins_cost(125);
  3702   format %{ "vcvtps2ph $dst,$src \t using $tmp as TEMP" %}
 3703   ins_encode %{
 3704     __ vcvtps2ph($tmp$$XMMRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3705     __ movdl($dst$$Register, $tmp$$XMMRegister);
 3706     __ movswl($dst$$Register, $dst$$Register);
 3707   %}
 3708   ins_pipe( pipe_slow );
 3709 %}
 3710 
 3711 instruct convF2HF_mem_reg(memory mem, regF src, kReg ktmp, rRegI rtmp) %{
 3712   predicate((UseAVX > 2) && VM_Version::supports_avx512vl());
 3713   effect(TEMP ktmp, TEMP rtmp);
 3714   match(Set mem (StoreC mem (ConvF2HF src)));
 3715   format %{ "evcvtps2ph $mem,$src \t using $ktmp and $rtmp as TEMP" %}
 3716   ins_encode %{
 3717     __ movl($rtmp$$Register, 0x1);
 3718     __ kmovwl($ktmp$$KRegister, $rtmp$$Register);
 3719     __ evcvtps2ph($mem$$Address, $ktmp$$KRegister, $src$$XMMRegister, 0x04, Assembler::AVX_128bit);
 3720   %}
 3721   ins_pipe( pipe_slow );
 3722 %}
 3723 
 3724 instruct vconvF2HF(vec dst, vec src) %{
 3725   match(Set dst (VectorCastF2HF src));
 3726   format %{ "vector_conv_F2HF $dst $src" %}
 3727   ins_encode %{
 3728     int vlen_enc = vector_length_encoding(this, $src);
 3729     __ vcvtps2ph($dst$$XMMRegister, $src$$XMMRegister, 0x04, vlen_enc);
 3730   %}
 3731   ins_pipe( pipe_slow );
 3732 %}
 3733 
 3734 instruct vconvF2HF_mem_reg(memory mem, vec src) %{
 3735   match(Set mem (StoreVector mem (VectorCastF2HF src)));
 3736   format %{ "vcvtps2ph $mem,$src" %}
 3737   ins_encode %{
 3738     int vlen_enc = vector_length_encoding(this, $src);
 3739     __ vcvtps2ph($mem$$Address, $src$$XMMRegister, 0x04, vlen_enc);
 3740   %}
 3741   ins_pipe( pipe_slow );
 3742 %}
 3743 
 3744 instruct convHF2F_reg_reg(regF dst, rRegI src) %{
 3745   match(Set dst (ConvHF2F src));
 3746   format %{ "vcvtph2ps $dst,$src" %}
 3747   ins_encode %{
 3748     __ movdl($dst$$XMMRegister, $src$$Register);
 3749     __ vcvtph2ps($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 3750   %}
 3751   ins_pipe( pipe_slow );
 3752 %}
 3753 
 3754 instruct vconvHF2F_reg_mem(vec dst, memory mem) %{
 3755   match(Set dst (VectorCastHF2F (LoadVector mem)));
 3756   format %{ "vcvtph2ps $dst,$mem" %}
 3757   ins_encode %{
 3758     int vlen_enc = vector_length_encoding(this);
 3759     __ vcvtph2ps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 3760   %}
 3761   ins_pipe( pipe_slow );
 3762 %}
 3763 
 3764 instruct vconvHF2F(vec dst, vec src) %{
 3765   match(Set dst (VectorCastHF2F src));
 3766   ins_cost(125);
 3767   format %{ "vector_conv_HF2F $dst,$src" %}
 3768   ins_encode %{
 3769     int vlen_enc = vector_length_encoding(this);
 3770     __ vcvtph2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 3771   %}
 3772   ins_pipe( pipe_slow );
 3773 %}
 3774 
 3775 // ---------------------------------------- VectorReinterpret ------------------------------------
 3776 instruct reinterpret_mask(kReg dst) %{
 3777   predicate(n->bottom_type()->isa_vectmask() &&
 3778             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3779   match(Set dst (VectorReinterpret dst));
 3780   ins_cost(125);
 3781   format %{ "vector_reinterpret $dst\t!" %}
 3782   ins_encode %{
 3783     // empty
 3784   %}
 3785   ins_pipe( pipe_slow );
 3786 %}
 3787 
 3788 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3789   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3790             n->bottom_type()->isa_vectmask() &&
 3791             n->in(1)->bottom_type()->isa_vectmask() &&
 3792             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
  3793             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes in bytes match
 3794   match(Set dst (VectorReinterpret src));
 3795   effect(TEMP xtmp);
 3796   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3797   ins_encode %{
 3798      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3799      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
  3800      assert(src_sz == dst_sz, "src and dst size mismatch");
 3801      int vlen_enc = vector_length_encoding(src_sz);
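            // evpmovm2w expands each mask bit into an all-ones/zero 16-bit lane;
            // evpmovb2m then reads the same register as bytes, yielding two mask
            // bits per original bit.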
  3802      __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
  3803      __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3804   %}
 3805   ins_pipe( pipe_slow );
 3806 %}
 3807 
 3808 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3809   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3810             n->bottom_type()->isa_vectmask() &&
 3811             n->in(1)->bottom_type()->isa_vectmask() &&
 3812             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3813              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
  3814             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes in bytes match
 3815   match(Set dst (VectorReinterpret src));
 3816   effect(TEMP xtmp);
 3817   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3818   ins_encode %{
 3819      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3820      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
  3821      assert(src_sz == dst_sz, "src and dst size mismatch");
  3822      int vlen_enc = vector_length_encoding(src_sz);
  3823      __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
  3824      __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3825   %}
 3826   ins_pipe( pipe_slow );
 3827 %}
 3828 
 3829 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3830   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3831             n->bottom_type()->isa_vectmask() &&
 3832             n->in(1)->bottom_type()->isa_vectmask() &&
 3833             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3834              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // src and dst sizes match in bytes
 3836   match(Set dst (VectorReinterpret src));
 3837   effect(TEMP xtmp);
  format %{ "vector_mask_reinterpret_Q2B $dst,$src\t!" %}
 3839   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
    int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3846   %}
 3847   ins_pipe( pipe_slow );
 3848 %}
 3849 
 3850 instruct reinterpret(vec dst) %{
 3851   predicate(!n->bottom_type()->isa_vectmask() &&
 3852             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3853   match(Set dst (VectorReinterpret dst));
 3854   ins_cost(125);
 3855   format %{ "vector_reinterpret $dst\t!" %}
 3856   ins_encode %{
 3857     // empty
 3858   %}
 3859   ins_pipe( pipe_slow );
 3860 %}
 3861 
 3862 instruct reinterpret_expand(vec dst, vec src) %{
 3863   predicate(UseAVX == 0 &&
 3864             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3865   match(Set dst (VectorReinterpret src));
 3866   ins_cost(125);
 3867   effect(TEMP dst);
 3868   format %{ "vector_reinterpret_expand $dst,$src" %}
 3869   ins_encode %{
 3870     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3871     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3872 
 3873     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3874     if (src_vlen_in_bytes == 4) {
 3875       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3876     } else {
 3877       assert(src_vlen_in_bytes == 8, "");
 3878       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3879     }
 3880     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3881   %}
 3882   ins_pipe( pipe_slow );
 3883 %}
 3884 
 3885 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3886   predicate(UseAVX > 0 &&
 3887             !n->bottom_type()->isa_vectmask() &&
 3888             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3889             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3890   match(Set dst (VectorReinterpret src));
 3891   ins_cost(125);
 3892   format %{ "vector_reinterpret_expand $dst,$src" %}
 3893   ins_encode %{
 3894     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3895   %}
 3896   ins_pipe( pipe_slow );
 3897 %}
 3898 
 3900 instruct vreinterpret_expand(legVec dst, vec src) %{
 3901   predicate(UseAVX > 0 &&
 3902             !n->bottom_type()->isa_vectmask() &&
 3903             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3904             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3905   match(Set dst (VectorReinterpret src));
 3906   ins_cost(125);
 3907   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3908   ins_encode %{
 3909     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3910       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3911       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3912       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3913       default: ShouldNotReachHere();
 3914     }
 3915   %}
 3916   ins_pipe( pipe_slow );
 3917 %}
 3918 
 3919 instruct reinterpret_shrink(vec dst, legVec src) %{
 3920   predicate(!n->bottom_type()->isa_vectmask() &&
 3921             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3922   match(Set dst (VectorReinterpret src));
 3923   ins_cost(125);
 3924   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3925   ins_encode %{
 3926     switch (Matcher::vector_length_in_bytes(this)) {
 3927       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3928       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3929       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3930       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3931       default: ShouldNotReachHere();
 3932     }
 3933   %}
 3934   ins_pipe( pipe_slow );
 3935 %}
 3936 
 3937 // ----------------------------------------------------------------------------------------------------
 3938 
 3939 #ifdef _LP64
 3940 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3941   match(Set dst (RoundDoubleMode src rmode));
 3942   format %{ "roundsd $dst,$src" %}
 3943   ins_cost(150);
 3944   ins_encode %{
 3945     assert(UseSSE >= 4, "required");
 3946     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3947   %}
 3948   ins_pipe(pipe_slow);
 3949 %}
 3950 
 3951 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3952   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3953   format %{ "roundsd $dst,$src" %}
 3954   ins_cost(150);
 3955   ins_encode %{
 3956     assert(UseSSE >= 4, "required");
 3957     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3958   %}
 3959   ins_pipe(pipe_slow);
 3960 %}
 3961 
 3962 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3963   match(Set dst (RoundDoubleMode con rmode));
 3964   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3965   ins_cost(150);
 3966   ins_encode %{
 3967     assert(UseSSE >= 4, "required");
 3968     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3969   %}
 3970   ins_pipe(pipe_slow);
 3971 %}
 3972 
 3973 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3974   predicate(Matcher::vector_length(n) < 8);
 3975   match(Set dst (RoundDoubleModeV src rmode));
 3976   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3977   ins_encode %{
 3978     assert(UseAVX > 0, "required");
 3979     int vlen_enc = vector_length_encoding(this);
 3980     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3981   %}
 3982   ins_pipe( pipe_slow );
 3983 %}
 3984 
 3985 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3986   predicate(Matcher::vector_length(n) == 8);
 3987   match(Set dst (RoundDoubleModeV src rmode));
 3988   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3989   ins_encode %{
 3990     assert(UseAVX > 2, "required");
 3991     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3992   %}
 3993   ins_pipe( pipe_slow );
 3994 %}
 3995 
 3996 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3997   predicate(Matcher::vector_length(n) < 8);
 3998   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3999   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 4000   ins_encode %{
 4001     assert(UseAVX > 0, "required");
 4002     int vlen_enc = vector_length_encoding(this);
 4003     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 4004   %}
 4005   ins_pipe( pipe_slow );
 4006 %}
 4007 
 4008 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 4009   predicate(Matcher::vector_length(n) == 8);
 4010   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 4011   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 4012   ins_encode %{
 4013     assert(UseAVX > 2, "required");
 4014     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 4015   %}
 4016   ins_pipe( pipe_slow );
 4017 %}
 4018 #endif // _LP64
 4019 
 4020 instruct onspinwait() %{
 4021   match(OnSpinWait);
 4022   ins_cost(200);
 4023 
 4024   format %{
 4025     $$template
 4026     $$emit$$"pause\t! membar_onspinwait"
 4027   %}
 4028   ins_encode %{
 4029     __ pause();
 4030   %}
 4031   ins_pipe(pipe_slow);
 4032 %}
 4033 
 4034 // a * b + c
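// An FMA computes a * b + c with a single rounding step, so the result may
// differ by one ulp from a separately rounded multiply followed by an add.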
 4035 instruct fmaD_reg(regD a, regD b, regD c) %{
 4036   predicate(UseFMA);
 4037   match(Set c (FmaD  c (Binary a b)));
 4038   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 4039   ins_cost(150);
 4040   ins_encode %{
 4041     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4042   %}
 4043   ins_pipe( pipe_slow );
 4044 %}
 4045 
 4046 // a * b + c
 4047 instruct fmaF_reg(regF a, regF b, regF c) %{
 4048   predicate(UseFMA);
 4049   match(Set c (FmaF  c (Binary a b)));
 4050   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 4051   ins_cost(150);
 4052   ins_encode %{
 4053     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 4054   %}
 4055   ins_pipe( pipe_slow );
 4056 %}
 4057 
 4058 // ====================VECTOR INSTRUCTIONS=====================================
 4059 
 4060 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 4061 instruct MoveVec2Leg(legVec dst, vec src) %{
 4062   match(Set dst src);
 4063   format %{ "" %}
 4064   ins_encode %{
 4065     ShouldNotReachHere();
 4066   %}
 4067   ins_pipe( fpu_reg_reg );
 4068 %}
 4069 
 4070 instruct MoveLeg2Vec(vec dst, legVec src) %{
 4071   match(Set dst src);
 4072   format %{ "" %}
 4073   ins_encode %{
 4074     ShouldNotReachHere();
 4075   %}
 4076   ins_pipe( fpu_reg_reg );
 4077 %}
 4078 
 4079 // ============================================================================
 4080 
// Generic operand pattern for vector loads
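// load_vector is expected to dispatch on the byte size, presumably mirroring
// the explicit switch in storeV below (movdl/movq/movdqu/vmovdqu/evmovdqul).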
 4082 instruct loadV(vec dst, memory mem) %{
 4083   match(Set dst (LoadVector mem));
 4084   ins_cost(125);
 4085   format %{ "load_vector $dst,$mem" %}
 4086   ins_encode %{
 4087     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4088   %}
 4089   ins_pipe( pipe_slow );
 4090 %}
 4091 
// Generic operand pattern for vector stores.
 4093 instruct storeV(memory mem, vec src) %{
 4094   match(Set mem (StoreVector mem src));
 4095   ins_cost(145);
  format %{ "store_vector $mem,$src" %}
 4097   ins_encode %{
 4098     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4099       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4100       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4101       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4102       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4103       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4104       default: ShouldNotReachHere();
 4105     }
 4106   %}
 4107   ins_pipe( pipe_slow );
 4108 %}
 4109 
 4110 // ---------------------------------------- Gather ------------------------------------
 4111 
 4112 // Gather INT, LONG, FLOAT, DOUBLE
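// Illustrative scalar equivalent of a gather (assumed semantics, with idx
// implicitly scaled by the element size):
//   for (int i = 0; i < vlen; i++) { dst[i] = base[idx[i]]; }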
 4113 
 4114 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4115   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4116   match(Set dst (LoadVectorGather mem idx));
 4117   effect(TEMP dst, TEMP tmp, TEMP mask);
 4118   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4119   ins_encode %{
 4120     assert(UseAVX >= 2, "sanity");
 4121 
 4122     int vlen_enc = vector_length_encoding(this);
 4123     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4124 
 4125     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4126     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4127 
 4128     if (vlen_enc == Assembler::AVX_128bit) {
 4129       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4130     } else {
 4131       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4132     }
 4133     __ lea($tmp$$Register, $mem$$Address);
 4134     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4135   %}
 4136   ins_pipe( pipe_slow );
 4137 %}
 4138 
 4139 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4140   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4141   match(Set dst (LoadVectorGather mem idx));
 4142   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 4144   ins_encode %{
 4145     assert(UseAVX > 2, "sanity");
 4146 
 4147     int vlen_enc = vector_length_encoding(this);
 4148     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4149 
 4150     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4151 
 4152     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4153     __ lea($tmp$$Register, $mem$$Address);
 4154     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4155   %}
 4156   ins_pipe( pipe_slow );
 4157 %}
 4158 
 4159 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4160   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4161   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 4163   ins_encode %{
 4164     assert(UseAVX > 2, "sanity");
 4165     int vlen_enc = vector_length_encoding(this);
 4166     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4167     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates (clears) the opmask
    // register used for predication, the mask operand is first copied to a temporary.
 4170     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4171     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4172     __ lea($tmp$$Register, $mem$$Address);
 4173     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4174   %}
 4175   ins_pipe( pipe_slow );
 4176 %}

// ====================Scatter=======================================
 4178 
 4179 // Scatter INT, LONG, FLOAT, DOUBLE
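// Illustrative scalar equivalent of a scatter (assumed semantics, with idx
// implicitly scaled by the element size):
//   for (int i = 0; i < vlen; i++) { base[idx[i]] = src[i]; }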
 4180 
 4181 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4182   predicate(UseAVX > 2);
 4183   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4184   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4186   ins_encode %{
 4187     int vlen_enc = vector_length_encoding(this, $src);
 4188     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4189 
 4190     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4191     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4192 
 4193     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4194     __ lea($tmp$$Register, $mem$$Address);
 4195     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4196   %}
 4197   ins_pipe( pipe_slow );
 4198 %}
 4199 
 4200 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4201   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4202   effect(TEMP tmp, TEMP ktmp);
 4203   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4204   ins_encode %{
 4205     int vlen_enc = vector_length_encoding(this, $src);
 4206     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4207     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4208     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates (clears) the opmask
    // register used for predication, the mask operand is first copied to a temporary.
 4211     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4212     __ lea($tmp$$Register, $mem$$Address);
 4213     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4214   %}
 4215   ins_pipe( pipe_slow );
 4216 %}
 4217 
 4218 // ====================REPLICATE=======================================
 4219 
// Replicate a byte scalar to all elements of a vector
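// Illustrative scalar equivalent (assumed semantics):
//   for (int i = 0; i < vlen; i++) { dst[i] = (jbyte)src; }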
 4221 instruct vReplB_reg(vec dst, rRegI src) %{
 4222   predicate(UseAVX >= 2);
 4223   match(Set dst (ReplicateB src));
 4224   format %{ "replicateB $dst,$src" %}
 4225   ins_encode %{
 4226     uint vlen = Matcher::vector_length(this);
 4227     int vlen_enc = vector_length_encoding(this);
 4228     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4229       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4230       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4231     } else {
 4232       __ movdl($dst$$XMMRegister, $src$$Register);
 4233       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4234     }
 4235   %}
 4236   ins_pipe( pipe_slow );
 4237 %}
 4238 
 4239 instruct ReplB_reg(vec dst, rRegI src) %{
 4240   predicate(UseAVX < 2);
 4241   match(Set dst (ReplicateB src));
 4242   format %{ "replicateB $dst,$src" %}
 4243   ins_encode %{
 4244     uint vlen = Matcher::vector_length(this);
 4245     __ movdl($dst$$XMMRegister, $src$$Register);
 4246     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4247     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4248     if (vlen >= 16) {
 4249       assert(vlen == 16, "");
 4250       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4251     }
 4252   %}
 4253   ins_pipe( pipe_slow );
 4254 %}
 4255 
 4256 instruct ReplB_mem(vec dst, memory mem) %{
 4257   predicate(UseAVX >= 2);
 4258   match(Set dst (ReplicateB (LoadB mem)));
 4259   format %{ "replicateB $dst,$mem" %}
 4260   ins_encode %{
 4261     int vlen_enc = vector_length_encoding(this);
 4262     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4263   %}
 4264   ins_pipe( pipe_slow );
 4265 %}
 4266 
 4267 // ====================ReplicateS=======================================
 4268 
 4269 instruct vReplS_reg(vec dst, rRegI src) %{
 4270   predicate(UseAVX >= 2);
 4271   match(Set dst (ReplicateS src));
 4272   format %{ "replicateS $dst,$src" %}
 4273   ins_encode %{
 4274     uint vlen = Matcher::vector_length(this);
 4275     int vlen_enc = vector_length_encoding(this);
 4276     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4277       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4278       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4279     } else {
 4280       __ movdl($dst$$XMMRegister, $src$$Register);
 4281       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4282     }
 4283   %}
 4284   ins_pipe( pipe_slow );
 4285 %}
 4286 
 4287 instruct ReplS_reg(vec dst, rRegI src) %{
 4288   predicate(UseAVX < 2);
 4289   match(Set dst (ReplicateS src));
 4290   format %{ "replicateS $dst,$src" %}
 4291   ins_encode %{
 4292     uint vlen = Matcher::vector_length(this);
 4293     int vlen_enc = vector_length_encoding(this);
 4294     __ movdl($dst$$XMMRegister, $src$$Register);
 4295     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4296     if (vlen >= 8) {
 4297       assert(vlen == 8, "");
 4298       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4299     }
 4300   %}
 4301   ins_pipe( pipe_slow );
 4302 %}
 4303 
 4304 instruct ReplS_mem(vec dst, memory mem) %{
 4305   predicate(UseAVX >= 2);
 4306   match(Set dst (ReplicateS (LoadS mem)));
 4307   format %{ "replicateS $dst,$mem" %}
 4308   ins_encode %{
 4309     int vlen_enc = vector_length_encoding(this);
 4310     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4311   %}
 4312   ins_pipe( pipe_slow );
 4313 %}
 4314 
 4315 // ====================ReplicateI=======================================
 4316 
 4317 instruct ReplI_reg(vec dst, rRegI src) %{
 4318   match(Set dst (ReplicateI src));
 4319   format %{ "replicateI $dst,$src" %}
 4320   ins_encode %{
 4321     uint vlen = Matcher::vector_length(this);
 4322     int vlen_enc = vector_length_encoding(this);
 4323     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4324       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4325     } else if (VM_Version::supports_avx2()) {
 4326       __ movdl($dst$$XMMRegister, $src$$Register);
 4327       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4328     } else {
 4329       __ movdl($dst$$XMMRegister, $src$$Register);
 4330       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4331     }
 4332   %}
 4333   ins_pipe( pipe_slow );
 4334 %}
 4335 
 4336 instruct ReplI_mem(vec dst, memory mem) %{
 4337   match(Set dst (ReplicateI (LoadI mem)));
 4338   format %{ "replicateI $dst,$mem" %}
 4339   ins_encode %{
 4340     int vlen_enc = vector_length_encoding(this);
 4341     if (VM_Version::supports_avx2()) {
 4342       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4343     } else if (VM_Version::supports_avx()) {
 4344       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4345     } else {
 4346       __ movdl($dst$$XMMRegister, $mem$$Address);
 4347       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4348     }
 4349   %}
 4350   ins_pipe( pipe_slow );
 4351 %}
 4352 
 4353 instruct ReplI_imm(vec dst, immI con) %{
 4354   match(Set dst (ReplicateB con));
 4355   match(Set dst (ReplicateS con));
 4356   match(Set dst (ReplicateI con));
 4357   format %{ "replicateI $dst,$con" %}
 4358   ins_encode %{
 4359     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4360         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4361             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4362                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4363     BasicType bt = Matcher::vector_element_basic_type(this);
 4364     int vlen = Matcher::vector_length_in_bytes(this);
 4365     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4366   %}
 4367   ins_pipe( pipe_slow );
 4368 %}
 4369 
// Replicate scalar zero to all elements of a vector
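// Zeroing uses the xor-with-self idiom, which hardware recognizes as a
// dependency-breaking zeroing of the destination register.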
 4371 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4372   match(Set dst (ReplicateB zero));
 4373   match(Set dst (ReplicateS zero));
 4374   match(Set dst (ReplicateI zero));
 4375   format %{ "replicateI $dst,$zero" %}
 4376   ins_encode %{
 4377     int vlen_enc = vector_length_encoding(this);
 4378     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4379       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4380     } else {
 4381       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4382     }
 4383   %}
 4384   ins_pipe( fpu_reg_reg );
 4385 %}
 4386 
 4387 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4388   predicate(UseSSE >= 2);
 4389   match(Set dst (ReplicateB con));
 4390   match(Set dst (ReplicateS con));
 4391   match(Set dst (ReplicateI con));
 4392   format %{ "vallones $dst" %}
 4393   ins_encode %{
 4394     int vector_len = vector_length_encoding(this);
 4395     __ vallones($dst$$XMMRegister, vector_len);
 4396   %}
 4397   ins_pipe( pipe_slow );
 4398 %}
 4399 
 4400 // ====================ReplicateL=======================================
 4401 
 4402 #ifdef _LP64
// Replicate a long (8-byte) scalar to all elements of a vector
 4404 instruct ReplL_reg(vec dst, rRegL src) %{
 4405   match(Set dst (ReplicateL src));
 4406   format %{ "replicateL $dst,$src" %}
 4407   ins_encode %{
 4408     int vlen = Matcher::vector_length(this);
 4409     int vlen_enc = vector_length_encoding(this);
 4410     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4411       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4412     } else if (VM_Version::supports_avx2()) {
 4413       __ movdq($dst$$XMMRegister, $src$$Register);
 4414       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4415     } else {
 4416       __ movdq($dst$$XMMRegister, $src$$Register);
 4417       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4418     }
 4419   %}
 4420   ins_pipe( pipe_slow );
 4421 %}
 4422 #else // _LP64
// Replicate a long (8-byte) scalar to all elements of a vector
 4424 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4425   predicate(Matcher::vector_length(n) <= 4);
 4426   match(Set dst (ReplicateL src));
 4427   effect(TEMP dst, USE src, TEMP tmp);
 4428   format %{ "replicateL $dst,$src" %}
 4429   ins_encode %{
 4430     uint vlen = Matcher::vector_length(this);
 4431     if (vlen == 2) {
 4432       __ movdl($dst$$XMMRegister, $src$$Register);
 4433       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4434       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4435       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4436     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4437       int vlen_enc = Assembler::AVX_256bit;
 4438       __ movdl($dst$$XMMRegister, $src$$Register);
 4439       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4440       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4441       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4442     } else {
 4443       __ movdl($dst$$XMMRegister, $src$$Register);
 4444       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4445       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4446       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4447       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4448     }
 4449   %}
 4450   ins_pipe( pipe_slow );
 4451 %}
 4452 
 4453 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4454   predicate(Matcher::vector_length(n) == 8);
 4455   match(Set dst (ReplicateL src));
 4456   effect(TEMP dst, USE src, TEMP tmp);
 4457   format %{ "replicateL $dst,$src" %}
 4458   ins_encode %{
 4459     if (VM_Version::supports_avx512vl()) {
 4460       __ movdl($dst$$XMMRegister, $src$$Register);
 4461       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4462       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4463       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4464       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4465       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4466     } else {
 4467       int vlen_enc = Assembler::AVX_512bit;
 4468       __ movdl($dst$$XMMRegister, $src$$Register);
 4469       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4470       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4471       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4472     }
 4473   %}
 4474   ins_pipe( pipe_slow );
 4475 %}
 4476 #endif // _LP64
 4477 
 4478 instruct ReplL_mem(vec dst, memory mem) %{
 4479   match(Set dst (ReplicateL (LoadL mem)));
 4480   format %{ "replicateL $dst,$mem" %}
 4481   ins_encode %{
 4482     int vlen_enc = vector_length_encoding(this);
 4483     if (VM_Version::supports_avx2()) {
 4484       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4485     } else if (VM_Version::supports_sse3()) {
 4486       __ movddup($dst$$XMMRegister, $mem$$Address);
 4487     } else {
 4488       __ movq($dst$$XMMRegister, $mem$$Address);
 4489       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4490     }
 4491   %}
 4492   ins_pipe( pipe_slow );
 4493 %}
 4494 
// Replicate a long (8-byte) scalar immediate to all elements of a vector by loading it from the constant table.
 4496 instruct ReplL_imm(vec dst, immL con) %{
 4497   match(Set dst (ReplicateL con));
 4498   format %{ "replicateL $dst,$con" %}
 4499   ins_encode %{
 4500     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4501     int vlen = Matcher::vector_length_in_bytes(this);
 4502     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4503   %}
 4504   ins_pipe( pipe_slow );
 4505 %}
 4506 
 4507 instruct ReplL_zero(vec dst, immL0 zero) %{
 4508   match(Set dst (ReplicateL zero));
 4509   format %{ "replicateL $dst,$zero" %}
 4510   ins_encode %{
 4511     int vlen_enc = vector_length_encoding(this);
 4512     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4513       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4514     } else {
 4515       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4516     }
 4517   %}
 4518   ins_pipe( fpu_reg_reg );
 4519 %}
 4520 
 4521 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4522   predicate(UseSSE >= 2);
 4523   match(Set dst (ReplicateL con));
 4524   format %{ "vallones $dst" %}
 4525   ins_encode %{
 4526     int vector_len = vector_length_encoding(this);
 4527     __ vallones($dst$$XMMRegister, vector_len);
 4528   %}
 4529   ins_pipe( pipe_slow );
 4530 %}
 4531 
 4532 // ====================ReplicateF=======================================
 4533 
 4534 instruct vReplF_reg(vec dst, vlRegF src) %{
 4535   predicate(UseAVX > 0);
 4536   match(Set dst (ReplicateF src));
 4537   format %{ "replicateF $dst,$src" %}
 4538   ins_encode %{
 4539     uint vlen = Matcher::vector_length(this);
 4540     int vlen_enc = vector_length_encoding(this);
 4541     if (vlen <= 4) {
 4542       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4543     } else if (VM_Version::supports_avx2()) {
 4544       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4545     } else {
 4546       assert(vlen == 8, "sanity");
 4547       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4548       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4549     }
 4550   %}
 4551   ins_pipe( pipe_slow );
 4552 %}
 4553 
 4554 instruct ReplF_reg(vec dst, vlRegF src) %{
 4555   predicate(UseAVX == 0);
 4556   match(Set dst (ReplicateF src));
 4557   format %{ "replicateF $dst,$src" %}
 4558   ins_encode %{
 4559     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4560   %}
 4561   ins_pipe( pipe_slow );
 4562 %}
 4563 
 4564 instruct ReplF_mem(vec dst, memory mem) %{
 4565   predicate(UseAVX > 0);
 4566   match(Set dst (ReplicateF (LoadF mem)));
 4567   format %{ "replicateF $dst,$mem" %}
 4568   ins_encode %{
 4569     int vlen_enc = vector_length_encoding(this);
 4570     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4571   %}
 4572   ins_pipe( pipe_slow );
 4573 %}
 4574 
// Replicate a float scalar immediate to all elements of a vector by loading it from the constant table.
 4576 instruct ReplF_imm(vec dst, immF con) %{
 4577   match(Set dst (ReplicateF con));
 4578   format %{ "replicateF $dst,$con" %}
 4579   ins_encode %{
 4580     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4581         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4582     int vlen = Matcher::vector_length_in_bytes(this);
 4583     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4584   %}
 4585   ins_pipe( pipe_slow );
 4586 %}
 4587 
 4588 instruct ReplF_zero(vec dst, immF0 zero) %{
 4589   match(Set dst (ReplicateF zero));
 4590   format %{ "replicateF $dst,$zero" %}
 4591   ins_encode %{
 4592     int vlen_enc = vector_length_encoding(this);
 4593     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4594       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4595     } else {
 4596       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4597     }
 4598   %}
 4599   ins_pipe( fpu_reg_reg );
 4600 %}
 4601 
 4602 // ====================ReplicateD=======================================
 4603 
// Replicate a double (8-byte) scalar to all elements of a vector
 4605 instruct vReplD_reg(vec dst, vlRegD src) %{
 4606   predicate(UseSSE >= 3);
 4607   match(Set dst (ReplicateD src));
 4608   format %{ "replicateD $dst,$src" %}
 4609   ins_encode %{
 4610     uint vlen = Matcher::vector_length(this);
 4611     int vlen_enc = vector_length_encoding(this);
 4612     if (vlen <= 2) {
 4613       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4614     } else if (VM_Version::supports_avx2()) {
 4615       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4616     } else {
 4617       assert(vlen == 4, "sanity");
 4618       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4619       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4620     }
 4621   %}
 4622   ins_pipe( pipe_slow );
 4623 %}
 4624 
 4625 instruct ReplD_reg(vec dst, vlRegD src) %{
 4626   predicate(UseSSE < 3);
 4627   match(Set dst (ReplicateD src));
 4628   format %{ "replicateD $dst,$src" %}
 4629   ins_encode %{
 4630     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4631   %}
 4632   ins_pipe( pipe_slow );
 4633 %}
 4634 
 4635 instruct ReplD_mem(vec dst, memory mem) %{
 4636   predicate(UseSSE >= 3);
 4637   match(Set dst (ReplicateD (LoadD mem)));
 4638   format %{ "replicateD $dst,$mem" %}
 4639   ins_encode %{
 4640     if (Matcher::vector_length(this) >= 4) {
 4641       int vlen_enc = vector_length_encoding(this);
 4642       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4643     } else {
 4644       __ movddup($dst$$XMMRegister, $mem$$Address);
 4645     }
 4646   %}
 4647   ins_pipe( pipe_slow );
 4648 %}
 4649 
// Replicate a double (8-byte) scalar immediate to all elements of a vector by loading it from the constant table.
 4651 instruct ReplD_imm(vec dst, immD con) %{
 4652   match(Set dst (ReplicateD con));
 4653   format %{ "replicateD $dst,$con" %}
 4654   ins_encode %{
 4655     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4656     int vlen = Matcher::vector_length_in_bytes(this);
 4657     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4658   %}
 4659   ins_pipe( pipe_slow );
 4660 %}
 4661 
 4662 instruct ReplD_zero(vec dst, immD0 zero) %{
 4663   match(Set dst (ReplicateD zero));
 4664   format %{ "replicateD $dst,$zero" %}
 4665   ins_encode %{
 4666     int vlen_enc = vector_length_encoding(this);
 4667     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4668       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4669     } else {
 4670       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4671     }
 4672   %}
 4673   ins_pipe( fpu_reg_reg );
 4674 %}
 4675 
 4676 // ====================VECTOR INSERT=======================================
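// Illustrative scalar equivalent of VectorInsert (assumed semantics):
//   dst = src; dst[idx] = val;   // all other lanes are preserved
// For vectors wider than 128 bits, the affected 128-bit lane is extracted,
// updated, and re-inserted (see the vextract/vinsert sequences below).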
 4677 
 4678 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4679   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4680   match(Set dst (VectorInsert (Binary dst val) idx));
 4681   format %{ "vector_insert $dst,$val,$idx" %}
 4682   ins_encode %{
 4683     assert(UseSSE >= 4, "required");
 4684     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4685 
 4686     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4687 
 4688     assert(is_integral_type(elem_bt), "");
 4689     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4690 
 4691     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4692   %}
 4693   ins_pipe( pipe_slow );
 4694 %}
 4695 
 4696 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4697   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4698   match(Set dst (VectorInsert (Binary src val) idx));
 4699   effect(TEMP vtmp);
 4700   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4701   ins_encode %{
 4702     int vlen_enc = Assembler::AVX_256bit;
 4703     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4704     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4705     int log2epr = log2(elem_per_lane);
 4706 
 4707     assert(is_integral_type(elem_bt), "sanity");
 4708     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4709 
 4710     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4711     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4712     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4713     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4714     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4715   %}
 4716   ins_pipe( pipe_slow );
 4717 %}
 4718 
 4719 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4720   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4721   match(Set dst (VectorInsert (Binary src val) idx));
 4722   effect(TEMP vtmp);
 4723   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4724   ins_encode %{
 4725     assert(UseAVX > 2, "sanity");
 4726 
 4727     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4728     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4729     int log2epr = log2(elem_per_lane);
 4730 
 4731     assert(is_integral_type(elem_bt), "");
 4732     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4733 
 4734     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4735     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4736     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4737     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4738     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4739   %}
 4740   ins_pipe( pipe_slow );
 4741 %}
 4742 
 4743 #ifdef _LP64
 4744 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4745   predicate(Matcher::vector_length(n) == 2);
 4746   match(Set dst (VectorInsert (Binary dst val) idx));
 4747   format %{ "vector_insert $dst,$val,$idx" %}
 4748   ins_encode %{
 4749     assert(UseSSE >= 4, "required");
 4750     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4751     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4752 
 4753     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4754   %}
 4755   ins_pipe( pipe_slow );
 4756 %}
 4757 
 4758 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4759   predicate(Matcher::vector_length(n) == 4);
 4760   match(Set dst (VectorInsert (Binary src val) idx));
 4761   effect(TEMP vtmp);
 4762   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4763   ins_encode %{
 4764     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4765     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4766 
 4767     uint x_idx = $idx$$constant & right_n_bits(1);
 4768     uint y_idx = ($idx$$constant >> 1) & 1;
 4769     int vlen_enc = Assembler::AVX_256bit;
 4770     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4771     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4772     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4773   %}
 4774   ins_pipe( pipe_slow );
 4775 %}
 4776 
 4777 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4778   predicate(Matcher::vector_length(n) == 8);
 4779   match(Set dst (VectorInsert (Binary src val) idx));
 4780   effect(TEMP vtmp);
 4781   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4782   ins_encode %{
 4783     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4784     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4785 
 4786     uint x_idx = $idx$$constant & right_n_bits(1);
 4787     uint y_idx = ($idx$$constant >> 1) & 3;
 4788     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4789     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4790     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4791   %}
 4792   ins_pipe( pipe_slow );
 4793 %}
 4794 #endif
 4795 
 4796 instruct insertF(vec dst, regF val, immU8 idx) %{
 4797   predicate(Matcher::vector_length(n) < 8);
 4798   match(Set dst (VectorInsert (Binary dst val) idx));
 4799   format %{ "vector_insert $dst,$val,$idx" %}
 4800   ins_encode %{
 4801     assert(UseSSE >= 4, "sanity");
 4802 
 4803     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4804     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4805 
 4806     uint x_idx = $idx$$constant & right_n_bits(2);
 4807     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4808   %}
 4809   ins_pipe( pipe_slow );
 4810 %}
 4811 
 4812 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4813   predicate(Matcher::vector_length(n) >= 8);
 4814   match(Set dst (VectorInsert (Binary src val) idx));
 4815   effect(TEMP vtmp);
 4816   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4817   ins_encode %{
 4818     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4819     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4820 
 4821     int vlen = Matcher::vector_length(this);
 4822     uint x_idx = $idx$$constant & right_n_bits(2);
 4823     if (vlen == 8) {
 4824       uint y_idx = ($idx$$constant >> 2) & 1;
 4825       int vlen_enc = Assembler::AVX_256bit;
 4826       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4827       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4828       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4829     } else {
 4830       assert(vlen == 16, "sanity");
 4831       uint y_idx = ($idx$$constant >> 2) & 3;
 4832       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4833       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4834       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4835     }
 4836   %}
 4837   ins_pipe( pipe_slow );
 4838 %}
 4839 
 4840 #ifdef _LP64
 4841 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4842   predicate(Matcher::vector_length(n) == 2);
 4843   match(Set dst (VectorInsert (Binary dst val) idx));
 4844   effect(TEMP tmp);
 4845   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4846   ins_encode %{
 4847     assert(UseSSE >= 4, "sanity");
 4848     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4849     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4850 
 4851     __ movq($tmp$$Register, $val$$XMMRegister);
 4852     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4853   %}
 4854   ins_pipe( pipe_slow );
 4855 %}
 4856 
 4857 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4858   predicate(Matcher::vector_length(n) == 4);
 4859   match(Set dst (VectorInsert (Binary src val) idx));
 4860   effect(TEMP vtmp, TEMP tmp);
 4861   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4862   ins_encode %{
 4863     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4864     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4865 
 4866     uint x_idx = $idx$$constant & right_n_bits(1);
 4867     uint y_idx = ($idx$$constant >> 1) & 1;
 4868     int vlen_enc = Assembler::AVX_256bit;
 4869     __ movq($tmp$$Register, $val$$XMMRegister);
 4870     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4871     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4872     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4873   %}
 4874   ins_pipe( pipe_slow );
 4875 %}
 4876 
 4877 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4878   predicate(Matcher::vector_length(n) == 8);
 4879   match(Set dst (VectorInsert (Binary src val) idx));
 4880   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4882   ins_encode %{
 4883     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4884     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4885 
 4886     uint x_idx = $idx$$constant & right_n_bits(1);
 4887     uint y_idx = ($idx$$constant >> 1) & 3;
 4888     __ movq($tmp$$Register, $val$$XMMRegister);
 4889     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4890     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4891     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4892   %}
 4893   ins_pipe( pipe_slow );
 4894 %}
 4895 #endif
 4896 
 4897 // ====================REDUCTION ARITHMETIC=======================================
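// Illustrative scalar equivalent of a reduction (assumed semantics), where OP
// is the matched operation (add, mul, and, or, xor, min, max):
//   dst = src1; for (int i = 0; i < vlen; i++) { dst = OP(dst, src2[i]); }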
 4898 
 4899 // =======================Int Reduction==========================================
 4900 
 4901 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4902   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4903   match(Set dst (AddReductionVI src1 src2));
 4904   match(Set dst (MulReductionVI src1 src2));
 4905   match(Set dst (AndReductionV  src1 src2));
 4906   match(Set dst ( OrReductionV  src1 src2));
 4907   match(Set dst (XorReductionV  src1 src2));
 4908   match(Set dst (MinReductionV  src1 src2));
 4909   match(Set dst (MaxReductionV  src1 src2));
 4910   effect(TEMP vtmp1, TEMP vtmp2);
 4911   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4912   ins_encode %{
 4913     int opcode = this->ideal_Opcode();
 4914     int vlen = Matcher::vector_length(this, $src2);
 4915     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4916   %}
 4917   ins_pipe( pipe_slow );
 4918 %}
 4919 
 4920 // =======================Long Reduction==========================================
 4921 
 4922 #ifdef _LP64
 4923 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4924   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4925   match(Set dst (AddReductionVL src1 src2));
 4926   match(Set dst (MulReductionVL src1 src2));
 4927   match(Set dst (AndReductionV  src1 src2));
 4928   match(Set dst ( OrReductionV  src1 src2));
 4929   match(Set dst (XorReductionV  src1 src2));
 4930   match(Set dst (MinReductionV  src1 src2));
 4931   match(Set dst (MaxReductionV  src1 src2));
 4932   effect(TEMP vtmp1, TEMP vtmp2);
 4933   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4934   ins_encode %{
 4935     int opcode = this->ideal_Opcode();
 4936     int vlen = Matcher::vector_length(this, $src2);
 4937     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4938   %}
 4939   ins_pipe( pipe_slow );
 4940 %}
 4941 
 4942 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4943   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4944   match(Set dst (AddReductionVL src1 src2));
 4945   match(Set dst (MulReductionVL src1 src2));
 4946   match(Set dst (AndReductionV  src1 src2));
 4947   match(Set dst ( OrReductionV  src1 src2));
 4948   match(Set dst (XorReductionV  src1 src2));
 4949   match(Set dst (MinReductionV  src1 src2));
 4950   match(Set dst (MaxReductionV  src1 src2));
 4951   effect(TEMP vtmp1, TEMP vtmp2);
 4952   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4953   ins_encode %{
 4954     int opcode = this->ideal_Opcode();
 4955     int vlen = Matcher::vector_length(this, $src2);
 4956     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4957   %}
 4958   ins_pipe( pipe_slow );
 4959 %}
 4960 #endif // _LP64
 4961 
 4962 // =======================Float Reduction==========================================
 4963 
 4964 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4965   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4966   match(Set dst (AddReductionVF dst src));
 4967   match(Set dst (MulReductionVF dst src));
 4968   effect(TEMP dst, TEMP vtmp);
 4969   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4970   ins_encode %{
 4971     int opcode = this->ideal_Opcode();
 4972     int vlen = Matcher::vector_length(this, $src);
 4973     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4974   %}
 4975   ins_pipe( pipe_slow );
 4976 %}
 4977 
 4978 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4979   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4980   match(Set dst (AddReductionVF dst src));
 4981   match(Set dst (MulReductionVF dst src));
 4982   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4983   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4984   ins_encode %{
 4985     int opcode = this->ideal_Opcode();
 4986     int vlen = Matcher::vector_length(this, $src);
 4987     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4988   %}
 4989   ins_pipe( pipe_slow );
 4990 %}
 4991 
 4992 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4993   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4994   match(Set dst (AddReductionVF dst src));
 4995   match(Set dst (MulReductionVF dst src));
 4996   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4997   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4998   ins_encode %{
 4999     int opcode = this->ideal_Opcode();
 5000     int vlen = Matcher::vector_length(this, $src);
 5001     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5002   %}
 5003   ins_pipe( pipe_slow );
 5004 %}
 5005 
 5006 // =======================Double Reduction==========================================
 5007 
 5008 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 5009   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 5010   match(Set dst (AddReductionVD dst src));
 5011   match(Set dst (MulReductionVD dst src));
 5012   effect(TEMP dst, TEMP vtmp);
 5013   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 5014   ins_encode %{
 5015     int opcode = this->ideal_Opcode();
 5016     int vlen = Matcher::vector_length(this, $src);
 5017     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 5018 %}
 5019   ins_pipe( pipe_slow );
 5020 %}
 5021 
 5022 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 5023   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 5024   match(Set dst (AddReductionVD dst src));
 5025   match(Set dst (MulReductionVD dst src));
 5026   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5027   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5028   ins_encode %{
 5029     int opcode = this->ideal_Opcode();
 5030     int vlen = Matcher::vector_length(this, $src);
 5031     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5032   %}
 5033   ins_pipe( pipe_slow );
 5034 %}
 5035 
 5036 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 5037   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 5038   match(Set dst (AddReductionVD dst src));
 5039   match(Set dst (MulReductionVD dst src));
 5040   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5041   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 5042   ins_encode %{
 5043     int opcode = this->ideal_Opcode();
 5044     int vlen = Matcher::vector_length(this, $src);
 5045     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5046   %}
 5047   ins_pipe( pipe_slow );
 5048 %}
 5049 
 5050 // =======================Byte Reduction==========================================
 5051 
 5052 #ifdef _LP64
 5053 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5054   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 5055   match(Set dst (AddReductionVI src1 src2));
 5056   match(Set dst (AndReductionV  src1 src2));
 5057   match(Set dst ( OrReductionV  src1 src2));
 5058   match(Set dst (XorReductionV  src1 src2));
 5059   match(Set dst (MinReductionV  src1 src2));
 5060   match(Set dst (MaxReductionV  src1 src2));
 5061   effect(TEMP vtmp1, TEMP vtmp2);
 5062   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5063   ins_encode %{
 5064     int opcode = this->ideal_Opcode();
 5065     int vlen = Matcher::vector_length(this, $src2);
 5066     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5067   %}
 5068   ins_pipe( pipe_slow );
 5069 %}
 5070 
 5071 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5072   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 5073   match(Set dst (AddReductionVI src1 src2));
 5074   match(Set dst (AndReductionV  src1 src2));
 5075   match(Set dst ( OrReductionV  src1 src2));
 5076   match(Set dst (XorReductionV  src1 src2));
 5077   match(Set dst (MinReductionV  src1 src2));
 5078   match(Set dst (MaxReductionV  src1 src2));
 5079   effect(TEMP vtmp1, TEMP vtmp2);
 5080   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5081   ins_encode %{
 5082     int opcode = this->ideal_Opcode();
 5083     int vlen = Matcher::vector_length(this, $src2);
 5084     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5085   %}
 5086   ins_pipe( pipe_slow );
 5087 %}
 5088 #endif
 5089 
 5090 // =======================Short Reduction==========================================
 5091 
 5092 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5093   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5094   match(Set dst (AddReductionVI src1 src2));
 5095   match(Set dst (MulReductionVI src1 src2));
 5096   match(Set dst (AndReductionV  src1 src2));
 5097   match(Set dst ( OrReductionV  src1 src2));
 5098   match(Set dst (XorReductionV  src1 src2));
 5099   match(Set dst (MinReductionV  src1 src2));
 5100   match(Set dst (MaxReductionV  src1 src2));
 5101   effect(TEMP vtmp1, TEMP vtmp2);
 5102   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5103   ins_encode %{
 5104     int opcode = this->ideal_Opcode();
 5105     int vlen = Matcher::vector_length(this, $src2);
 5106     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5107   %}
 5108   ins_pipe( pipe_slow );
 5109 %}
 5110 
 5111 // =======================Mul Reduction==========================================
 5112 
 5113 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5114   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5115             Matcher::vector_length(n->in(2)) <= 32); // src2
 5116   match(Set dst (MulReductionVI src1 src2));
 5117   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5118   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5119   ins_encode %{
 5120     int opcode = this->ideal_Opcode();
 5121     int vlen = Matcher::vector_length(this, $src2);
 5122     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5123   %}
 5124   ins_pipe( pipe_slow );
 5125 %}
 5126 
 5127 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5128   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5129             Matcher::vector_length(n->in(2)) == 64); // src2
 5130   match(Set dst (MulReductionVI src1 src2));
 5131   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5132   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5133   ins_encode %{
 5134     int opcode = this->ideal_Opcode();
 5135     int vlen = Matcher::vector_length(this, $src2);
 5136     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5137   %}
 5138   ins_pipe( pipe_slow );
 5139 %}
 5140 
 5141 //--------------------Min/Max Float Reduction --------------------
 5142 // Float Min/Max Reduction
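      // In the non-accumulating flavors below, the predicate requires src1 to be the
      // reduction identity (+Inf for min, -Inf for max), so the encoding can ignore it
      // and reduce src2 alone; the *_av variants further down accumulate into dst
      // instead.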
 5143 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5144                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5145   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5146             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5147              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5148             Matcher::vector_length(n->in(2)) == 2);
 5149   match(Set dst (MinReductionV src1 src2));
 5150   match(Set dst (MaxReductionV src1 src2));
 5151   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5152   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5153   ins_encode %{
 5154     assert(UseAVX > 0, "sanity");
 5155 
 5156     int opcode = this->ideal_Opcode();
 5157     int vlen = Matcher::vector_length(this, $src2);
 5158     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5159                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5160   %}
 5161   ins_pipe( pipe_slow );
 5162 %}
 5163 
 5164 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5165                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5166   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5167             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5168              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5169             Matcher::vector_length(n->in(2)) >= 4);
 5170   match(Set dst (MinReductionV src1 src2));
 5171   match(Set dst (MaxReductionV src1 src2));
 5172   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5173   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5174   ins_encode %{
 5175     assert(UseAVX > 0, "sanity");
 5176 
 5177     int opcode = this->ideal_Opcode();
 5178     int vlen = Matcher::vector_length(this, $src2);
 5179     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5180                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5181   %}
 5182   ins_pipe( pipe_slow );
 5183 %}
 5184 
 5185 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5186                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5187   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5188             Matcher::vector_length(n->in(2)) == 2);
 5189   match(Set dst (MinReductionV dst src));
 5190   match(Set dst (MaxReductionV dst src));
 5191   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5192   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5193   ins_encode %{
 5194     assert(UseAVX > 0, "sanity");
 5195 
 5196     int opcode = this->ideal_Opcode();
 5197     int vlen = Matcher::vector_length(this, $src);
 5198     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5199                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5200   %}
 5201   ins_pipe( pipe_slow );
 5202 %}
 5203 
 5204 
 5205 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5206                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5207   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5208             Matcher::vector_length(n->in(2)) >= 4);
 5209   match(Set dst (MinReductionV dst src));
 5210   match(Set dst (MaxReductionV dst src));
 5211   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5212   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5213   ins_encode %{
 5214     assert(UseAVX > 0, "sanity");
 5215 
 5216     int opcode = this->ideal_Opcode();
 5217     int vlen = Matcher::vector_length(this, $src);
 5218     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5219                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5220   %}
 5221   ins_pipe( pipe_slow );
 5222 %}
 5223 
 5224 
 5225 //--------------------Min/Max Double Reduction --------------------
 5226 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5227                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5228                             rFlagsReg cr) %{
 5229   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5230             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5231              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5232             Matcher::vector_length(n->in(2)) == 2);
 5233   match(Set dst (MinReductionV src1 src2));
 5234   match(Set dst (MaxReductionV src1 src2));
 5235   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5236   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5237   ins_encode %{
 5238     assert(UseAVX > 0, "sanity");
 5239 
 5240     int opcode = this->ideal_Opcode();
 5241     int vlen = Matcher::vector_length(this, $src2);
 5242     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5243                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5244   %}
 5245   ins_pipe( pipe_slow );
 5246 %}
 5247 
 5248 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5249                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5250                            rFlagsReg cr) %{
 5251   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5252             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5253              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5254             Matcher::vector_length(n->in(2)) >= 4);
 5255   match(Set dst (MinReductionV src1 src2));
 5256   match(Set dst (MaxReductionV src1 src2));
 5257   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5258   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5259   ins_encode %{
 5260     assert(UseAVX > 0, "sanity");
 5261 
 5262     int opcode = this->ideal_Opcode();
 5263     int vlen = Matcher::vector_length(this, $src2);
 5264     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5265                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5266   %}
 5267   ins_pipe( pipe_slow );
 5268 %}
 5269 
 5270 
 5271 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5272                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5273                                rFlagsReg cr) %{
 5274   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5275             Matcher::vector_length(n->in(2)) == 2);
 5276   match(Set dst (MinReductionV dst src));
 5277   match(Set dst (MaxReductionV dst src));
 5278   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5279   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5280   ins_encode %{
 5281     assert(UseAVX > 0, "sanity");
 5282 
 5283     int opcode = this->ideal_Opcode();
 5284     int vlen = Matcher::vector_length(this, $src);
 5285     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5286                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5287   %}
 5288   ins_pipe( pipe_slow );
 5289 %}
 5290 
 5291 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5292                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5293                               rFlagsReg cr) %{
 5294   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5295             Matcher::vector_length(n->in(2)) >= 4);
 5296   match(Set dst (MinReductionV dst src));
 5297   match(Set dst (MaxReductionV dst src));
 5298   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5299   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5300   ins_encode %{
 5301     assert(UseAVX > 0, "sanity");
 5302 
 5303     int opcode = this->ideal_Opcode();
 5304     int vlen = Matcher::vector_length(this, $src);
 5305     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5306                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5307   %}
 5308   ins_pipe( pipe_slow );
 5309 %}
 5310 
 5311 // ====================VECTOR ARITHMETIC=======================================
 5312 
 5313 // --------------------------------- ADD --------------------------------------
 5314 
 5315 // Bytes vector add
 5316 instruct vaddB(vec dst, vec src) %{
 5317   predicate(UseAVX == 0);
 5318   match(Set dst (AddVB dst src));
 5319   format %{ "paddb   $dst,$src\t! add packedB" %}
 5320   ins_encode %{
 5321     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5322   %}
 5323   ins_pipe( pipe_slow );
 5324 %}
 5325 
 5326 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5327   predicate(UseAVX > 0);
 5328   match(Set dst (AddVB src1 src2));
 5329   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5330   ins_encode %{
 5331     int vlen_enc = vector_length_encoding(this);
 5332     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5333   %}
 5334   ins_pipe( pipe_slow );
 5335 %}
 5336 
 5337 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5338   predicate((UseAVX > 0) &&
 5339             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5340   match(Set dst (AddVB src (LoadVector mem)));
 5341   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5342   ins_encode %{
 5343     int vlen_enc = vector_length_encoding(this);
 5344     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5345   %}
 5346   ins_pipe( pipe_slow );
 5347 %}
 5348 
 5349 // Shorts/Chars vector add
 5350 instruct vaddS(vec dst, vec src) %{
 5351   predicate(UseAVX == 0);
 5352   match(Set dst (AddVS dst src));
 5353   format %{ "paddw   $dst,$src\t! add packedS" %}
 5354   ins_encode %{
 5355     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5356   %}
 5357   ins_pipe( pipe_slow );
 5358 %}
 5359 
 5360 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5361   predicate(UseAVX > 0);
 5362   match(Set dst (AddVS src1 src2));
 5363   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5364   ins_encode %{
 5365     int vlen_enc = vector_length_encoding(this);
 5366     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5367   %}
 5368   ins_pipe( pipe_slow );
 5369 %}
 5370 
 5371 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5372   predicate((UseAVX > 0) &&
 5373             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5374   match(Set dst (AddVS src (LoadVector mem)));
 5375   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5376   ins_encode %{
 5377     int vlen_enc = vector_length_encoding(this);
 5378     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5379   %}
 5380   ins_pipe( pipe_slow );
 5381 %}
 5382 
 5383 // Integers vector add
 5384 instruct vaddI(vec dst, vec src) %{
 5385   predicate(UseAVX == 0);
 5386   match(Set dst (AddVI dst src));
 5387   format %{ "paddd   $dst,$src\t! add packedI" %}
 5388   ins_encode %{
 5389     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5390   %}
 5391   ins_pipe( pipe_slow );
 5392 %}
 5393 
 5394 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5395   predicate(UseAVX > 0);
 5396   match(Set dst (AddVI src1 src2));
 5397   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5398   ins_encode %{
 5399     int vlen_enc = vector_length_encoding(this);
 5400     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5401   %}
 5402   ins_pipe( pipe_slow );
 5403 %}
 5404 
 5405 
 5406 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5407   predicate((UseAVX > 0) &&
 5408             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5409   match(Set dst (AddVI src (LoadVector mem)));
 5410   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5411   ins_encode %{
 5412     int vlen_enc = vector_length_encoding(this);
 5413     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5414   %}
 5415   ins_pipe( pipe_slow );
 5416 %}
 5417 
 5418 // Longs vector add
 5419 instruct vaddL(vec dst, vec src) %{
 5420   predicate(UseAVX == 0);
 5421   match(Set dst (AddVL dst src));
 5422   format %{ "paddq   $dst,$src\t! add packedL" %}
 5423   ins_encode %{
 5424     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5425   %}
 5426   ins_pipe( pipe_slow );
 5427 %}
 5428 
 5429 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5430   predicate(UseAVX > 0);
 5431   match(Set dst (AddVL src1 src2));
 5432   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5433   ins_encode %{
 5434     int vlen_enc = vector_length_encoding(this);
 5435     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5436   %}
 5437   ins_pipe( pipe_slow );
 5438 %}
 5439 
 5440 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5441   predicate((UseAVX > 0) &&
 5442             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5443   match(Set dst (AddVL src (LoadVector mem)));
 5444   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5445   ins_encode %{
 5446     int vlen_enc = vector_length_encoding(this);
 5447     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5448   %}
 5449   ins_pipe( pipe_slow );
 5450 %}
 5451 
 5452 // Floats vector add
 5453 instruct vaddF(vec dst, vec src) %{
 5454   predicate(UseAVX == 0);
 5455   match(Set dst (AddVF dst src));
 5456   format %{ "addps   $dst,$src\t! add packedF" %}
 5457   ins_encode %{
 5458     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5459   %}
 5460   ins_pipe( pipe_slow );
 5461 %}
 5462 
 5463 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5464   predicate(UseAVX > 0);
 5465   match(Set dst (AddVF src1 src2));
 5466   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5467   ins_encode %{
 5468     int vlen_enc = vector_length_encoding(this);
 5469     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5470   %}
 5471   ins_pipe( pipe_slow );
 5472 %}
 5473 
 5474 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5475   predicate((UseAVX > 0) &&
 5476             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5477   match(Set dst (AddVF src (LoadVector mem)));
 5478   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5479   ins_encode %{
 5480     int vlen_enc = vector_length_encoding(this);
 5481     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5482   %}
 5483   ins_pipe( pipe_slow );
 5484 %}
 5485 
 5486 // Doubles vector add
 5487 instruct vaddD(vec dst, vec src) %{
 5488   predicate(UseAVX == 0);
 5489   match(Set dst (AddVD dst src));
 5490   format %{ "addpd   $dst,$src\t! add packedD" %}
 5491   ins_encode %{
 5492     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5493   %}
 5494   ins_pipe( pipe_slow );
 5495 %}
 5496 
 5497 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5498   predicate(UseAVX > 0);
 5499   match(Set dst (AddVD src1 src2));
 5500   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5501   ins_encode %{
 5502     int vlen_enc = vector_length_encoding(this);
 5503     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5509   predicate((UseAVX > 0) &&
 5510             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5511   match(Set dst (AddVD src (LoadVector mem)));
 5512   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5513   ins_encode %{
 5514     int vlen_enc = vector_length_encoding(this);
 5515     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5516   %}
 5517   ins_pipe( pipe_slow );
 5518 %}
 5519 
 5520 // --------------------------------- SUB --------------------------------------
 5521 
 5522 // Bytes vector sub
 5523 instruct vsubB(vec dst, vec src) %{
 5524   predicate(UseAVX == 0);
 5525   match(Set dst (SubVB dst src));
 5526   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5527   ins_encode %{
 5528     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5529   %}
 5530   ins_pipe( pipe_slow );
 5531 %}
 5532 
 5533 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5534   predicate(UseAVX > 0);
 5535   match(Set dst (SubVB src1 src2));
 5536   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5537   ins_encode %{
 5538     int vlen_enc = vector_length_encoding(this);
 5539     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5540   %}
 5541   ins_pipe( pipe_slow );
 5542 %}
 5543 
 5544 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5545   predicate((UseAVX > 0) &&
 5546             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5547   match(Set dst (SubVB src (LoadVector mem)));
 5548   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5549   ins_encode %{
 5550     int vlen_enc = vector_length_encoding(this);
 5551     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5552   %}
 5553   ins_pipe( pipe_slow );
 5554 %}
 5555 
 5556 // Shorts/Chars vector sub
 5557 instruct vsubS(vec dst, vec src) %{
 5558   predicate(UseAVX == 0);
 5559   match(Set dst (SubVS dst src));
 5560   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5561   ins_encode %{
 5562     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5563   %}
 5564   ins_pipe( pipe_slow );
 5565 %}
 5566 
 5567 
 5568 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5569   predicate(UseAVX > 0);
 5570   match(Set dst (SubVS src1 src2));
 5571   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5572   ins_encode %{
 5573     int vlen_enc = vector_length_encoding(this);
 5574     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5575   %}
 5576   ins_pipe( pipe_slow );
 5577 %}
 5578 
 5579 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5580   predicate((UseAVX > 0) &&
 5581             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5582   match(Set dst (SubVS src (LoadVector mem)));
 5583   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5584   ins_encode %{
 5585     int vlen_enc = vector_length_encoding(this);
 5586     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5587   %}
 5588   ins_pipe( pipe_slow );
 5589 %}
 5590 
 5591 // Integers vector sub
 5592 instruct vsubI(vec dst, vec src) %{
 5593   predicate(UseAVX == 0);
 5594   match(Set dst (SubVI dst src));
 5595   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5596   ins_encode %{
 5597     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5598   %}
 5599   ins_pipe( pipe_slow );
 5600 %}
 5601 
 5602 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5603   predicate(UseAVX > 0);
 5604   match(Set dst (SubVI src1 src2));
 5605   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5606   ins_encode %{
 5607     int vlen_enc = vector_length_encoding(this);
 5608     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5609   %}
 5610   ins_pipe( pipe_slow );
 5611 %}
 5612 
 5613 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5614   predicate((UseAVX > 0) &&
 5615             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5616   match(Set dst (SubVI src (LoadVector mem)));
 5617   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5618   ins_encode %{
 5619     int vlen_enc = vector_length_encoding(this);
 5620     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5621   %}
 5622   ins_pipe( pipe_slow );
 5623 %}
 5624 
 5625 // Longs vector sub
 5626 instruct vsubL(vec dst, vec src) %{
 5627   predicate(UseAVX == 0);
 5628   match(Set dst (SubVL dst src));
 5629   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5630   ins_encode %{
 5631     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5632   %}
 5633   ins_pipe( pipe_slow );
 5634 %}
 5635 
 5636 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5637   predicate(UseAVX > 0);
 5638   match(Set dst (SubVL src1 src2));
 5639   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5640   ins_encode %{
 5641     int vlen_enc = vector_length_encoding(this);
 5642     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5643   %}
 5644   ins_pipe( pipe_slow );
 5645 %}
 5646 
 5647 
 5648 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5649   predicate((UseAVX > 0) &&
 5650             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5651   match(Set dst (SubVL src (LoadVector mem)));
 5652   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5653   ins_encode %{
 5654     int vlen_enc = vector_length_encoding(this);
 5655     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5656   %}
 5657   ins_pipe( pipe_slow );
 5658 %}
 5659 
 5660 // Floats vector sub
 5661 instruct vsubF(vec dst, vec src) %{
 5662   predicate(UseAVX == 0);
 5663   match(Set dst (SubVF dst src));
 5664   format %{ "subps   $dst,$src\t! sub packedF" %}
 5665   ins_encode %{
 5666     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5667   %}
 5668   ins_pipe( pipe_slow );
 5669 %}
 5670 
 5671 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5672   predicate(UseAVX > 0);
 5673   match(Set dst (SubVF src1 src2));
 5674   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5675   ins_encode %{
 5676     int vlen_enc = vector_length_encoding(this);
 5677     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5678   %}
 5679   ins_pipe( pipe_slow );
 5680 %}
 5681 
 5682 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5683   predicate((UseAVX > 0) &&
 5684             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5685   match(Set dst (SubVF src (LoadVector mem)));
 5686   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5687   ins_encode %{
 5688     int vlen_enc = vector_length_encoding(this);
 5689     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5690   %}
 5691   ins_pipe( pipe_slow );
 5692 %}
 5693 
 5694 // Doubles vector sub
 5695 instruct vsubD(vec dst, vec src) %{
 5696   predicate(UseAVX == 0);
 5697   match(Set dst (SubVD dst src));
 5698   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5699   ins_encode %{
 5700     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5701   %}
 5702   ins_pipe( pipe_slow );
 5703 %}
 5704 
 5705 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5706   predicate(UseAVX > 0);
 5707   match(Set dst (SubVD src1 src2));
 5708   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5709   ins_encode %{
 5710     int vlen_enc = vector_length_encoding(this);
 5711     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5712   %}
 5713   ins_pipe( pipe_slow );
 5714 %}
 5715 
 5716 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5717   predicate((UseAVX > 0) &&
 5718             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5719   match(Set dst (SubVD src (LoadVector mem)));
 5720   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5721   ins_encode %{
 5722     int vlen_enc = vector_length_encoding(this);
 5723     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5724   %}
 5725   ins_pipe( pipe_slow );
 5726 %}
 5727 
 5728 // --------------------------------- MUL --------------------------------------
 5729 
 5730 // Byte vector mul
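      // x86 has no byte-lane multiply instruction, so the patterns below widen bytes
      // to 16-bit lanes, multiply with pmullw, and repack the low bytes. A scalar
      // sketch (illustrative only) of the even/odd split used by vmulB/vmulB_reg,
      // with all arithmetic in unsigned 16-bit words:
      //
      //   // per 16-bit word w1 of src1 and w2 of src2:
      //   odd  = ((w1 >> 8) * (w2 >> 8)) << 8;   // product of the two high bytes
      //   even = ((w1 * w2) << 8) >> 8;          // low byte of the product, zero-extended
      //   dst  = odd | even;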
 5731 instruct vmul8B(vec dst, vec src1, vec src2, vec xtmp) %{
 5732   predicate(Matcher::vector_length_in_bytes(n) <= 8);
 5733   match(Set dst (MulVB src1 src2));
 5734   effect(TEMP dst, TEMP xtmp);
 5735   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5736   ins_encode %{
 5737     assert(UseSSE > 3, "required");
 5738     __ pmovsxbw($dst$$XMMRegister, $src1$$XMMRegister);
 5739     __ pmovsxbw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5740     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5741     __ psllw($dst$$XMMRegister, 8);
 5742     __ psrlw($dst$$XMMRegister, 8);
 5743     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5744   %}
 5745   ins_pipe( pipe_slow );
 5746 %}
 5747 
 5748 instruct vmulB(vec dst, vec src1, vec src2, vec xtmp) %{
 5749   predicate(UseAVX == 0 && Matcher::vector_length_in_bytes(n) > 8);
 5750   match(Set dst (MulVB src1 src2));
 5751   effect(TEMP dst, TEMP xtmp);
 5752   format %{ "mulVB   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5753   ins_encode %{
 5754     assert(UseSSE > 3, "required");
 5755     // Odd-index elements
 5756     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
 5757     __ psrlw($dst$$XMMRegister, 8);
 5758     __ movdqu($xtmp$$XMMRegister, $src2$$XMMRegister);
 5759     __ psrlw($xtmp$$XMMRegister, 8);
 5760     __ pmullw($dst$$XMMRegister, $xtmp$$XMMRegister);
 5761     __ psllw($dst$$XMMRegister, 8);
 5762     // Even-index elements
 5763     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5764     __ pmullw($xtmp$$XMMRegister, $src2$$XMMRegister);
 5765     __ psllw($xtmp$$XMMRegister, 8);
 5766     __ psrlw($xtmp$$XMMRegister, 8);
 5767     // Combine
 5768     __ por($dst$$XMMRegister, $xtmp$$XMMRegister);
 5769   %}
 5770   ins_pipe( pipe_slow );
 5771 %}
 5772 
 5773 instruct vmulB_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5774   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) > 8);
 5775   match(Set dst (MulVB src1 src2));
 5776   effect(TEMP xtmp1, TEMP xtmp2);
 5777   format %{ "vmulVB  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5778   ins_encode %{
 5779     int vlen_enc = vector_length_encoding(this);
 5780     // Odd-index elements
 5781     __ vpsrlw($xtmp2$$XMMRegister, $src1$$XMMRegister, 8, vlen_enc);
 5782     __ vpsrlw($xtmp1$$XMMRegister, $src2$$XMMRegister, 8, vlen_enc);
 5783     __ vpmullw($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5784     __ vpsllw($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 8, vlen_enc);
 5785     // Even-index elements
 5786     __ vpmullw($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5787     __ vpsllw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5788     __ vpsrlw($xtmp1$$XMMRegister, $xtmp1$$XMMRegister, 8, vlen_enc);
 5789     // Combine
 5790     __ vpor($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5791   %}
 5792   ins_pipe( pipe_slow );
 5793 %}
 5794 
 5795 // Shorts/Chars vector mul
 5796 instruct vmulS(vec dst, vec src) %{
 5797   predicate(UseAVX == 0);
 5798   match(Set dst (MulVS dst src));
 5799   format %{ "pmullw  $dst,$src\t! mul packedS" %}
 5800   ins_encode %{
 5801     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5802   %}
 5803   ins_pipe( pipe_slow );
 5804 %}
 5805 
 5806 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5807   predicate(UseAVX > 0);
 5808   match(Set dst (MulVS src1 src2));
 5809   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5810   ins_encode %{
 5811     int vlen_enc = vector_length_encoding(this);
 5812     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5813   %}
 5814   ins_pipe( pipe_slow );
 5815 %}
 5816 
 5817 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5818   predicate((UseAVX > 0) &&
 5819             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5820   match(Set dst (MulVS src (LoadVector mem)));
 5821   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5822   ins_encode %{
 5823     int vlen_enc = vector_length_encoding(this);
 5824     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5825   %}
 5826   ins_pipe( pipe_slow );
 5827 %}
 5828 
 5829 // Integers vector mul
 5830 instruct vmulI(vec dst, vec src) %{
 5831   predicate(UseAVX == 0);
 5832   match(Set dst (MulVI dst src));
 5833   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5834   ins_encode %{
 5835     assert(UseSSE > 3, "required");
 5836     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5837   %}
 5838   ins_pipe( pipe_slow );
 5839 %}
 5840 
 5841 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5842   predicate(UseAVX > 0);
 5843   match(Set dst (MulVI src1 src2));
 5844   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5845   ins_encode %{
 5846     int vlen_enc = vector_length_encoding(this);
 5847     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5848   %}
 5849   ins_pipe( pipe_slow );
 5850 %}
 5851 
 5852 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5853   predicate((UseAVX > 0) &&
 5854             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5855   match(Set dst (MulVI src (LoadVector mem)));
 5856   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5857   ins_encode %{
 5858     int vlen_enc = vector_length_encoding(this);
 5859     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5860   %}
 5861   ins_pipe( pipe_slow );
 5862 %}
 5863 
 5864 // Longs vector mul
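      // A full 64-bit lane multiply (vpmullq) is only available with AVX512DQ (plus
      // AVX512VL for vectors shorter than 512 bits); elsewhere it is synthesized from
      // 32-bit multiplies, see vmulL/vmulL_reg below.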
 5865 instruct evmulL_reg(vec dst, vec src1, vec src2) %{
 5866   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5867              VM_Version::supports_avx512dq()) ||
 5868             VM_Version::supports_avx512vldq());
 5869   match(Set dst (MulVL src1 src2));
 5870   format %{ "evpmullq $dst,$src1,$src2\t! mul packedL" %}
 5871   ins_encode %{
 5872     assert(UseAVX > 2, "required");
 5873     int vlen_enc = vector_length_encoding(this);
 5874     __ evpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5875   %}
 5876   ins_pipe( pipe_slow );
 5877 %}
 5878 
 5879 instruct evmulL_mem(vec dst, vec src, memory mem) %{
 5880   predicate((Matcher::vector_length_in_bytes(n) == 64 &&
 5881              VM_Version::supports_avx512dq()) ||
 5882             (Matcher::vector_length_in_bytes(n) > 8 &&
 5883              VM_Version::supports_avx512vldq()));
 5884   match(Set dst (MulVL src (LoadVector mem)));
 5885   format %{ "evpmullq $dst,$src,$mem\t! mul packedL" %}
 5886   ins_encode %{
 5887     assert(UseAVX > 2, "required");
 5888     int vlen_enc = vector_length_encoding(this);
 5889     __ evpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5890   %}
 5891   ins_pipe( pipe_slow );
 5892 %}
 5893 
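      // The synthesized form uses the usual split into 32-bit halves: writing
      // a = a_hi*2^32 + a_lo and b likewise, the low 64 bits of a*b are
      //
      //   ((a_hi*b_lo + a_lo*b_hi) << 32) + a_lo*b_lo
      //
      // since a_hi*b_hi*2^64 vanishes mod 2^64, and only the low 32 bits of the
      // cross-term sum survive the shift -- hence the pshufd/pmulld dance below.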
 5894 instruct vmulL(vec dst, vec src1, vec src2, vec xtmp) %{
 5895   predicate(UseAVX == 0);
 5896   match(Set dst (MulVL src1 src2));
 5897   effect(TEMP dst, TEMP xtmp);
 5898   format %{ "mulVL   $dst, $src1, $src2\t! using $xtmp as TEMP" %}
 5899   ins_encode %{
 5900     assert(VM_Version::supports_sse4_1(), "required");
 5901     // Get the lo-hi products; only the lower 32 bits are of concern
 5902     __ pshufd($xtmp$$XMMRegister, $src2$$XMMRegister, 0xB1);
 5903     __ pmulld($xtmp$$XMMRegister, $src1$$XMMRegister);
 5904     __ pshufd($dst$$XMMRegister, $xtmp$$XMMRegister, 0xB1);
 5905     __ paddd($dst$$XMMRegister, $xtmp$$XMMRegister);
 5906     __ psllq($dst$$XMMRegister, 32);
 5907     // Get the lo-lo products
 5908     __ movdqu($xtmp$$XMMRegister, $src1$$XMMRegister);
 5909     __ pmuludq($xtmp$$XMMRegister, $src2$$XMMRegister);
 5910     __ paddq($dst$$XMMRegister, $xtmp$$XMMRegister);
 5911   %}
 5912   ins_pipe( pipe_slow );
 5913 %}
 5914 
 5915 instruct vmulL_reg(vec dst, vec src1, vec src2, vec xtmp1, vec xtmp2) %{
 5916   predicate(UseAVX > 0 &&
 5917             ((Matcher::vector_length_in_bytes(n) == 64 &&
 5918               !VM_Version::supports_avx512dq()) ||
 5919              (Matcher::vector_length_in_bytes(n) < 64 &&
 5920               !VM_Version::supports_avx512vldq())));
 5921   match(Set dst (MulVL src1 src2));
 5922   effect(TEMP xtmp1, TEMP xtmp2);
 5923   format %{ "vmulVL  $dst, $src1, $src2\t! using $xtmp1, $xtmp2 as TEMP" %}
 5924   ins_encode %{
 5925     int vlen_enc = vector_length_encoding(this);
 5926     // Get the lo-hi products; only the lower 32 bits are of concern
 5927     __ vpshufd($xtmp1$$XMMRegister, $src2$$XMMRegister, 0xB1, vlen_enc);
 5928     __ vpmulld($xtmp1$$XMMRegister, $src1$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5929     __ vpshufd($xtmp2$$XMMRegister, $xtmp1$$XMMRegister, 0xB1, vlen_enc);
 5930     __ vpaddd($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, $xtmp1$$XMMRegister, vlen_enc);
 5931     __ vpsllq($xtmp2$$XMMRegister, $xtmp2$$XMMRegister, 32, vlen_enc);
 5932     // Get the lo-lo products
 5933     __ vpmuludq($xtmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5934     __ vpaddq($dst$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, vlen_enc);
 5935   %}
 5936   ins_pipe( pipe_slow );
 5937 %}
 5938 
 5939 // Floats vector mul
 5940 instruct vmulF(vec dst, vec src) %{
 5941   predicate(UseAVX == 0);
 5942   match(Set dst (MulVF dst src));
 5943   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5944   ins_encode %{
 5945     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5946   %}
 5947   ins_pipe( pipe_slow );
 5948 %}
 5949 
 5950 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5951   predicate(UseAVX > 0);
 5952   match(Set dst (MulVF src1 src2));
 5953   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5954   ins_encode %{
 5955     int vlen_enc = vector_length_encoding(this);
 5956     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5957   %}
 5958   ins_pipe( pipe_slow );
 5959 %}
 5960 
 5961 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5962   predicate((UseAVX > 0) &&
 5963             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5964   match(Set dst (MulVF src (LoadVector mem)));
 5965   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5966   ins_encode %{
 5967     int vlen_enc = vector_length_encoding(this);
 5968     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5969   %}
 5970   ins_pipe( pipe_slow );
 5971 %}
 5972 
 5973 // Doubles vector mul
 5974 instruct vmulD(vec dst, vec src) %{
 5975   predicate(UseAVX == 0);
 5976   match(Set dst (MulVD dst src));
 5977   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5978   ins_encode %{
 5979     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5980   %}
 5981   ins_pipe( pipe_slow );
 5982 %}
 5983 
 5984 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5985   predicate(UseAVX > 0);
 5986   match(Set dst (MulVD src1 src2));
 5987   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5988   ins_encode %{
 5989     int vlen_enc = vector_length_encoding(this);
 5990     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5991   %}
 5992   ins_pipe( pipe_slow );
 5993 %}
 5994 
 5995 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5996   predicate((UseAVX > 0) &&
 5997             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5998   match(Set dst (MulVD src (LoadVector mem)));
 5999   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 6000   ins_encode %{
 6001     int vlen_enc = vector_length_encoding(this);
 6002     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6003   %}
 6004   ins_pipe( pipe_slow );
 6005 %}
 6006 
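      // Vector conditional move: vcmpps/vcmppd produce an all-ones or all-zeros mask
      // per lane, which vblendvps/vblendvpd then use to select between the two
      // sources lane by lane.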
 6007 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 6008   predicate(Matcher::vector_length(n) == 8);
 6009   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
 6010   effect(TEMP dst, USE src1, USE src2);
 6011   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
 6012             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
 6013          %}
 6014   ins_encode %{
 6015     assert(UseAVX > 0, "required");
 6016 
 6017     int vlen_enc = Assembler::AVX_256bit;
 6018     int cond = (Assembler::Condition)($copnd$$cmpcode);
 6019     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 6020     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6021   %}
 6022   ins_pipe( pipe_slow );
 6023 %}
 6024 
 6025 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 6026   predicate(Matcher::vector_length(n) == 4);
 6027   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
 6028   effect(TEMP dst, USE src1, USE src2);
 6029   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
 6030             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
 6031          %}
 6032   ins_encode %{
 6033     assert(UseAVX > 0, "required");
 6034 
 6035     int vlen_enc = Assembler::AVX_256bit;
 6036     int cond = (Assembler::Condition)($copnd$$cmpcode);
 6037     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 6038     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6039   %}
 6040   ins_pipe( pipe_slow );
 6041 %}
 6042 
 6043 // --------------------------------- DIV --------------------------------------
 6044 
 6045 // Floats vector div
 6046 instruct vdivF(vec dst, vec src) %{
 6047   predicate(UseAVX == 0);
 6048   match(Set dst (DivVF dst src));
 6049   format %{ "divps   $dst,$src\t! div packedF" %}
 6050   ins_encode %{
 6051     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6052   %}
 6053   ins_pipe( pipe_slow );
 6054 %}
 6055 
 6056 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6057   predicate(UseAVX > 0);
 6058   match(Set dst (DivVF src1 src2));
 6059   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6060   ins_encode %{
 6061     int vlen_enc = vector_length_encoding(this);
 6062     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6063   %}
 6064   ins_pipe( pipe_slow );
 6065 %}
 6066 
 6067 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6068   predicate((UseAVX > 0) &&
 6069             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6070   match(Set dst (DivVF src (LoadVector mem)));
 6071   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6072   ins_encode %{
 6073     int vlen_enc = vector_length_encoding(this);
 6074     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6075   %}
 6076   ins_pipe( pipe_slow );
 6077 %}
 6078 
 6079 // Doubles vector div
 6080 instruct vdivD(vec dst, vec src) %{
 6081   predicate(UseAVX == 0);
 6082   match(Set dst (DivVD dst src));
 6083   format %{ "divpd   $dst,$src\t! div packedD" %}
 6084   ins_encode %{
 6085     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6086   %}
 6087   ins_pipe( pipe_slow );
 6088 %}
 6089 
 6090 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6091   predicate(UseAVX > 0);
 6092   match(Set dst (DivVD src1 src2));
 6093   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6094   ins_encode %{
 6095     int vlen_enc = vector_length_encoding(this);
 6096     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6097   %}
 6098   ins_pipe( pipe_slow );
 6099 %}
 6100 
 6101 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6102   predicate((UseAVX > 0) &&
 6103             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6104   match(Set dst (DivVD src (LoadVector mem)));
 6105   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6106   ins_encode %{
 6107     int vlen_enc = vector_length_encoding(this);
 6108     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6109   %}
 6110   ins_pipe( pipe_slow );
 6111 %}
 6112 
 6113 // ------------------------------ MinMax ---------------------------------------
 6114 
 6115 // Byte, Short, Int vector Min/Max
 6116 instruct minmax_reg_sse(vec dst, vec src) %{
 6117   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6118             UseAVX == 0);
 6119   match(Set dst (MinV dst src));
 6120   match(Set dst (MaxV dst src));
 6121   format %{ "vector_minmax  $dst,$src\t!  " %}
 6122   ins_encode %{
 6123     assert(UseSSE >= 4, "required");
 6124 
 6125     int opcode = this->ideal_Opcode();
 6126     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6127     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6128   %}
 6129   ins_pipe( pipe_slow );
 6130 %}
 6131 
 6132 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6133   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6134             UseAVX > 0);
 6135   match(Set dst (MinV src1 src2));
 6136   match(Set dst (MaxV src1 src2));
 6137   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6138   ins_encode %{
 6139     int opcode = this->ideal_Opcode();
 6140     int vlen_enc = vector_length_encoding(this);
 6141     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6142 
 6143     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6144   %}
 6145   ins_pipe( pipe_slow );
 6146 %}
 6147 
 6148 // Long vector Min/Max
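      // There is no packed 64-bit min/max before AVX-512 (vpminsq/vpmaxsq are
      // AVX512F, with AVX512VL for sub-512-bit vectors), so on older hardware the
      // pminmax/vpminmax helpers effectively synthesize the result with a
      // compare-and-select sequence.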
 6149 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6150   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6151             UseAVX == 0);
 6152   match(Set dst (MinV dst src));
 6153   match(Set dst (MaxV src dst));
 6154   effect(TEMP dst, TEMP tmp);
 6155   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6156   ins_encode %{
 6157     assert(UseSSE >= 4, "required");
 6158 
 6159     int opcode = this->ideal_Opcode();
 6160     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6161     assert(elem_bt == T_LONG, "sanity");
 6162 
 6163     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6164   %}
 6165   ins_pipe( pipe_slow );
 6166 %}
 6167 
 6168 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6169   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6170             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6171   match(Set dst (MinV src1 src2));
 6172   match(Set dst (MaxV src1 src2));
 6173   effect(TEMP dst);
 6174   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6175   ins_encode %{
 6176     int vlen_enc = vector_length_encoding(this);
 6177     int opcode = this->ideal_Opcode();
 6178     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6179     assert(elem_bt == T_LONG, "sanity");
 6180 
 6181     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6182   %}
 6183   ins_pipe( pipe_slow );
 6184 %}
 6185 
 6186 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6187   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6188             Matcher::vector_element_basic_type(n) == T_LONG);
 6189   match(Set dst (MinV src1 src2));
 6190   match(Set dst (MaxV src1 src2));
 6191   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6192   ins_encode %{
 6193     assert(UseAVX > 2, "required");
 6194 
 6195     int vlen_enc = vector_length_encoding(this);
 6196     int opcode = this->ideal_Opcode();
 6197     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6198     assert(elem_bt == T_LONG, "sanity");
 6199 
 6200     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6201   %}
 6202   ins_pipe( pipe_slow );
 6203 %}
 6204 
 6205 // Float/Double vector Min/Max
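      // Java Math.min/max semantics differ from raw minps/maxps: NaN must propagate
      // and -0.0 compares below +0.0, whereas the SSE/AVX instructions simply return
      // the second operand on NaN and treat the two zeros as equal. The temporaries
      // give the masm helpers room for the extra compare/blend fix-ups.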
 6206 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6207   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6208             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6209             UseAVX > 0);
 6210   match(Set dst (MinV a b));
 6211   match(Set dst (MaxV a b));
 6212   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6213   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6214   ins_encode %{
 6215     assert(UseAVX > 0, "required");
 6216 
 6217     int opcode = this->ideal_Opcode();
 6218     int vlen_enc = vector_length_encoding(this);
 6219     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6220 
 6221     __ vminmax_fp(opcode, elem_bt,
 6222                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6223                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6224   %}
 6225   ins_pipe( pipe_slow );
 6226 %}
 6227 
 6228 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6229   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6230             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6231   match(Set dst (MinV a b));
 6232   match(Set dst (MaxV a b));
 6233   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6234   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp, $ktmp as TEMP" %}
 6235   ins_encode %{
 6236     assert(UseAVX > 2, "required");
 6237 
 6238     int opcode = this->ideal_Opcode();
 6239     int vlen_enc = vector_length_encoding(this);
 6240     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6241 
 6242     __ evminmax_fp(opcode, elem_bt,
 6243                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6244                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6245   %}
 6246   ins_pipe( pipe_slow );
 6247 %}
 6248 
 6249 // --------------------------------- Signum/CopySign ---------------------------
 6250 
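      // signum(x) is -1.0 for x < 0 and +1.0 for x > 0, and returns x itself when it
      // is a zero or NaN; the zero and one operands below supply the constants needed
      // by the masm routine.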
 6251 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6252   match(Set dst (SignumF dst (Binary zero one)));
 6253   effect(KILL cr);
 6254   format %{ "signumF $dst, $dst" %}
 6255   ins_encode %{
 6256     int opcode = this->ideal_Opcode();
 6257     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6258   %}
 6259   ins_pipe( pipe_slow );
 6260 %}
 6261 
 6262 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6263   match(Set dst (SignumD dst (Binary zero one)));
 6264   effect(KILL cr);
 6265   format %{ "signumD $dst, $dst" %}
 6266   ins_encode %{
 6267     int opcode = this->ideal_Opcode();
 6268     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6269   %}
 6270   ins_pipe( pipe_slow );
 6271 %}
 6272 
 6273 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6274   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6275   match(Set dst (SignumVF src (Binary zero one)));
 6276   match(Set dst (SignumVD src (Binary zero one)));
 6277   effect(TEMP dst, TEMP xtmp1);
 6278   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6279   ins_encode %{
 6280     int opcode = this->ideal_Opcode();
 6281     int vec_enc = vector_length_encoding(this);
 6282     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6283                          $xtmp1$$XMMRegister, vec_enc);
 6284   %}
 6285   ins_pipe( pipe_slow );
 6286 %}
 6287 
 6288 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6289   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6290   match(Set dst (SignumVF src (Binary zero one)));
 6291   match(Set dst (SignumVD src (Binary zero one)));
 6292   effect(TEMP dst, TEMP ktmp1);
 6293   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6294   ins_encode %{
 6295     int opcode = this->ideal_Opcode();
 6296     int vec_enc = vector_length_encoding(this);
 6297     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6298                           $ktmp1$$KRegister, vec_enc);
 6299   %}
 6300   ins_pipe( pipe_slow );
 6301 %}
 6302 
 6303 // ---------------------------------------
 6304 // For copySign use 0xE4 as writemask for vpternlog
 6305 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6306 // C (xmm2) is set to 0x7FFFFFFF
 6307 // Wherever xmm2 is 0, we want to pick from B (sign)
 6308 // Wherever xmm2 is 1, we want to pick from A (src)
 6309 //
 6310 // A B C Result
 6311 // 0 0 0 0
 6312 // 0 0 1 0
 6313 // 0 1 0 1
 6314 // 0 1 1 0
 6315 // 1 0 0 0
 6316 // 1 0 1 1
 6317 // 1 1 0 1
 6318 // 1 1 1 1
 6319 //
 6320 // Result, read from the ABC=111 row down to the ABC=000 row, is binary 11100100 = 0xE4
 6321 // ---------------------------------------
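      // A minimal sketch (illustrative only, not part of the build) of how that imm8
      // falls out of the selector function f(A,B,C) = C ? A : B:
      //
      //   int imm8 = 0;
      //   for (int abc = 7; abc >= 0; abc--) {
      //     int A = (abc >> 2) & 1, B = (abc >> 1) & 1, C = abc & 1;
      //     imm8 = (imm8 << 1) | (C ? A : B);
      //   }
      //   // imm8 == 0xE4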
 6322 
 6323 #ifdef _LP64
 6324 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6325   match(Set dst (CopySignF dst src));
 6326   effect(TEMP tmp1, TEMP tmp2);
 6327   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6328   ins_encode %{
 6329     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6330     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6331     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6332   %}
 6333   ins_pipe( pipe_slow );
 6334 %}
 6335 
 6336 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6337   match(Set dst (CopySignD dst (Binary src zero)));
 6338   ins_cost(100);
 6339   effect(TEMP tmp1, TEMP tmp2);
 6340   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6341   ins_encode %{
 6342     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6343     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6344     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6345   %}
 6346   ins_pipe( pipe_slow );
 6347 %}
 6348 
 6349 #endif // _LP64
 6350 
 6351 //----------------------------- CompressBits/ExpandBits ------------------------
 6352 
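      // These map directly onto the BMI2 pext/pdep instructions: pext gathers the src
      // bits selected by mask into the low-order bits of dst; pdep scatters the
      // low-order src bits out to the mask positions. For example (illustrative):
      //
      //   pext(src=0b10110010, mask=0b11001100) == 0b1000       // bits 2,3,6,7 of src
      //   pdep(src=0b00001000, mask=0b11001100) == 0b10000000   // bit 3 lands at bit 7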
 6353 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6354   predicate(n->bottom_type()->isa_int());
 6355   match(Set dst (CompressBits src mask));
 6356   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6357   ins_encode %{
 6358     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6359   %}
 6360   ins_pipe( pipe_slow );
 6361 %}
 6362 
 6363 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6364   predicate(n->bottom_type()->isa_int());
 6365   match(Set dst (ExpandBits src mask));
 6366   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6367   ins_encode %{
 6368     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6369   %}
 6370   ins_pipe( pipe_slow );
 6371 %}
 6372 
 6373 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6374   predicate(n->bottom_type()->isa_int());
 6375   match(Set dst (CompressBits src (LoadI mask)));
 6376   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6377   ins_encode %{
 6378     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6379   %}
 6380   ins_pipe( pipe_slow );
 6381 %}
 6382 
 6383 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6384   predicate(n->bottom_type()->isa_int());
 6385   match(Set dst (ExpandBits src (LoadI mask)));
 6386   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6387   ins_encode %{
 6388     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6389   %}
 6390   ins_pipe( pipe_slow );
 6391 %}
 6392 
 6393 // --------------------------------- Sqrt --------------------------------------
 6394 
 6395 instruct vsqrtF_reg(vec dst, vec src) %{
 6396   match(Set dst (SqrtVF src));
 6397   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6398   ins_encode %{
 6399     assert(UseAVX > 0, "required");
 6400     int vlen_enc = vector_length_encoding(this);
 6401     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6402   %}
 6403   ins_pipe( pipe_slow );
 6404 %}
 6405 
 6406 instruct vsqrtF_mem(vec dst, memory mem) %{
 6407   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6408   match(Set dst (SqrtVF (LoadVector mem)));
 6409   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6410   ins_encode %{
 6411     assert(UseAVX > 0, "required");
 6412     int vlen_enc = vector_length_encoding(this);
 6413     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6414   %}
 6415   ins_pipe( pipe_slow );
 6416 %}
 6417 
 6418 // Floating point vector sqrt
 6419 instruct vsqrtD_reg(vec dst, vec src) %{
 6420   match(Set dst (SqrtVD src));
 6421   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6422   ins_encode %{
 6423     assert(UseAVX > 0, "required");
 6424     int vlen_enc = vector_length_encoding(this);
 6425     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6426   %}
 6427   ins_pipe( pipe_slow );
 6428 %}
 6429 
 6430 instruct vsqrtD_mem(vec dst, memory mem) %{
 6431   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6432   match(Set dst (SqrtVD (LoadVector mem)));
 6433   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6434   ins_encode %{
 6435     assert(UseAVX > 0, "required");
 6436     int vlen_enc = vector_length_encoding(this);
 6437     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6438   %}
 6439   ins_pipe( pipe_slow );
 6440 %}
 6441 
 6442 // ------------------------------ Shift ---------------------------------------
 6443 
 6444 // Left and right shift count vectors are the same on x86
 6445 // (only the low 64 bits of the xmm register are used as the count).
 6446 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6447   match(Set dst (LShiftCntV cnt));
 6448   match(Set dst (RShiftCntV cnt));
 6449   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6450   ins_encode %{
 6451     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6452   %}
 6453   ins_pipe( pipe_slow );
 6454 %}
 6455 
 6456 // Byte vector shift
 6457 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6458   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6459   match(Set dst ( LShiftVB src shift));
 6460   match(Set dst ( RShiftVB src shift));
 6461   match(Set dst (URShiftVB src shift));
 6462   effect(TEMP dst, USE src, USE shift, TEMP tmp);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6464   ins_encode %{
 6465     assert(UseSSE > 3, "required");
 6466     int opcode = this->ideal_Opcode();
 6467     bool sign = (opcode != Op_URShiftVB);
 6468     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6469     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6470     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6471     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6472     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6473   %}
 6474   ins_pipe( pipe_slow );
 6475 %}
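
// x86 has no byte-granularity vector shift, so the byte-shift rules in this
// section widen each byte lane to 16 bits, shift at word width, then mask and
// pack the low bytes back. A scalar Java sketch of one lane (the names and
// the op selector are illustrative only):
//
//   static byte shiftByteLane(byte b, int s, char op) {
//     int w = (op == 'U') ? (b & 0xFF) : b;                     // vextendbw
//     int r = (op == 'L') ? (w << s)
//                         : (op == 'R') ? (w >> s) : (w >>> s); // vshiftw
//     return (byte) (r & 0xFF);                 // pand mask + packuswb
//   }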
 6476 
 6477 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6478   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6479             UseAVX <= 1);
 6480   match(Set dst ( LShiftVB src shift));
 6481   match(Set dst ( RShiftVB src shift));
 6482   match(Set dst (URShiftVB src shift));
 6483   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6485   ins_encode %{
 6486     assert(UseSSE > 3, "required");
 6487     int opcode = this->ideal_Opcode();
 6488     bool sign = (opcode != Op_URShiftVB);
 6489     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6490     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6491     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6492     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6493     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6494     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6495     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6496     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6497     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6498   %}
 6499   ins_pipe( pipe_slow );
 6500 %}
 6501 
 6502 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6503   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6504             UseAVX > 1);
 6505   match(Set dst ( LShiftVB src shift));
 6506   match(Set dst ( RShiftVB src shift));
 6507   match(Set dst (URShiftVB src shift));
 6508   effect(TEMP dst, TEMP tmp);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6510   ins_encode %{
 6511     int opcode = this->ideal_Opcode();
 6512     bool sign = (opcode != Op_URShiftVB);
 6513     int vlen_enc = Assembler::AVX_256bit;
 6514     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6515     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6516     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6517     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6518     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6519   %}
 6520   ins_pipe( pipe_slow );
 6521 %}
 6522 
 6523 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6524   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6525   match(Set dst ( LShiftVB src shift));
 6526   match(Set dst ( RShiftVB src shift));
 6527   match(Set dst (URShiftVB src shift));
 6528   effect(TEMP dst, TEMP tmp);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6530   ins_encode %{
 6531     assert(UseAVX > 1, "required");
 6532     int opcode = this->ideal_Opcode();
 6533     bool sign = (opcode != Op_URShiftVB);
 6534     int vlen_enc = Assembler::AVX_256bit;
 6535     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6536     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6537     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6538     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6539     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6540     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6541     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6542     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6543     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6544   %}
 6545   ins_pipe( pipe_slow );
 6546 %}
 6547 
 6548 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6549   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6550   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
 6552   match(Set dst (URShiftVB src shift));
 6553   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6555   ins_encode %{
 6556     assert(UseAVX > 2, "required");
 6557     int opcode = this->ideal_Opcode();
 6558     bool sign = (opcode != Op_URShiftVB);
 6559     int vlen_enc = Assembler::AVX_512bit;
 6560     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6561     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6562     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6563     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6564     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6565     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6566     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6567     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6568     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6569     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6570     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6571     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6572   %}
 6573   ins_pipe( pipe_slow );
 6574 %}
 6575 
// A shorts vector logical right shift would produce an incorrect Java result
// for negative data, because Java converts a short to an int with sign
// extension before shifting. Char vectors are fine, since chars are unsigned
// values.
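//
// For example, in scalar Java (values illustrative):
//
//   short s = -1;                       // 0xFFFF
//   int scalar   = s >>> 2;             // sign-extends to int first: 0x3FFFFFFF
//   int lanewise = (s & 0xFFFF) >>> 2;  // a 16-bit lane shift yields 0x3FFF
//   char c = 0xFFFF;
//   int charwise = c >>> 2;             // zero-extends: 0x3FFF, lane-safe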
// Shorts/Chars vector shift
 6581 instruct vshiftS(vec dst, vec src, vec shift) %{
 6582   predicate(!n->as_ShiftV()->is_var_shift());
 6583   match(Set dst ( LShiftVS src shift));
 6584   match(Set dst ( RShiftVS src shift));
 6585   match(Set dst (URShiftVS src shift));
 6586   effect(TEMP dst, USE src, USE shift);
 6587   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6588   ins_encode %{
 6589     int opcode = this->ideal_Opcode();
 6590     if (UseAVX > 0) {
 6591       int vlen_enc = vector_length_encoding(this);
 6592       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6593     } else {
 6594       int vlen = Matcher::vector_length(this);
 6595       if (vlen == 2) {
 6596         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6597         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6598       } else if (vlen == 4) {
 6599         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6600         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6601       } else {
        assert(vlen == 8, "sanity");
 6603         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6604         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6605       }
 6606     }
 6607   %}
 6608   ins_pipe( pipe_slow );
 6609 %}
 6610 
// Integers vector shift
 6612 instruct vshiftI(vec dst, vec src, vec shift) %{
 6613   predicate(!n->as_ShiftV()->is_var_shift());
 6614   match(Set dst ( LShiftVI src shift));
 6615   match(Set dst ( RShiftVI src shift));
 6616   match(Set dst (URShiftVI src shift));
 6617   effect(TEMP dst, USE src, USE shift);
 6618   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6619   ins_encode %{
 6620     int opcode = this->ideal_Opcode();
 6621     if (UseAVX > 0) {
 6622       int vlen_enc = vector_length_encoding(this);
 6623       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6624     } else {
 6625       int vlen = Matcher::vector_length(this);
 6626       if (vlen == 2) {
 6627         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6628         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6629       } else {
 6630         assert(vlen == 4, "sanity");
 6631         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6632         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6633       }
 6634     }
 6635   %}
 6636   ins_pipe( pipe_slow );
 6637 %}
 6638 
// Integers vector constant shift
 6640 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6641   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6642   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6643   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6644   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6645   ins_encode %{
 6646     int opcode = this->ideal_Opcode();
 6647     if (UseAVX > 0) {
 6648       int vector_len = vector_length_encoding(this);
 6649       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6650     } else {
 6651       int vlen = Matcher::vector_length(this);
 6652       if (vlen == 2) {
 6653         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6654         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6655       } else {
 6656         assert(vlen == 4, "sanity");
 6657         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6658         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6659       }
 6660     }
 6661   %}
 6662   ins_pipe( pipe_slow );
 6663 %}
 6664 
 6665 // Longs vector shift
 6666 instruct vshiftL(vec dst, vec src, vec shift) %{
 6667   predicate(!n->as_ShiftV()->is_var_shift());
 6668   match(Set dst ( LShiftVL src shift));
 6669   match(Set dst (URShiftVL src shift));
 6670   effect(TEMP dst, USE src, USE shift);
 6671   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6672   ins_encode %{
 6673     int opcode = this->ideal_Opcode();
 6674     if (UseAVX > 0) {
 6675       int vlen_enc = vector_length_encoding(this);
 6676       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6677     } else {
 6678       assert(Matcher::vector_length(this) == 2, "");
 6679       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6680       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6681     }
 6682   %}
 6683   ins_pipe( pipe_slow );
 6684 %}
 6685 
 6686 // Longs vector constant shift
 6687 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6688   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6689   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6690   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6691   ins_encode %{
 6692     int opcode = this->ideal_Opcode();
 6693     if (UseAVX > 0) {
 6694       int vector_len = vector_length_encoding(this);
 6695       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6696     } else {
 6697       assert(Matcher::vector_length(this) == 2, "");
 6698       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6699       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6700     }
 6701   %}
 6702   ins_pipe( pipe_slow );
 6703 %}
 6704 
// ------------------- Arithmetic Right Shift --------------------------------
 6706 // Long vector arithmetic right shift
 6707 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6708   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6709   match(Set dst (RShiftVL src shift));
 6710   effect(TEMP dst, TEMP tmp);
 6711   format %{ "vshiftq $dst,$src,$shift" %}
 6712   ins_encode %{
 6713     uint vlen = Matcher::vector_length(this);
 6714     if (vlen == 2) {
 6715       assert(UseSSE >= 2, "required");
 6716       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6717       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6718       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6719       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6720       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6721       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6722     } else {
 6723       assert(vlen == 4, "sanity");
 6724       assert(UseAVX > 1, "required");
 6725       int vlen_enc = Assembler::AVX_256bit;
 6726       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6727       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6728       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6729       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6730       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6731     }
 6732   %}
 6733   ins_pipe( pipe_slow );
 6734 %}
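
// The SSE2/AVX2 paths above emulate the missing 64-bit arithmetic right shift
// with the classic sign-extension identity: logically shift both the value
// and the per-lane sign mask, then (x ^ m) - m re-extends the sign. A scalar
// Java check of the identity for shift counts 0..63:
//
//   long srl = x >>> s;
//   long m   = 0x8000000000000000L >>> s;   // vector_long_sign_mask, shifted
//   assert ((srl ^ m) - m) == (x >> s);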
 6735 
 6736 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6737   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6738   match(Set dst (RShiftVL src shift));
 6739   format %{ "vshiftq $dst,$src,$shift" %}
 6740   ins_encode %{
 6741     int vlen_enc = vector_length_encoding(this);
 6742     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6743   %}
 6744   ins_pipe( pipe_slow );
 6745 %}
 6746 
 6747 // ------------------- Variable Shift -----------------------------
 6748 // Byte variable shift
 6749 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6750   predicate(Matcher::vector_length(n) <= 8 &&
 6751             n->as_ShiftV()->is_var_shift() &&
 6752             !VM_Version::supports_avx512bw());
 6753   match(Set dst ( LShiftVB src shift));
 6754   match(Set dst ( RShiftVB src shift));
 6755   match(Set dst (URShiftVB src shift));
 6756   effect(TEMP dst, TEMP vtmp);
 6757   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6758   ins_encode %{
 6759     assert(UseAVX >= 2, "required");
 6760 
 6761     int opcode = this->ideal_Opcode();
 6762     int vlen_enc = Assembler::AVX_128bit;
 6763     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6764     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6765   %}
 6766   ins_pipe( pipe_slow );
 6767 %}
 6768 
 6769 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6770   predicate(Matcher::vector_length(n) == 16 &&
 6771             n->as_ShiftV()->is_var_shift() &&
 6772             !VM_Version::supports_avx512bw());
 6773   match(Set dst ( LShiftVB src shift));
 6774   match(Set dst ( RShiftVB src shift));
 6775   match(Set dst (URShiftVB src shift));
 6776   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6777   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6778   ins_encode %{
 6779     assert(UseAVX >= 2, "required");
 6780 
 6781     int opcode = this->ideal_Opcode();
 6782     int vlen_enc = Assembler::AVX_128bit;
 6783     // Shift lower half and get word result in dst
 6784     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6785 
 6786     // Shift upper half and get word result in vtmp1
 6787     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6788     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6789     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6790 
 6791     // Merge and down convert the two word results to byte in dst
 6792     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6793   %}
 6794   ins_pipe( pipe_slow );
 6795 %}
 6796 
 6797 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6798   predicate(Matcher::vector_length(n) == 32 &&
 6799             n->as_ShiftV()->is_var_shift() &&
 6800             !VM_Version::supports_avx512bw());
 6801   match(Set dst ( LShiftVB src shift));
 6802   match(Set dst ( RShiftVB src shift));
 6803   match(Set dst (URShiftVB src shift));
 6804   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6806   ins_encode %{
 6807     assert(UseAVX >= 2, "required");
 6808 
 6809     int opcode = this->ideal_Opcode();
 6810     int vlen_enc = Assembler::AVX_128bit;
 6811     // Process lower 128 bits and get result in dst
 6812     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6813     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6814     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6815     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6816     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6817 
 6818     // Process higher 128 bits and get result in vtmp3
 6819     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6820     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6821     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6822     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6823     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6824     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6825     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6826 
 6827     // Merge the two results in dst
 6828     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6829   %}
 6830   ins_pipe( pipe_slow );
 6831 %}
 6832 
 6833 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6834   predicate(Matcher::vector_length(n) <= 32 &&
 6835             n->as_ShiftV()->is_var_shift() &&
 6836             VM_Version::supports_avx512bw());
 6837   match(Set dst ( LShiftVB src shift));
 6838   match(Set dst ( RShiftVB src shift));
 6839   match(Set dst (URShiftVB src shift));
 6840   effect(TEMP dst, TEMP vtmp);
 6841   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6842   ins_encode %{
 6843     assert(UseAVX > 2, "required");
 6844 
 6845     int opcode = this->ideal_Opcode();
 6846     int vlen_enc = vector_length_encoding(this);
 6847     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6848   %}
 6849   ins_pipe( pipe_slow );
 6850 %}
 6851 
 6852 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6853   predicate(Matcher::vector_length(n) == 64 &&
 6854             n->as_ShiftV()->is_var_shift() &&
 6855             VM_Version::supports_avx512bw());
 6856   match(Set dst ( LShiftVB src shift));
 6857   match(Set dst ( RShiftVB src shift));
 6858   match(Set dst (URShiftVB src shift));
 6859   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6860   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6861   ins_encode %{
 6862     assert(UseAVX > 2, "required");
 6863 
 6864     int opcode = this->ideal_Opcode();
 6865     int vlen_enc = Assembler::AVX_256bit;
 6866     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6867     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6868     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6869     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6870     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6871   %}
 6872   ins_pipe( pipe_slow );
 6873 %}
 6874 
 6875 // Short variable shift
 6876 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6877   predicate(Matcher::vector_length(n) <= 8 &&
 6878             n->as_ShiftV()->is_var_shift() &&
 6879             !VM_Version::supports_avx512bw());
 6880   match(Set dst ( LShiftVS src shift));
 6881   match(Set dst ( RShiftVS src shift));
 6882   match(Set dst (URShiftVS src shift));
 6883   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6885   ins_encode %{
 6886     assert(UseAVX >= 2, "required");
 6887 
 6888     int opcode = this->ideal_Opcode();
 6889     bool sign = (opcode != Op_URShiftVS);
 6890     int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6893     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6894     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6895     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6896     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6897   %}
 6898   ins_pipe( pipe_slow );
 6899 %}
 6900 
 6901 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6902   predicate(Matcher::vector_length(n) == 16 &&
 6903             n->as_ShiftV()->is_var_shift() &&
 6904             !VM_Version::supports_avx512bw());
 6905   match(Set dst ( LShiftVS src shift));
 6906   match(Set dst ( RShiftVS src shift));
 6907   match(Set dst (URShiftVS src shift));
 6908   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
  format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6910   ins_encode %{
 6911     assert(UseAVX >= 2, "required");
 6912 
 6913     int opcode = this->ideal_Opcode();
 6914     bool sign = (opcode != Op_URShiftVS);
 6915     int vlen_enc = Assembler::AVX_256bit;
 6916     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6917     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6918     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6919     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6920     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6921 
 6922     // Shift upper half, with result in dst using vtmp1 as TEMP
 6923     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6924     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6925     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6926     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6927     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6928     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6929 
 6930     // Merge lower and upper half result into dst
 6931     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6932     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6933   %}
 6934   ins_pipe( pipe_slow );
 6935 %}
 6936 
 6937 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6938   predicate(n->as_ShiftV()->is_var_shift() &&
 6939             VM_Version::supports_avx512bw());
 6940   match(Set dst ( LShiftVS src shift));
 6941   match(Set dst ( RShiftVS src shift));
 6942   match(Set dst (URShiftVS src shift));
 6943   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6944   ins_encode %{
 6945     assert(UseAVX > 2, "required");
 6946 
 6947     int opcode = this->ideal_Opcode();
 6948     int vlen_enc = vector_length_encoding(this);
 6949     if (!VM_Version::supports_avx512vl()) {
 6950       vlen_enc = Assembler::AVX_512bit;
 6951     }
 6952     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6953   %}
 6954   ins_pipe( pipe_slow );
 6955 %}
 6956 
// Integer variable shift
 6958 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6959   predicate(n->as_ShiftV()->is_var_shift());
 6960   match(Set dst ( LShiftVI src shift));
 6961   match(Set dst ( RShiftVI src shift));
 6962   match(Set dst (URShiftVI src shift));
 6963   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6964   ins_encode %{
 6965     assert(UseAVX >= 2, "required");
 6966 
 6967     int opcode = this->ideal_Opcode();
 6968     int vlen_enc = vector_length_encoding(this);
 6969     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6970   %}
 6971   ins_pipe( pipe_slow );
 6972 %}
 6973 
// Long variable shift
 6975 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6976   predicate(n->as_ShiftV()->is_var_shift());
 6977   match(Set dst ( LShiftVL src shift));
 6978   match(Set dst (URShiftVL src shift));
 6979   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6980   ins_encode %{
 6981     assert(UseAVX >= 2, "required");
 6982 
 6983     int opcode = this->ideal_Opcode();
 6984     int vlen_enc = vector_length_encoding(this);
 6985     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6986   %}
 6987   ins_pipe( pipe_slow );
 6988 %}
 6989 
// Long variable arithmetic right shift
 6991 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6992   predicate(Matcher::vector_length(n) <= 4 &&
 6993             n->as_ShiftV()->is_var_shift() &&
 6994             UseAVX == 2);
 6995   match(Set dst (RShiftVL src shift));
 6996   effect(TEMP dst, TEMP vtmp);
 6997   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6998   ins_encode %{
 6999     int opcode = this->ideal_Opcode();
 7000     int vlen_enc = vector_length_encoding(this);
 7001     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 7002                  $vtmp$$XMMRegister);
 7003   %}
 7004   ins_pipe( pipe_slow );
 7005 %}
 7006 
 7007 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 7008   predicate(n->as_ShiftV()->is_var_shift() &&
 7009             UseAVX > 2);
 7010   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 7012   ins_encode %{
 7013     int opcode = this->ideal_Opcode();
 7014     int vlen_enc = vector_length_encoding(this);
 7015     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 7016   %}
 7017   ins_pipe( pipe_slow );
 7018 %}
 7019 
 7020 // --------------------------------- AND --------------------------------------
 7021 
 7022 instruct vand(vec dst, vec src) %{
 7023   predicate(UseAVX == 0);
 7024   match(Set dst (AndV dst src));
 7025   format %{ "pand    $dst,$src\t! and vectors" %}
 7026   ins_encode %{
 7027     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 7028   %}
 7029   ins_pipe( pipe_slow );
 7030 %}
 7031 
 7032 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7033   predicate(UseAVX > 0);
 7034   match(Set dst (AndV src1 src2));
 7035   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7036   ins_encode %{
 7037     int vlen_enc = vector_length_encoding(this);
 7038     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7039   %}
 7040   ins_pipe( pipe_slow );
 7041 %}
 7042 
 7043 instruct vand_mem(vec dst, vec src, memory mem) %{
 7044   predicate((UseAVX > 0) &&
 7045             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7046   match(Set dst (AndV src (LoadVector mem)));
 7047   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7048   ins_encode %{
 7049     int vlen_enc = vector_length_encoding(this);
 7050     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7051   %}
 7052   ins_pipe( pipe_slow );
 7053 %}
 7054 
 7055 // --------------------------------- OR ---------------------------------------
 7056 
 7057 instruct vor(vec dst, vec src) %{
 7058   predicate(UseAVX == 0);
 7059   match(Set dst (OrV dst src));
 7060   format %{ "por     $dst,$src\t! or vectors" %}
 7061   ins_encode %{
 7062     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7063   %}
 7064   ins_pipe( pipe_slow );
 7065 %}
 7066 
 7067 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7068   predicate(UseAVX > 0);
 7069   match(Set dst (OrV src1 src2));
 7070   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7071   ins_encode %{
 7072     int vlen_enc = vector_length_encoding(this);
 7073     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7074   %}
 7075   ins_pipe( pipe_slow );
 7076 %}
 7077 
 7078 instruct vor_mem(vec dst, vec src, memory mem) %{
 7079   predicate((UseAVX > 0) &&
 7080             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7081   match(Set dst (OrV src (LoadVector mem)));
 7082   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7083   ins_encode %{
 7084     int vlen_enc = vector_length_encoding(this);
 7085     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7086   %}
 7087   ins_pipe( pipe_slow );
 7088 %}
 7089 
 7090 // --------------------------------- XOR --------------------------------------
 7091 
 7092 instruct vxor(vec dst, vec src) %{
 7093   predicate(UseAVX == 0);
 7094   match(Set dst (XorV dst src));
 7095   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7096   ins_encode %{
 7097     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7098   %}
 7099   ins_pipe( pipe_slow );
 7100 %}
 7101 
 7102 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7103   predicate(UseAVX > 0);
 7104   match(Set dst (XorV src1 src2));
 7105   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7106   ins_encode %{
 7107     int vlen_enc = vector_length_encoding(this);
 7108     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7109   %}
 7110   ins_pipe( pipe_slow );
 7111 %}
 7112 
 7113 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7114   predicate((UseAVX > 0) &&
 7115             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7116   match(Set dst (XorV src (LoadVector mem)));
 7117   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7118   ins_encode %{
 7119     int vlen_enc = vector_length_encoding(this);
 7120     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7121   %}
 7122   ins_pipe( pipe_slow );
 7123 %}
 7124 
 7125 // --------------------------------- VectorCast --------------------------------------
 7126 
 7127 instruct vcastBtoX(vec dst, vec src) %{
 7128   match(Set dst (VectorCastB2X src));
 7129   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7130   ins_encode %{
 7131     assert(UseAVX > 0, "required");
 7132 
 7133     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7134     int vlen_enc = vector_length_encoding(this);
 7135     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7136   %}
 7137   ins_pipe( pipe_slow );
 7138 %}
 7139 
 7140 instruct castStoX(vec dst, vec src) %{
 7141   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7142             Matcher::vector_length(n->in(1)) <= 8 && // src
 7143             Matcher::vector_element_basic_type(n) == T_BYTE);
 7144   match(Set dst (VectorCastS2X src));
 7145   format %{ "vector_cast_s2x $dst,$src" %}
 7146   ins_encode %{
 7147     assert(UseAVX > 0, "required");
 7148 
 7149     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7150     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7151   %}
 7152   ins_pipe( pipe_slow );
 7153 %}
 7154 
 7155 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7156   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7157             Matcher::vector_length(n->in(1)) == 16 && // src
 7158             Matcher::vector_element_basic_type(n) == T_BYTE);
 7159   effect(TEMP dst, TEMP vtmp);
 7160   match(Set dst (VectorCastS2X src));
 7161   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7162   ins_encode %{
 7163     assert(UseAVX > 0, "required");
 7164 
 7165     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7166     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7167     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7168     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7169   %}
 7170   ins_pipe( pipe_slow );
 7171 %}
 7172 
 7173 instruct vcastStoX_evex(vec dst, vec src) %{
 7174   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7175             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7176   match(Set dst (VectorCastS2X src));
 7177   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7178   ins_encode %{
 7179     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7180     int src_vlen_enc = vector_length_encoding(this, $src);
 7181     int vlen_enc = vector_length_encoding(this);
 7182     switch (to_elem_bt) {
 7183       case T_BYTE:
 7184         if (!VM_Version::supports_avx512vl()) {
 7185           vlen_enc = Assembler::AVX_512bit;
 7186         }
 7187         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7188         break;
 7189       case T_INT:
 7190         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7191         break;
 7192       case T_FLOAT:
 7193         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7194         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7195         break;
 7196       case T_LONG:
 7197         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7198         break;
 7199       case T_DOUBLE: {
 7200         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7201         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7202         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7203         break;
 7204       }
 7205       default:
 7206         ShouldNotReachHere();
 7207     }
 7208   %}
 7209   ins_pipe( pipe_slow );
 7210 %}
 7211 
 7212 instruct castItoX(vec dst, vec src) %{
 7213   predicate(UseAVX <= 2 &&
 7214             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7215             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7216   match(Set dst (VectorCastI2X src));
 7217   format %{ "vector_cast_i2x $dst,$src" %}
 7218   ins_encode %{
 7219     assert(UseAVX > 0, "required");
 7220 
 7221     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7222     int vlen_enc = vector_length_encoding(this, $src);
 7223 
 7224     if (to_elem_bt == T_BYTE) {
 7225       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7226       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7227       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7228     } else {
 7229       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7230       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7231       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7232     }
 7233   %}
 7234   ins_pipe( pipe_slow );
 7235 %}
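
// Masking before the unsigned saturating pack is what turns vpackusdw (and
// vpackuswb) into a plain truncation: once every int lane is masked to 0xFFFF
// (or 0xFF), no lane can exceed the unsigned range, so the pack never
// saturates. A scalar Java sketch of one lane:
//
//   int v = 0x12345678;
//   int masked = v & 0xFFFF;          // vpand: 0x5678 fits an unsigned short
//   short narrowed = (short) masked;  // vpackusdw then packs it unchanged
//   assert narrowed == 0x5678;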
 7236 
 7237 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7238   predicate(UseAVX <= 2 &&
 7239             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7240             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7241   match(Set dst (VectorCastI2X src));
 7242   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7243   effect(TEMP dst, TEMP vtmp);
 7244   ins_encode %{
 7245     assert(UseAVX > 0, "required");
 7246 
 7247     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7248     int vlen_enc = vector_length_encoding(this, $src);
 7249 
 7250     if (to_elem_bt == T_BYTE) {
 7251       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7252       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7253       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7254       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7255     } else {
 7256       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7257       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7258       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7259       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7260     }
 7261   %}
 7262   ins_pipe( pipe_slow );
 7263 %}
 7264 
 7265 instruct vcastItoX_evex(vec dst, vec src) %{
 7266   predicate(UseAVX > 2 ||
 7267             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7268   match(Set dst (VectorCastI2X src));
 7269   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7270   ins_encode %{
 7271     assert(UseAVX > 0, "required");
 7272 
 7273     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7274     int src_vlen_enc = vector_length_encoding(this, $src);
 7275     int dst_vlen_enc = vector_length_encoding(this);
 7276     switch (dst_elem_bt) {
 7277       case T_BYTE:
 7278         if (!VM_Version::supports_avx512vl()) {
 7279           src_vlen_enc = Assembler::AVX_512bit;
 7280         }
 7281         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7282         break;
 7283       case T_SHORT:
 7284         if (!VM_Version::supports_avx512vl()) {
 7285           src_vlen_enc = Assembler::AVX_512bit;
 7286         }
 7287         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7288         break;
 7289       case T_FLOAT:
 7290         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7291         break;
 7292       case T_LONG:
 7293         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7294         break;
 7295       case T_DOUBLE:
 7296         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7297         break;
 7298       default:
 7299         ShouldNotReachHere();
 7300     }
 7301   %}
 7302   ins_pipe( pipe_slow );
 7303 %}
 7304 
 7305 instruct vcastLtoBS(vec dst, vec src) %{
 7306   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7307             UseAVX <= 2);
 7308   match(Set dst (VectorCastL2X src));
 7309   format %{ "vector_cast_l2x  $dst,$src" %}
 7310   ins_encode %{
 7311     assert(UseAVX > 0, "required");
 7312 
 7313     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7314     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7315     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7316                                                       : ExternalAddress(vector_int_to_short_mask());
 7317     if (vlen <= 16) {
 7318       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7319       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7320       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7321     } else {
 7322       assert(vlen <= 32, "required");
 7323       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7324       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7325       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7326       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7327     }
 7328     if (to_elem_bt == T_BYTE) {
 7329       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7330     }
 7331   %}
 7332   ins_pipe( pipe_slow );
 7333 %}
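
// The shuffle immediate 8 (0b00_00_10_00) used in the 128-bit path above
// selects dwords {0, 2, 0, 0}, so the low 32 bits of the two longs land in
// the low two dword slots, which are exactly the truncated int values Java
// requires. A scalar sketch of the per-lane truncation:
//
//   long v = 0x1122334455667788L;
//   int truncated = (int) v;          // keep the low dword: 0x55667788
//   assert truncated == 0x55667788;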
 7334 
 7335 instruct vcastLtoX_evex(vec dst, vec src) %{
 7336   predicate(UseAVX > 2 ||
 7337             (Matcher::vector_element_basic_type(n) == T_INT ||
 7338              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7339              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7340   match(Set dst (VectorCastL2X src));
 7341   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7342   ins_encode %{
 7343     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7344     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7345     int vlen_enc = vector_length_encoding(this, $src);
 7346     switch (to_elem_bt) {
 7347       case T_BYTE:
 7348         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7349           vlen_enc = Assembler::AVX_512bit;
 7350         }
 7351         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7352         break;
 7353       case T_SHORT:
 7354         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7355           vlen_enc = Assembler::AVX_512bit;
 7356         }
 7357         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7358         break;
 7359       case T_INT:
 7360         if (vlen == 8) {
 7361           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7362             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7363           }
 7364         } else if (vlen == 16) {
 7365           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7366         } else if (vlen == 32) {
 7367           if (UseAVX > 2) {
 7368             if (!VM_Version::supports_avx512vl()) {
 7369               vlen_enc = Assembler::AVX_512bit;
 7370             }
 7371             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7372           } else {
 7373             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7374             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7375           }
 7376         } else { // vlen == 64
 7377           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7378         }
 7379         break;
 7380       case T_FLOAT:
 7381         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7382         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7383         break;
 7384       case T_DOUBLE:
 7385         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7386         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7387         break;
 7388 
 7389       default: assert(false, "%s", type2name(to_elem_bt));
 7390     }
 7391   %}
 7392   ins_pipe( pipe_slow );
 7393 %}
 7394 
 7395 instruct vcastFtoD_reg(vec dst, vec src) %{
 7396   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7397   match(Set dst (VectorCastF2X src));
 7398   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7399   ins_encode %{
 7400     int vlen_enc = vector_length_encoding(this);
 7401     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7402   %}
 7403   ins_pipe( pipe_slow );
 7404 %}
 7405 
 7407 instruct castFtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7408   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7409             type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
 7410   match(Set dst (VectorCastF2X src));
 7411   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7412   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
 7413   ins_encode %{
 7414     int vlen_enc = vector_length_encoding(this, $src);
 7415     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
    // JDK-8292878 removed the need for an explicit scratch register to load addresses wider
    // than 32 bits in register-indirect addressing mode: stub constants live in the code
    // cache, and ReservedCodeCacheSize is currently capped at 2G. Targets are free to raise
    // that limit, but a code cache larger than 2G looks unreasonable in practice. On the
    // upside, the cap saves a temporary register allocation, which in the limiting case can
    // prevent spilling in blocks with high register pressure.
 7423     __ vector_castF2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7424                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7425                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7426   %}
 7427   ins_pipe( pipe_slow );
 7428 %}
 7429 
 7430 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7431   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7432             is_integral_type(Matcher::vector_element_basic_type(n)));
 7433   match(Set dst (VectorCastF2X src));
 7434   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7435   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7436   ins_encode %{
 7437     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7438     if (to_elem_bt == T_LONG) {
 7439       int vlen_enc = vector_length_encoding(this);
 7440       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7441                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7442                              ExternalAddress(vector_double_signflip()), noreg, vlen_enc);
 7443     } else {
 7444       int vlen_enc = vector_length_encoding(this, $src);
 7445       __ vector_castF2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7446                              $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7447                              ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7448     }
 7449   %}
 7450   ins_pipe( pipe_slow );
 7451 %}
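
// The signflip constants and fix-up helpers are needed because cvttps2dq-style
// conversions return the "integer indefinite" value (0x80000000) for NaN and
// out-of-range inputs, while Java (JLS 5.1.3) requires NaN -> 0 and saturation
// to MIN/MAX. Scalar Java statements the vector lanes must agree with:
//
//   assert (int) Float.NaN == 0;
//   assert (int)  1e30f == Integer.MAX_VALUE;
//   assert (int) -1e30f == Integer.MIN_VALUE;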
 7452 
 7453 instruct vcastDtoF_reg(vec dst, vec src) %{
 7454   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7455   match(Set dst (VectorCastD2X src));
 7456   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7457   ins_encode %{
 7458     int vlen_enc = vector_length_encoding(this, $src);
 7459     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7460   %}
 7461   ins_pipe( pipe_slow );
 7462 %}
 7463 
 7464 instruct castDtoX_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, vec xtmp5, rFlagsReg cr) %{
 7465   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64 &&
 7466             is_integral_type(Matcher::vector_element_basic_type(n)));
 7467   match(Set dst (VectorCastD2X src));
 7468   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP xtmp5, KILL cr);
 7469   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $xtmp5 as TEMP" %}
 7470   ins_encode %{
 7471     int vlen_enc = vector_length_encoding(this, $src);
 7472     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7473     __ vector_castD2X_avx(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7474                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister, $xtmp5$$XMMRegister,
 7475                           ExternalAddress(vector_float_signflip()), noreg, vlen_enc);
 7476   %}
 7477   ins_pipe( pipe_slow );
 7478 %}
 7479 
 7480 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7481   predicate((VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n->in(1)) == 64) &&
 7482             is_integral_type(Matcher::vector_element_basic_type(n)));
 7483   match(Set dst (VectorCastD2X src));
 7484   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7485   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
 7486   ins_encode %{
 7487     int vlen_enc = vector_length_encoding(this, $src);
 7488     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7489     AddressLiteral signflip = VM_Version::supports_avx512dq() ? ExternalAddress(vector_double_signflip()) :
 7490                               ExternalAddress(vector_float_signflip());
 7491     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7492                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister, signflip, noreg, vlen_enc);
 7493   %}
 7494   ins_pipe( pipe_slow );
 7495 %}
 7496 
 7497 instruct vucast(vec dst, vec src) %{
 7498   match(Set dst (VectorUCastB2X src));
 7499   match(Set dst (VectorUCastS2X src));
 7500   match(Set dst (VectorUCastI2X src));
 7501   format %{ "vector_ucast $dst,$src\t!" %}
 7502   ins_encode %{
 7503     assert(UseAVX > 0, "required");
 7504 
 7505     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7506     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7507     int vlen_enc = vector_length_encoding(this);
 7508     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7509   %}
 7510   ins_pipe( pipe_slow );
 7511 %}
 7512 
 7513 #ifdef _LP64
 7514 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7515   predicate(!VM_Version::supports_avx512vl() &&
 7516             Matcher::vector_length_in_bytes(n) < 64 &&
 7517             Matcher::vector_element_basic_type(n) == T_INT);
 7518   match(Set dst (RoundVF src));
 7519   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7520   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7521   ins_encode %{
 7522     int vlen_enc = vector_length_encoding(this);
 7523     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7524     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7525                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7526                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7527   %}
 7528   ins_pipe( pipe_slow );
 7529 %}
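
// One reading of the new_mxcsr constant 0x3F80: all exception mask bits
// (7..12) set and RC = 01, round toward negative infinity, matching
// Math.round's floor(x + 0.5) definition. Scalar Java semantics the vector
// lanes must reproduce:
//
//   assert Math.round(2.5f)  == 3;    // halfway cases round up
//   assert Math.round(-2.5f) == -2;   // i.e. toward positive infinity
//   assert Math.round(Float.NaN) == 0;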
 7530 
 7531 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7532   predicate((VM_Version::supports_avx512vl() ||
 7533              Matcher::vector_length_in_bytes(n) == 64) &&
 7534              Matcher::vector_element_basic_type(n) == T_INT);
 7535   match(Set dst (RoundVF src));
 7536   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7537   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7538   ins_encode %{
 7539     int vlen_enc = vector_length_encoding(this);
 7540     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7541     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7542                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7543                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7544   %}
 7545   ins_pipe( pipe_slow );
 7546 %}
 7547 
 7548 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7549   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7550   match(Set dst (RoundVD src));
  effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7552   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7553   ins_encode %{
 7554     int vlen_enc = vector_length_encoding(this);
 7555     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7556     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7557                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7558                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7559   %}
 7560   ins_pipe( pipe_slow );
 7561 %}
 7562 
 7563 #endif // _LP64
 7564 
 7565 // --------------------------------- VectorMaskCmp --------------------------------------
 7566 
 7567 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7568   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7569             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7570             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7571             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7572   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7573   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7574   ins_encode %{
 7575     int vlen_enc = vector_length_encoding(this, $src1);
 7576     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7577     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7578       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7579     } else {
 7580       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7581     }
 7582   %}
 7583   ins_pipe( pipe_slow );
 7584 %}
 7585 
 7586 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7587   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7588             n->bottom_type()->isa_vectmask() == NULL &&
 7589             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7590   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7591   effect(TEMP ktmp);
 7592   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7593   ins_encode %{
 7594     int vlen_enc = Assembler::AVX_512bit;
 7595     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7596     KRegister mask = k0; // The comparison itself is not being masked.
 7597     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7598       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7599       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7600     } else {
 7601       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7602       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7603     }
 7604   %}
 7605   ins_pipe( pipe_slow );
 7606 %}
 7607 
 7608 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7609   predicate(n->bottom_type()->isa_vectmask() &&
 7610             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7611   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7612   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7613   ins_encode %{
 7614     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7615     int vlen_enc = vector_length_encoding(this, $src1);
 7616     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7617     KRegister mask = k0; // The comparison itself is not being masked.
 7618     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7619       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7620     } else {
 7621       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7622     }
 7623   %}
 7624   ins_pipe( pipe_slow );
 7625 %}
 7626 
 7627 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7628   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7629             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7630             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7631             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7632             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7633             (n->in(2)->get_int() == BoolTest::eq ||
 7634              n->in(2)->get_int() == BoolTest::lt ||
 7635              n->in(2)->get_int() == BoolTest::gt)); // cond
 7636   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7637   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7638   ins_encode %{
 7639     int vlen_enc = vector_length_encoding(this, $src1);
 7640     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7641     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7642     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7643   %}
 7644   ins_pipe( pipe_slow );
 7645 %}
 7646 
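      // ne/le/ge have no direct packed-compare encodings; the vpcmpCCW helper
      // emits the complementary compare (eq/gt/lt) and negates the result,
      // using $xtmp as scratch for the negation.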
 7647 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7648   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7649             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7650             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7651             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7652             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7653             (n->in(2)->get_int() == BoolTest::ne ||
 7654              n->in(2)->get_int() == BoolTest::le ||
 7655              n->in(2)->get_int() == BoolTest::ge)); // cond
 7656   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7657   effect(TEMP dst, TEMP xtmp);
 7658   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7659   ins_encode %{
 7660     int vlen_enc = vector_length_encoding(this, $src1);
 7661     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7662     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7663     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7664   %}
 7665   ins_pipe( pipe_slow );
 7666 %}
 7667 
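      // There is no packed unsigned compare before AVX-512, so both operands
      // are biased by XORing in the element type's sign bit; an unsigned
      // compare of the originals is then equivalent to a signed compare of
      // the biased values.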
 7668 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7669   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7670             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7671             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7672             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7673             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7674   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7675   effect(TEMP dst, TEMP xtmp);
 7676   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7677   ins_encode %{
 7678     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7679     int vlen_enc = vector_length_encoding(this, $src1);
 7680     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7681     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7682 
 7683     if (vlen_enc == Assembler::AVX_128bit) {
 7684       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7685     } else {
 7686       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7687     }
 7688     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7689     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7690     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7691   %}
 7692   ins_pipe( pipe_slow );
 7693 %}
 7694 
 7695 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7696   predicate((n->bottom_type()->isa_vectmask() == NULL &&
 7697              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7698              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7699   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7700   effect(TEMP ktmp);
 7701   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7702   ins_encode %{
 7703     assert(UseAVX > 2, "required");
 7704 
 7705     int vlen_enc = vector_length_encoding(this, $src1);
 7706     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7707     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7708     KRegister mask = k0; // The comparison itself is not being masked.
 7709     bool merge = false;
 7710     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7711 
 7712     switch (src1_elem_bt) {
 7713       case T_INT: {
 7714         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7715         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7716         break;
 7717       }
 7718       case T_LONG: {
 7719         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7720         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7721         break;
 7722       }
 7723       default: assert(false, "%s", type2name(src1_elem_bt));
 7724     }
 7725   %}
 7726   ins_pipe( pipe_slow );
 7727 %}
 7728 
 7729 
 7730 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7731   predicate(n->bottom_type()->isa_vectmask() &&
 7732             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7733   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7734   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7735   ins_encode %{
 7736     assert(UseAVX > 2, "required");
 7737     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7738 
 7739     int vlen_enc = vector_length_encoding(this, $src1);
 7740     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7741     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7742     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7743 
 7744     // Compare directly into the destination mask register.
 7745     switch (src1_elem_bt) {
 7746       case T_BYTE: {
 7747         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7748         break;
 7749       }
 7750       case T_SHORT: {
 7751         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7752         break;
 7753       }
 7754       case T_INT: {
 7755         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7756         break;
 7757       }
 7758       case T_LONG: {
 7759         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7760         break;
 7761       }
 7762       default: assert(false, "%s", type2name(src1_elem_bt));
 7763     }
 7764   %}
 7765   ins_pipe( pipe_slow );
 7766 %}
 7767 
 7768 // Extract
 7769 
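      // Elements of vectors up to 128 bits are extracted directly (get_elem);
      // for wider vectors, the enclosing 128-bit lane is first copied into a
      // temporary (get_lane) and the element is then extracted from that lane.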
 7770 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7771   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7772   match(Set dst (ExtractI src idx));
 7773   match(Set dst (ExtractS src idx));
 7774 #ifdef _LP64
 7775   match(Set dst (ExtractB src idx));
 7776 #endif
 7777   format %{ "extractI $dst,$src,$idx\t!" %}
 7778   ins_encode %{
 7779     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7780 
 7781     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7782     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7783   %}
 7784   ins_pipe( pipe_slow );
 7785 %}
 7786 
 7787 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7788   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7789             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7790   match(Set dst (ExtractI src idx));
 7791   match(Set dst (ExtractS src idx));
 7792 #ifdef _LP64
 7793   match(Set dst (ExtractB src idx));
 7794 #endif
 7795   effect(TEMP vtmp);
 7796   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7797   ins_encode %{
 7798     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7799 
 7800     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7801     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7802     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7803   %}
 7804   ins_pipe( pipe_slow );
 7805 %}
 7806 
 7807 #ifdef _LP64
 7808 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7809   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7810   match(Set dst (ExtractL src idx));
 7811   format %{ "extractL $dst,$src,$idx\t!" %}
 7812   ins_encode %{
 7813     assert(UseSSE >= 4, "required");
 7814     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7815 
 7816     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7817   %}
 7818   ins_pipe( pipe_slow );
 7819 %}
 7820 
 7821 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7822   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7823             Matcher::vector_length(n->in(1)) == 8);  // src
 7824   match(Set dst (ExtractL src idx));
 7825   effect(TEMP vtmp);
 7826   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7827   ins_encode %{
 7828     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7829 
 7830     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7831     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7832   %}
 7833   ins_pipe( pipe_slow );
 7834 %}
 7835 #endif
 7836 
 7837 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7838   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7839   match(Set dst (ExtractF src idx));
 7840   effect(TEMP dst, TEMP vtmp);
 7841   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7842   ins_encode %{
 7843     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7844 
 7845     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7846   %}
 7847   ins_pipe( pipe_slow );
 7848 %}
 7849 
 7850 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7851   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7852             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7853   match(Set dst (ExtractF src idx));
 7854   effect(TEMP vtmp);
 7855   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7856   ins_encode %{
 7857     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7858 
 7859     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7860     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7861   %}
 7862   ins_pipe( pipe_slow );
 7863 %}
 7864 
 7865 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7866   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7867   match(Set dst (ExtractD src idx));
 7868   format %{ "extractD $dst,$src,$idx\t!" %}
 7869   ins_encode %{
 7870     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7871 
 7872     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7873   %}
 7874   ins_pipe( pipe_slow );
 7875 %}
 7876 
 7877 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7878   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7879             Matcher::vector_length(n->in(1)) == 8);  // src
 7880   match(Set dst (ExtractD src idx));
 7881   effect(TEMP vtmp);
 7882   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7883   ins_encode %{
 7884     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7885 
 7886     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7887     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7888   %}
 7889   ins_pipe( pipe_slow );
 7890 %}
 7891 
 7892 // --------------------------------- Vector Blend --------------------------------------
 7893 
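      // SSE4.1 pblendvb takes its mask implicitly in xmm0; this rule reserves
      // xmm0 ($tmp) and copies the mask there before blending.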
 7894 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7895   predicate(UseAVX == 0);
 7896   match(Set dst (VectorBlend (Binary dst src) mask));
 7897   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7898   effect(TEMP tmp);
 7899   ins_encode %{
 7900     assert(UseSSE >= 4, "required");
 7901 
 7902     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7903       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7904     }
 7905     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7906   %}
 7907   ins_pipe( pipe_slow );
 7908 %}
 7909 
 7910 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7911   predicate(UseAVX > 0 &&
 7912             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7913             Matcher::vector_length_in_bytes(n) <= 32 &&
 7914             is_integral_type(Matcher::vector_element_basic_type(n)));
 7915   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7916   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7917   ins_encode %{
 7918     int vlen_enc = vector_length_encoding(this);
 7919     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7920   %}
 7921   ins_pipe( pipe_slow );
 7922 %}
 7923 
 7924 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7925   predicate(UseAVX > 0 &&
 7926             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7927             Matcher::vector_length_in_bytes(n) <= 32 &&
 7928             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7929   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7930   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7931   ins_encode %{
 7932     int vlen_enc = vector_length_encoding(this);
 7933     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7934   %}
 7935   ins_pipe( pipe_slow );
 7936 %}
 7937 
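      // There is no 512-bit variable byte blend, so the vector mask is first
      // converted into a k register (by comparing it against all-ones) and
      // the blend is performed as a merge-masked evpblend.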
 7938 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7939   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7940             n->in(2)->bottom_type()->isa_vectmask() == NULL);
 7941   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7942   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 7943   effect(TEMP ktmp);
 7944   ins_encode %{
 7945     int vlen_enc = Assembler::AVX_512bit;
 7946     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7947     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7948     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7949   %}
 7950   ins_pipe( pipe_slow );
 7951 %}
 7952 
 7953 
 7954 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7955   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7956             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7957              VM_Version::supports_avx512bw()));
 7958   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7959   format %{ "vector_blend  $dst,$src1,$src2,$mask" %}
 7960   ins_encode %{
 7961     int vlen_enc = vector_length_encoding(this);
 7962     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7963     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7964   %}
 7965   ins_pipe( pipe_slow );
 7966 %}
 7967 
 7968 // --------------------------------- ABS --------------------------------------
 7969 // a = |a|
 7970 instruct vabsB_reg(vec dst, vec src) %{
 7971   match(Set dst (AbsVB  src));
 7972   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7973   ins_encode %{
 7974     uint vlen = Matcher::vector_length(this);
 7975     if (vlen <= 16) {
 7976       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7977     } else {
 7978       int vlen_enc = vector_length_encoding(this);
 7979       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7980     }
 7981   %}
 7982   ins_pipe( pipe_slow );
 7983 %}
 7984 
 7985 instruct vabsS_reg(vec dst, vec src) %{
 7986   match(Set dst (AbsVS  src));
 7987   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7988   ins_encode %{
 7989     uint vlen = Matcher::vector_length(this);
 7990     if (vlen <= 8) {
 7991       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7992     } else {
 7993       int vlen_enc = vector_length_encoding(this);
 7994       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7995     }
 7996   %}
 7997   ins_pipe( pipe_slow );
 7998 %}
 7999 
 8000 instruct vabsI_reg(vec dst, vec src) %{
 8001   match(Set dst (AbsVI  src));
 8002   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 8003   ins_encode %{
 8004     uint vlen = Matcher::vector_length(this);
 8005     if (vlen <= 4) {
 8006       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8007     } else {
 8008       int vlen_enc = vector_length_encoding(this);
 8009       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8010     }
 8011   %}
 8012   ins_pipe( pipe_slow );
 8013 %}
 8014 
 8015 instruct vabsL_reg(vec dst, vec src) %{
 8016   match(Set dst (AbsVL  src));
 8017   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 8018   ins_encode %{
 8019     assert(UseAVX > 2, "required");
 8020     int vlen_enc = vector_length_encoding(this);
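          // evpabsq requires AVX512VL for the 128/256-bit encodings; without
          // VL, fall back to the full 512-bit encoding (the upper lanes are
          // don't-care here).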
 8021     if (!VM_Version::supports_avx512vl()) {
 8022       vlen_enc = Assembler::AVX_512bit;
 8023     }
 8024     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8025   %}
 8026   ins_pipe( pipe_slow );
 8027 %}
 8028 
 8029 // --------------------------------- ABSNEG --------------------------------------
 8030 
 8031 instruct vabsnegF(vec dst, vec src) %{
 8032   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8033   match(Set dst (AbsVF src));
 8034   match(Set dst (NegVF src));
 8035   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8036   ins_cost(150);
 8037   ins_encode %{
 8038     int opcode = this->ideal_Opcode();
 8039     int vlen = Matcher::vector_length(this);
 8040     if (vlen == 2) {
 8041       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8042     } else {
 8043       assert(vlen == 8 || vlen == 16, "required");
 8044       int vlen_enc = vector_length_encoding(this);
 8045       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8046     }
 8047   %}
 8048   ins_pipe( pipe_slow );
 8049 %}
 8050 
 8051 instruct vabsneg4F(vec dst) %{
 8052   predicate(Matcher::vector_length(n) == 4);
 8053   match(Set dst (AbsVF dst));
 8054   match(Set dst (NegVF dst));
 8055   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8056   ins_cost(150);
 8057   ins_encode %{
 8058     int opcode = this->ideal_Opcode();
 8059     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8060   %}
 8061   ins_pipe( pipe_slow );
 8062 %}
 8063 
 8064 instruct vabsnegD(vec dst, vec src) %{
 8065   match(Set dst (AbsVD  src));
 8066   match(Set dst (NegVD  src));
 8067   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8068   ins_encode %{
 8069     int opcode = this->ideal_Opcode();
 8070     uint vlen = Matcher::vector_length(this);
 8071     if (vlen == 2) {
 8072       assert(UseSSE >= 2, "required");
 8073       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8074     } else {
 8075       int vlen_enc = vector_length_encoding(this);
 8076       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8077     }
 8078   %}
 8079   ins_pipe( pipe_slow );
 8080 %}
 8081 
 8082 //------------------------------------- VectorTest --------------------------------------------
 8083 
 8084 #ifdef _LP64
 8085 instruct vptest_lt16(rFlagsRegU cr, legVec src1, legVec src2, legVec vtmp) %{
 8086   predicate(Matcher::vector_length_in_bytes(n->in(1)) < 16);
 8087   match(Set cr (VectorTest src1 src2));
 8088   effect(TEMP vtmp);
 8089   format %{ "vptest_lt16  $src1, $src2\t! using $vtmp as TEMP" %}
 8090   ins_encode %{
 8091     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8092     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8093     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister, vlen);
 8094   %}
 8095   ins_pipe( pipe_slow );
 8096 %}
 8097 
 8098 instruct vptest_ge16(rFlagsRegU cr, legVec src1, legVec src2) %{
 8099   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16);
 8100   match(Set cr (VectorTest src1 src2));
 8101   format %{ "vptest_ge16  $src1, $src2\n\t" %}
 8102   ins_encode %{
 8103     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
 8104     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8105     __ vectortest(bt, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, vlen);
 8106   %}
 8107   ins_pipe( pipe_slow );
 8108 %}
 8109 
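      // Mask tests on fewer than 8 meaningful bits (or exactly 8 without
      // AVX512DQ's byte-wide kortestb) move the mask into a GPR, truncate it
      // to masklen bits, and set the flags there; wider masks use kortest
      // directly.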
 8110 instruct ktest_alltrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8111   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8112              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8113             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8114   match(Set cr (VectorTest src1 src2));
 8115   effect(TEMP tmp);
 8116   format %{ "ktest_alltrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8117   ins_encode %{
 8118     uint masklen = Matcher::vector_length(this, $src1);
 8119     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8120     __ andl($tmp$$Register, (1 << masklen) - 1);
 8121     __ cmpl($tmp$$Register, (1 << masklen) - 1);
 8122   %}
 8123   ins_pipe( pipe_slow );
 8124 %}
 8125 
 8126 instruct ktest_anytrue_le8(rFlagsRegU cr, kReg src1, kReg src2, rRegI tmp) %{
 8127   predicate((Matcher::vector_length(n->in(1)) < 8 ||
 8128              (Matcher::vector_length(n->in(1)) == 8 && !VM_Version::supports_avx512dq())) &&
 8129             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8130   match(Set cr (VectorTest src1 src2));
 8131   effect(TEMP tmp);
 8132   format %{ "ktest_anytrue_le8  $src1, $src2\t! using $tmp as TEMP" %}
 8133   ins_encode %{
 8134     uint masklen = Matcher::vector_length(this, $src1);
 8135     __ kmovwl($tmp$$Register, $src1$$KRegister);
 8136     __ andl($tmp$$Register, (1 << masklen) - 1);
 8137   %}
 8138   ins_pipe( pipe_slow );
 8139 %}
 8140 
 8141 instruct ktest_ge8(rFlagsRegU cr, kReg src1, kReg src2) %{
 8142   predicate(Matcher::vector_length(n->in(1)) >= 16 ||
 8143             (Matcher::vector_length(n->in(1)) == 8 && VM_Version::supports_avx512dq()));
 8144   match(Set cr (VectorTest src1 src2));
 8145   format %{ "ktest_ge8  $src1, $src2\n\t" %}
 8146   ins_encode %{
 8147     uint masklen = Matcher::vector_length(this, $src1);
 8148     __ kortest(masklen, $src1$$KRegister, $src1$$KRegister);
 8149   %}
 8150   ins_pipe( pipe_slow );
 8151 %}
 8152 #endif
 8153 
 8154 //------------------------------------- LoadMask --------------------------------------------
 8155 
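      // VectorLoadMask converts a vector of boolean bytes (0/1 per element)
      // into a lane-sized mask, all-ones for true and zero for false, either
      // as a vector or, for TypeVectMask users, directly as a k register.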
 8156 instruct loadMask(legVec dst, legVec src) %{
 8157   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
 8158   match(Set dst (VectorLoadMask src));
 8159   effect(TEMP dst);
 8160   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8161   ins_encode %{
 8162     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8163     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8164     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8165   %}
 8166   ins_pipe( pipe_slow );
 8167 %}
 8168 
 8169 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8170   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8171   match(Set dst (VectorLoadMask src));
 8172   effect(TEMP xtmp);
 8173   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8174   ins_encode %{
 8175     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8176                         true, Assembler::AVX_512bit);
 8177   %}
 8178   ins_pipe( pipe_slow );
 8179 %}
 8180 
 8181 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8182   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8183   match(Set dst (VectorLoadMask src));
 8184   effect(TEMP xtmp);
 8185   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8186   ins_encode %{
 8187     int vlen_enc = vector_length_encoding(in(1));
 8188     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8189                         false, vlen_enc);
 8190   %}
 8191   ins_pipe( pipe_slow );
 8192 %}
 8193 
 8194 //------------------------------------- StoreMask --------------------------------------------
 8195 
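      // VectorStoreMask is the inverse of VectorLoadMask: lane masks (0/-1)
      // are narrowed to bytes and normalized to boolean 0/1 values; the
      // trailing pabsb/vpabsb maps -1 lanes to 1.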
 8196 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8197   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8198   match(Set dst (VectorStoreMask src size));
 8199   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8200   ins_encode %{
 8201     int vlen = Matcher::vector_length(this);
 8202     if (vlen <= 16 && UseAVX <= 2) {
 8203       assert(UseSSE >= 3, "required");
 8204       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8205     } else {
 8206       assert(UseAVX > 0, "required");
 8207       int src_vlen_enc = vector_length_encoding(this, $src);
 8208       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8209     }
 8210   %}
 8211   ins_pipe( pipe_slow );
 8212 %}
 8213 
 8214 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8215   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8216   match(Set dst (VectorStoreMask src size));
 8217   effect(TEMP_DEF dst, TEMP xtmp);
 8218   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8219   ins_encode %{
 8220     int vlen_enc = Assembler::AVX_128bit;
 8221     int vlen = Matcher::vector_length(this);
 8222     if (vlen <= 8) {
 8223       assert(UseSSE >= 3, "required");
 8224       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8225       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8226       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8227     } else {
 8228       assert(UseAVX > 0, "required");
 8229       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8230       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8231       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8232     }
 8233   %}
 8234   ins_pipe( pipe_slow );
 8235 %}
 8236 
 8237 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8238   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8239   match(Set dst (VectorStoreMask src size));
 8240   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8241   effect(TEMP_DEF dst, TEMP xtmp);
 8242   ins_encode %{
 8243     int vlen_enc = Assembler::AVX_128bit;
 8244     int vlen = Matcher::vector_length(this);
 8245     if (vlen <= 4) {
 8246       assert(UseSSE >= 3, "required");
 8247       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8248       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8249       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8250       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8251     } else {
 8252       assert(UseAVX > 0, "required");
 8253       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8254       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8255       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8256       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8257       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8258     }
 8259   %}
 8260   ins_pipe( pipe_slow );
 8261 %}
 8262 
 8263 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8264   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8265   match(Set dst (VectorStoreMask src size));
 8266   effect(TEMP_DEF dst, TEMP xtmp);
 8267   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8268   ins_encode %{
 8269     assert(UseSSE >= 3, "required");
 8270     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8271     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8272     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8273     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8274     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8275   %}
 8276   ins_pipe( pipe_slow );
 8277 %}
 8278 
 8279 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8280   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8281   match(Set dst (VectorStoreMask src size));
 8282   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8283   effect(TEMP_DEF dst, TEMP vtmp);
 8284   ins_encode %{
 8285     int vlen_enc = Assembler::AVX_128bit;
 8286     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8287     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8288     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8289     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8290     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8291     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8292     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8293   %}
 8294   ins_pipe( pipe_slow );
 8295 %}
 8296 
 8297 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8298   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8299   match(Set dst (VectorStoreMask src size));
 8300   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8301   ins_encode %{
 8302     int src_vlen_enc = vector_length_encoding(this, $src);
 8303     int dst_vlen_enc = vector_length_encoding(this);
 8304     if (!VM_Version::supports_avx512vl()) {
 8305       src_vlen_enc = Assembler::AVX_512bit;
 8306     }
 8307     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8308     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8309   %}
 8310   ins_pipe( pipe_slow );
 8311 %}
 8312 
 8313 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8314   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8315   match(Set dst (VectorStoreMask src size));
 8316   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8317   ins_encode %{
 8318     int src_vlen_enc = vector_length_encoding(this, $src);
 8319     int dst_vlen_enc = vector_length_encoding(this);
 8320     if (!VM_Version::supports_avx512vl()) {
 8321       src_vlen_enc = Assembler::AVX_512bit;
 8322     }
 8323     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8324     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8325   %}
 8326   ins_pipe( pipe_slow );
 8327 %}
 8328 
 8329 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8330   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8331   match(Set dst (VectorStoreMask mask size));
 8332   effect(TEMP_DEF dst);
 8333   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8334   ins_encode %{
 8335     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8336     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8337                  false, Assembler::AVX_512bit, noreg);
 8338     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8339   %}
 8340   ins_pipe( pipe_slow );
 8341 %}
 8342 
 8343 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8344   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8345   match(Set dst (VectorStoreMask mask size));
 8346   effect(TEMP_DEF dst);
 8347   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8348   ins_encode %{
 8349     int dst_vlen_enc = vector_length_encoding(this);
 8350     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8351     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8352   %}
 8353   ins_pipe( pipe_slow );
 8354 %}
 8355 
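      // Mask casts that keep the representation (k-register casts and
      // same-size vector casts) are pure re-typings and emit no code
      // (ins_cost(0)); only size-changing vector casts (vmaskcast_avx below)
      // have to re-encode the lanes.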
 8356 instruct vmaskcast_evex(kReg dst) %{
 8357   match(Set dst (VectorMaskCast dst));
 8358   ins_cost(0);
 8359   format %{ "vector_mask_cast $dst" %}
 8360   ins_encode %{
 8361     // empty
 8362   %}
 8363   ins_pipe(empty);
 8364 %}
 8365 
 8366 instruct vmaskcast(vec dst) %{
 8367   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
 8368   match(Set dst (VectorMaskCast dst));
 8369   ins_cost(0);
 8370   format %{ "vector_mask_cast $dst" %}
 8371   ins_encode %{
 8372     // empty
 8373   %}
 8374   ins_pipe(empty);
 8375 %}
 8376 
 8377 instruct vmaskcast_avx(vec dst, vec src) %{
 8378   predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)));
 8379   match(Set dst (VectorMaskCast src));
 8380   format %{ "vector_mask_cast $dst, $src" %}
 8381   ins_encode %{
 8382     int vlen = Matcher::vector_length(this);
 8383     BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
 8384     BasicType dst_bt = Matcher::vector_element_basic_type(this);
 8385     __ vector_mask_cast($dst$$XMMRegister, $src$$XMMRegister, dst_bt, src_bt, vlen);
 8386   %}
 8387   ins_pipe(pipe_slow);
 8388 %}
 8389 
 8390 //-------------------------------- Load Iota Indices ----------------------------------
 8391 
 8392 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8393   match(Set dst (VectorLoadConst src));
 8394   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8395   ins_encode %{
 8396     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8397     BasicType bt = Matcher::vector_element_basic_type(this);
 8398     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
 8399   %}
 8400   ins_pipe( pipe_slow );
 8401 %}
 8402 
 8403 #ifdef _LP64
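      // PopulateIndex produces {src1, src1+1, src1+2, ...}: broadcast the
      // scalar, load the iota constant sequence, and add. Only a stride of 1
      // is supported, as the asserts below check.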
 8404 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8405   match(Set dst (PopulateIndex src1 src2));
 8406   effect(TEMP dst, TEMP vtmp);
 8407   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8408   ins_encode %{
 8409     assert($src2$$constant == 1, "required");
 8410     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8411     int vlen_enc = vector_length_encoding(this);
 8412     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8413     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8414     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8415     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8416   %}
 8417   ins_pipe( pipe_slow );
 8418 %}
 8419 
 8420 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8421   match(Set dst (PopulateIndex src1 src2));
 8422   effect(TEMP dst, TEMP vtmp);
 8423   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8424   ins_encode %{
 8425     assert($src2$$constant == 1, "required");
 8426     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8427     int vlen_enc = vector_length_encoding(this);
 8428     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8429     __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8430     __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
 8431     __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8432   %}
 8433   ins_pipe( pipe_slow );
 8434 %}
 8435 #endif

 8436 //-------------------------------- Rearrange ----------------------------------
 8437 
 8438 // LoadShuffle/Rearrange for Byte
 8439 
 8440 instruct loadShuffleB(vec dst) %{
 8441   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8442   match(Set dst (VectorLoadShuffle dst));
 8443   format %{ "vector_load_shuffle $dst, $dst" %}
 8444   ins_encode %{
 8445     // empty
 8446   %}
 8447   ins_pipe( pipe_slow );
 8448 %}
 8449 
 8450 instruct rearrangeB(vec dst, vec shuffle) %{
 8451   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8452             Matcher::vector_length(n) < 32);
 8453   match(Set dst (VectorRearrange dst shuffle));
 8454   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8455   ins_encode %{
 8456     assert(UseSSE >= 4, "required");
 8457     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8458   %}
 8459   ins_pipe( pipe_slow );
 8460 %}
 8461 
 8462 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8463   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8464             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8465   match(Set dst (VectorRearrange src shuffle));
 8466   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8467   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8468   ins_encode %{
 8469     assert(UseAVX >= 2, "required");
 8470     // Swap src into vtmp1
 8471     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8472     // Shuffle swapped src to get entries from other 128 bit lane
 8473     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8474     // Shuffle original src to get entries from self 128 bit lane
 8475     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8476     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8477     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8478     // Perform the blend
 8479     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8480   %}
 8481   ins_pipe( pipe_slow );
 8482 %}
 8483 
 8484 
 8485 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8486   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8487             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8488   match(Set dst (VectorRearrange src shuffle));
 8489   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 8490   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8491   ins_encode %{
 8492     int vlen_enc = vector_length_encoding(this);
 8493     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8494                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8495                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8496   %}
 8497   ins_pipe( pipe_slow );
 8498 %}
 8499 
 8500 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8501   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8502             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8503   match(Set dst (VectorRearrange src shuffle));
 8504   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8505   ins_encode %{
 8506     int vlen_enc = vector_length_encoding(this);
 8507     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8508   %}
 8509   ins_pipe( pipe_slow );
 8510 %}
 8511 
 8512 // LoadShuffle/Rearrange for Short
 8513 
 8514 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8515   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8516             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8517   match(Set dst (VectorLoadShuffle src));
 8518   effect(TEMP dst, TEMP vtmp);
 8519   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8520   ins_encode %{
 8521     // Create a byte shuffle mask from the short shuffle mask;
 8522     // only a byte shuffle instruction is available on these platforms.
 8523     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8524     if (UseAVX == 0) {
 8525       assert(vlen_in_bytes <= 16, "required");
 8526       // Multiply each shuffle by two to get byte index
 8527       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8528       __ psllw($vtmp$$XMMRegister, 1);
 8529 
 8530       // Duplicate to create 2 copies of byte index
 8531       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8532       __ psllw($dst$$XMMRegister, 8);
 8533       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8534 
 8535       // Add one to get alternate byte index
 8536       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8537       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8538     } else {
 8539       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8540       int vlen_enc = vector_length_encoding(this);
 8541       // Multiply each shuffle by two to get byte index
 8542       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8543       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8544 
 8545       // Duplicate to create 2 copies of byte index
 8546       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8547       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8548 
 8549       // Add one to get alternate byte index
 8550       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8551     }
 8552   %}
 8553   ins_pipe( pipe_slow );
 8554 %}
 8555 
 8556 instruct rearrangeS(vec dst, vec shuffle) %{
 8557   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8558             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8559   match(Set dst (VectorRearrange dst shuffle));
 8560   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8561   ins_encode %{
 8562     assert(UseSSE >= 4, "required");
 8563     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8564   %}
 8565   ins_pipe( pipe_slow );
 8566 %}
 8567 
 8568 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8569   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8570             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8571   match(Set dst (VectorRearrange src shuffle));
 8572   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8573   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8574   ins_encode %{
 8575     assert(UseAVX >= 2, "required");
 8576     // Swap src into vtmp1
 8577     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8578     // Shuffle swapped src to get entries from other 128 bit lane
 8579     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8580     // Shuffle original src to get entries from self 128 bit lane
 8581     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8582     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8583     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8584     // Perform the blend
 8585     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8586   %}
 8587   ins_pipe( pipe_slow );
 8588 %}
 8589 
 8590 instruct loadShuffleS_evex(vec dst, vec src) %{
 8591   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8592             VM_Version::supports_avx512bw());
 8593   match(Set dst (VectorLoadShuffle src));
 8594   format %{ "vector_load_shuffle $dst, $src" %}
 8595   ins_encode %{
 8596     int vlen_enc = vector_length_encoding(this);
 8597     if (!VM_Version::supports_avx512vl()) {
 8598       vlen_enc = Assembler::AVX_512bit;
 8599     }
 8600     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8601   %}
 8602   ins_pipe( pipe_slow );
 8603 %}
 8604 
 8605 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8606   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8607             VM_Version::supports_avx512bw());
 8608   match(Set dst (VectorRearrange src shuffle));
 8609   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8610   ins_encode %{
 8611     int vlen_enc = vector_length_encoding(this);
 8612     if (!VM_Version::supports_avx512vl()) {
 8613       vlen_enc = Assembler::AVX_512bit;
 8614     }
 8615     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8616   %}
 8617   ins_pipe( pipe_slow );
 8618 %}
 8619 
 8620 // LoadShuffle/Rearrange for Integer and Float
 8621 
 8622 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8623   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8624             Matcher::vector_length(n) == 4 && UseAVX < 2);
 8625   match(Set dst (VectorLoadShuffle src));
 8626   effect(TEMP dst, TEMP vtmp);
 8627   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8628   ins_encode %{
 8629     assert(UseSSE >= 4, "required");
 8630 
 8631     // Create a byte shuffle mask from the int shuffle mask;
 8632     // only a byte shuffle instruction is available on these platforms.
 8633 
 8634     // Duplicate and multiply each shuffle by 4
 8635     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8636     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8637     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8638     __ psllw($vtmp$$XMMRegister, 2);
 8639 
 8640     // Duplicate again to create 4 copies of byte index
 8641     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8642     __ psllw($dst$$XMMRegister, 8);
 8643     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8644 
 8645     // Add 3,2,1,0 to get alternate byte index
 8646     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8647     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8648   %}
 8649   ins_pipe( pipe_slow );
 8650 %}
 8651 
 8652 instruct rearrangeI(vec dst, vec shuffle) %{
 8653  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8654            Matcher::vector_length(n) == 4 && UseAVX < 2);
 8655   match(Set dst (VectorRearrange dst shuffle));
 8656   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8657   ins_encode %{
 8658     assert(UseSSE >= 4, "required");
 8659     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8660   %}
 8661   ins_pipe( pipe_slow );
 8662 %}
 8663 
 8664 instruct loadShuffleI_avx(vec dst, vec src) %{
 8665   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8666             UseAVX >= 2);
 8667   match(Set dst (VectorLoadShuffle src));
 8668   format %{ "vector_load_shuffle $dst, $src" %}
 8669   ins_encode %{
 8670     int vlen_enc = vector_length_encoding(this);
 8671     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8672   %}
 8673   ins_pipe( pipe_slow );
 8674 %}
 8675 
 8676 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8677   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8678             UseAVX >= 2);
 8679   match(Set dst (VectorRearrange src shuffle));
 8680   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8681   ins_encode %{
 8682     int vlen_enc = vector_length_encoding(this);
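          // AVX2 vpermd only has a 256-bit encoding; for 128-bit vectors, use
          // the 256-bit form and treat the upper lanes as don't-care.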
 8683     if (vlen_enc == Assembler::AVX_128bit) {
 8684       vlen_enc = Assembler::AVX_256bit;
 8685     }
 8686     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8687   %}
 8688   ins_pipe( pipe_slow );
 8689 %}
 8690 
 8691 // LoadShuffle/Rearrange for Long and Double
 8692 
 8693 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8694   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8695             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8696   match(Set dst (VectorLoadShuffle src));
 8697   effect(TEMP dst, TEMP vtmp);
 8698   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8699   ins_encode %{
 8700     assert(UseAVX >= 2, "required");
 8701 
 8702     int vlen_enc = vector_length_encoding(this);
 8703     // Create a double word shuffle mask from the long shuffle mask;
 8704     // only a double word shuffle instruction is available on these platforms.
 8705 
 8706     // Multiply each shuffle by two to get double word index
 8707     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8708     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8709 
 8710     // Duplicate each double word shuffle
 8711     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8712     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8713 
 8714     // Add one to get alternate double word index
 8715     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8716   %}
 8717   ins_pipe( pipe_slow );
 8718 %}
 8719 
 8720 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8721   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8722             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8723   match(Set dst (VectorRearrange src shuffle));
 8724   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8725   ins_encode %{
 8726     assert(UseAVX >= 2, "required");
 8727 
 8728     int vlen_enc = vector_length_encoding(this);
 8729     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8730   %}
 8731   ins_pipe( pipe_slow );
 8732 %}
 8733 
 8734 instruct loadShuffleL_evex(vec dst, vec src) %{
 8735   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8736             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8737   match(Set dst (VectorLoadShuffle src));
 8738   format %{ "vector_load_shuffle $dst, $src" %}
 8739   ins_encode %{
 8740     assert(UseAVX > 2, "required");
 8741 
 8742     int vlen_enc = vector_length_encoding(this);
 8743     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8744   %}
 8745   ins_pipe( pipe_slow );
 8746 %}
 8747 
 8748 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8749   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8750             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8751   match(Set dst (VectorRearrange src shuffle));
 8752   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8753   ins_encode %{
 8754     assert(UseAVX > 2, "required");
 8755 
 8756     int vlen_enc = vector_length_encoding(this);
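          // vpermq with a vector selector has no 128-bit encoding, so widen
          // 128-bit requests to the 256-bit form (upper lanes are don't-care).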
 8757     if (vlen_enc == Assembler::AVX_128bit) {
 8758       vlen_enc = Assembler::AVX_256bit;
 8759     }
 8760     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8761   %}
 8762   ins_pipe( pipe_slow );
 8763 %}
 8764 
 8765 // --------------------------------- FMA --------------------------------------
 8766 // a * b + c
 8767 
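      // vfmaf/vfmad expand to fused multiply-adds (a single rounding step),
      // presumably the FMA3 vfmadd231ps/pd forms in the macro assembler.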
 8768 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8769   match(Set c (FmaVF  c (Binary a b)));
 8770   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8771   ins_cost(150);
 8772   ins_encode %{
 8773     assert(UseFMA, "not enabled");
 8774     int vlen_enc = vector_length_encoding(this);
 8775     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8776   %}
 8777   ins_pipe( pipe_slow );
 8778 %}
 8779 
 8780 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8781   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8782   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8783   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8784   ins_cost(150);
 8785   ins_encode %{
 8786     assert(UseFMA, "not enabled");
 8787     int vlen_enc = vector_length_encoding(this);
 8788     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8789   %}
 8790   ins_pipe( pipe_slow );
 8791 %}
 8792 
 8793 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8794   match(Set c (FmaVD  c (Binary a b)));
 8795   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8796   ins_cost(150);
 8797   ins_encode %{
 8798     assert(UseFMA, "not enabled");
 8799     int vlen_enc = vector_length_encoding(this);
 8800     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8801   %}
 8802   ins_pipe( pipe_slow );
 8803 %}
 8804 
 8805 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8806   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8807   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8808   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8809   ins_cost(150);
 8810   ins_encode %{
 8811     assert(UseFMA, "not enabled");
 8812     int vlen_enc = vector_length_encoding(this);
 8813     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8814   %}
 8815   ins_pipe( pipe_slow );
 8816 %}
 8817 
 8818 // --------------------------------- Vector Multiply Add --------------------------------------
 8819 
 8820 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8821   predicate(UseAVX == 0);
 8822   match(Set dst (MulAddVS2VI dst src1));
 8823   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8824   ins_encode %{
 8825     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8826   %}
 8827   ins_pipe( pipe_slow );
 8828 %}
 8829 
 8830 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8831   predicate(UseAVX > 0);
 8832   match(Set dst (MulAddVS2VI src1 src2));
 8833   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8834   ins_encode %{
 8835     int vlen_enc = vector_length_encoding(this);
 8836     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8837   %}
 8838   ins_pipe( pipe_slow );
 8839 %}
 8840 
 8841 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8842 
 8843 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8844   predicate(VM_Version::supports_avx512_vnni());
 8845   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8846   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8847   ins_encode %{
 8848     assert(UseAVX > 2, "required");
 8849     int vlen_enc = vector_length_encoding(this);
 8850     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8851   %}
  ins_cost(10);
  ins_pipe( pipe_slow );
 8854 %}
 8855 
 8856 // --------------------------------- PopCount --------------------------------------
 8857 
 8858 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8859   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8860   match(Set dst (PopCountVI src));
 8861   match(Set dst (PopCountVL src));
 8862   format %{ "vector_popcount_integral $dst, $src" %}
 8863   ins_encode %{
 8865     int vlen_enc = vector_length_encoding(this, $src);
 8866     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8867     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8868   %}
 8869   ins_pipe( pipe_slow );
 8870 %}
 8871 
 8872 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8873   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8874   match(Set dst (PopCountVI src mask));
 8875   match(Set dst (PopCountVL src mask));
 8876   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8877   ins_encode %{
 8878     int vlen_enc = vector_length_encoding(this, $src);
 8879     BasicType bt = Matcher::vector_element_basic_type(this, $src);
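    // Copy src first so lanes cleared in $mask pass through unchanged; the
    // popcount is then performed with merge-masking into $dst.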
 8880     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8881     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8882   %}
 8883   ins_pipe( pipe_slow );
 8884 %}
 8885 
 8886 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8887   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8888   match(Set dst (PopCountVI src));
 8889   match(Set dst (PopCountVL src));
 8890   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8891   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8892   ins_encode %{
 8894     int vlen_enc = vector_length_encoding(this, $src);
 8895     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8896     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8897                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8898   %}
 8899   ins_pipe( pipe_slow );
 8900 %}
 8901 
 8902 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8903 
 8904 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8905   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8906                                               Matcher::vector_length_in_bytes(n->in(1))));
 8907   match(Set dst (CountTrailingZerosV src));
 8908   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8909   ins_cost(400);
 8910   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
 8911   ins_encode %{
 8912     int vlen_enc = vector_length_encoding(this, $src);
 8913     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8914     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8915                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8916   %}
 8917   ins_pipe( pipe_slow );
 8918 %}
 8919 
 8920 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8921   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8922             VM_Version::supports_avx512cd() &&
 8923             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8924   match(Set dst (CountTrailingZerosV src));
 8925   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8926   ins_cost(400);
 8927   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8928   ins_encode %{
 8929     int vlen_enc = vector_length_encoding(this, $src);
 8930     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8931     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8932                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8933   %}
 8934   ins_pipe( pipe_slow );
 8935 %}
 8936 
 8937 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8938   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8939   match(Set dst (CountTrailingZerosV src));
 8940   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8941   ins_cost(400);
 8942   format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8943   ins_encode %{
 8944     int vlen_enc = vector_length_encoding(this, $src);
 8945     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8946     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8947                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8948                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8949   %}
 8950   ins_pipe( pipe_slow );
 8951 %}
 8952 
 8953 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8954   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8955   match(Set dst (CountTrailingZerosV src));
 8956   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8957   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8958   ins_encode %{
 8959     int vlen_enc = vector_length_encoding(this, $src);
 8960     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8961     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8962                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 8963   %}
 8964   ins_pipe( pipe_slow );
 8965 %}
 8966 
 8968 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8969 
 8970 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8971   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8972   effect(TEMP dst);
 8973   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8974   ins_encode %{
 8975     int vector_len = vector_length_encoding(this);
 8976     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8977   %}
 8978   ins_pipe( pipe_slow );
 8979 %}
 8980 
 8981 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8982   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8983   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8984   effect(TEMP dst);
 8985   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8986   ins_encode %{
 8987     int vector_len = vector_length_encoding(this);
 8988     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8989   %}
 8990   ins_pipe( pipe_slow );
 8991 %}
 8992 
 8993 // --------------------------------- Rotation Operations ----------------------------------
 8994 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8995   match(Set dst (RotateLeftV src shift));
 8996   match(Set dst (RotateRightV src shift));
 8997   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8998   ins_encode %{
 8999     int opcode      = this->ideal_Opcode();
 9000     int vector_len  = vector_length_encoding(this);
 9001     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9002     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9003   %}
 9004   ins_pipe( pipe_slow );
 9005 %}
 9006 
instruct vprotate_var(vec dst, vec src, vec shift) %{
 9008   match(Set dst (RotateLeftV src shift));
 9009   match(Set dst (RotateRightV src shift));
 9010   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9011   ins_encode %{
 9012     int opcode      = this->ideal_Opcode();
 9013     int vector_len  = vector_length_encoding(this);
 9014     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9015     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9016   %}
 9017   ins_pipe( pipe_slow );
 9018 %}
 9019 
 9020 // ---------------------------------- Masked Operations ------------------------------------
 9021 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9022   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9023   match(Set dst (LoadVectorMasked mem mask));
 9024   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9025   ins_encode %{
 9026     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9027     int vlen_enc = vector_length_encoding(this);
 9028     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9029   %}
 9030   ins_pipe( pipe_slow );
 9031 %}
 9032 
 9034 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9035   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9036   match(Set dst (LoadVectorMasked mem mask));
 9037   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
 9038   ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9040     int vector_len = vector_length_encoding(this);
 9041     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9042   %}
 9043   ins_pipe( pipe_slow );
 9044 %}
 9045 
 9046 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9047   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9048   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9049   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9050   ins_encode %{
 9051     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9052     int vlen_enc = vector_length_encoding(src_node);
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 9054     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9055   %}
 9056   ins_pipe( pipe_slow );
 9057 %}
 9058 
 9059 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9060   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9061   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9062   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9063   ins_encode %{
 9064     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 9066     int vlen_enc = vector_length_encoding(src_node);
 9067     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9068   %}
 9069   ins_pipe( pipe_slow );
 9070 %}
 9071 
 9072 #ifdef _LP64
 9073 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9074   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9075   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9076   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9077   ins_encode %{
 9078     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9079     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9080 
 9081     Label DONE;
 9082     int vlen_enc = vector_length_encoding(this, $src1);
 9083     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 9084 
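    // $ktmp1 = masked lanes where src1 == src2, $ktmp2 = lanes outside the
    // mask. If their union is all ones (kortest sets CF), every masked lane
    // matched and the result stays -1; otherwise return the index of the
    // first mismatching lane.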
 9085     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9086     __ mov64($dst$$Register, -1L);
 9087     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9088     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9089     __ jccb(Assembler::carrySet, DONE);
 9090     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9091     __ notq($dst$$Register);
 9092     __ tzcntq($dst$$Register, $dst$$Register);
 9093     __ bind(DONE);
 9094   %}
 9095   ins_pipe( pipe_slow );
 9096 %}
 9097 
 9099 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
 9100   match(Set dst (VectorMaskGen len));
 9101   effect(TEMP temp);
 9102   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9103   ins_encode %{
 9104     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9105   %}
 9106   ins_pipe( pipe_slow );
 9107 %}
 9108 
 9109 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9110   match(Set dst (VectorMaskGen len));
 9111   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9112   effect(TEMP temp);
 9113   ins_encode %{
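    // Materialize a mask with the low $len bits set: all-ones >> (64 - len).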
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9115     __ kmovql($dst$$KRegister, $temp$$Register);
 9116   %}
 9117   ins_pipe( pipe_slow );
 9118 %}
 9119 
 9120 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9121   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9122   match(Set dst (VectorMaskToLong mask));
 9123   effect(TEMP dst, KILL cr);
 9124   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9125   ins_encode %{
 9126     int opcode = this->ideal_Opcode();
 9127     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9128     int mask_len = Matcher::vector_length(this, $mask);
 9129     int mask_size = mask_len * type2aelembytes(mbt);
 9130     int vlen_enc = vector_length_encoding(this, $mask);
 9131     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9132                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9133   %}
 9134   ins_pipe( pipe_slow );
 9135 %}
 9136 
 9137 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9138   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9139   match(Set dst (VectorMaskToLong mask));
 9140   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9141   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9142   ins_encode %{
 9143     int opcode = this->ideal_Opcode();
 9144     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9145     int mask_len = Matcher::vector_length(this, $mask);
 9146     int vlen_enc = vector_length_encoding(this, $mask);
 9147     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9148                              $dst$$Register, mask_len, mbt, vlen_enc);
 9149   %}
 9150   ins_pipe( pipe_slow );
 9151 %}
 9152 
 9153 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9154   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9155   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9156   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9157   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9158   ins_encode %{
 9159     int opcode = this->ideal_Opcode();
 9160     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9161     int mask_len = Matcher::vector_length(this, $mask);
 9162     int vlen_enc = vector_length_encoding(this, $mask);
 9163     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9164                              $dst$$Register, mask_len, mbt, vlen_enc);
 9165   %}
 9166   ins_pipe( pipe_slow );
 9167 %}
 9168 
 9169 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9170   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9171   match(Set dst (VectorMaskTrueCount mask));
 9172   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9173   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9174   ins_encode %{
 9175     int opcode = this->ideal_Opcode();
 9176     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9177     int mask_len = Matcher::vector_length(this, $mask);
 9178     int mask_size = mask_len * type2aelembytes(mbt);
 9179     int vlen_enc = vector_length_encoding(this, $mask);
 9180     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9181                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9182   %}
 9183   ins_pipe( pipe_slow );
 9184 %}
 9185 
 9186 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9187   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9188   match(Set dst (VectorMaskTrueCount mask));
 9189   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9190   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9191   ins_encode %{
 9192     int opcode = this->ideal_Opcode();
 9193     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9194     int mask_len = Matcher::vector_length(this, $mask);
 9195     int vlen_enc = vector_length_encoding(this, $mask);
 9196     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9197                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9198   %}
 9199   ins_pipe( pipe_slow );
 9200 %}
 9201 
 9202 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9203   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9204   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9205   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9206   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9207   ins_encode %{
 9208     int opcode = this->ideal_Opcode();
 9209     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9210     int mask_len = Matcher::vector_length(this, $mask);
 9211     int vlen_enc = vector_length_encoding(this, $mask);
 9212     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9213                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9214   %}
 9215   ins_pipe( pipe_slow );
 9216 %}
 9217 
 9218 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9219   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9220   match(Set dst (VectorMaskFirstTrue mask));
 9221   match(Set dst (VectorMaskLastTrue mask));
 9222   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9223   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9224   ins_encode %{
 9225     int opcode = this->ideal_Opcode();
 9226     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9227     int mask_len = Matcher::vector_length(this, $mask);
 9228     int mask_size = mask_len * type2aelembytes(mbt);
 9229     int vlen_enc = vector_length_encoding(this, $mask);
 9230     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9231                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9232   %}
 9233   ins_pipe( pipe_slow );
 9234 %}
 9235 
 9236 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9237   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9238   match(Set dst (VectorMaskFirstTrue mask));
 9239   match(Set dst (VectorMaskLastTrue mask));
 9240   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9241   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9242   ins_encode %{
 9243     int opcode = this->ideal_Opcode();
 9244     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9245     int mask_len = Matcher::vector_length(this, $mask);
 9246     int vlen_enc = vector_length_encoding(this, $mask);
 9247     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9248                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9249   %}
 9250   ins_pipe( pipe_slow );
 9251 %}
 9252 
 9253 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9254   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9255   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9256   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9257   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9258   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9259   ins_encode %{
 9260     int opcode = this->ideal_Opcode();
 9261     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9262     int mask_len = Matcher::vector_length(this, $mask);
 9263     int vlen_enc = vector_length_encoding(this, $mask);
 9264     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9265                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9266   %}
 9267   ins_pipe( pipe_slow );
 9268 %}
 9269 
 9270 // --------------------------------- Compress/Expand Operations ---------------------------
 9271 
 9272 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9273   match(Set dst (CompressV src mask));
 9274   match(Set dst (ExpandV src mask));
 9275   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9276   ins_encode %{
 9277     int opcode = this->ideal_Opcode();
 9278     int vector_len = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
 9280     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9281   %}
 9282   ins_pipe( pipe_slow );
 9283 %}
 9284 
 9285 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9286   match(Set dst (CompressM mask));
 9287   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9288   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9289   ins_encode %{
    assert(this->in(1)->bottom_type()->isa_vectmask(), "expected vector mask type");
 9291     int mask_len = Matcher::vector_length(this);
 9292     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9293   %}
 9294   ins_pipe( pipe_slow );
 9295 %}
 9296 
 9297 #endif // _LP64
 9298 
 9299 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9300 
 9301 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9302   predicate(!VM_Version::supports_gfni());
 9303   match(Set dst (ReverseV src));
 9304   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9305   format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9306   ins_encode %{
 9307     int vec_enc = vector_length_encoding(this);
 9308     BasicType bt = Matcher::vector_element_basic_type(this);
 9309     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9310                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9311   %}
 9312   ins_pipe( pipe_slow );
 9313 %}
 9314 
 9315 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9316   predicate(VM_Version::supports_gfni());
 9317   match(Set dst (ReverseV src));
 9318   effect(TEMP dst, TEMP xtmp);
 9319   format %{ "vector_reverse_bit_gfni $dst, $src!\t using $xtmp as TEMP" %}
 9320   ins_encode %{
 9321     int vec_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
 9323     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9324     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9325                                $xtmp$$XMMRegister);
 9326   %}
 9327   ins_pipe( pipe_slow );
 9328 %}
 9329 
 9330 instruct vreverse_byte_reg(vec dst, vec src) %{
 9331   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9332   match(Set dst (ReverseBytesV src));
 9333   effect(TEMP dst);
 9334   format %{ "vector_reverse_byte $dst, $src" %}
 9335   ins_encode %{
 9336     int vec_enc = vector_length_encoding(this);
 9337     BasicType bt = Matcher::vector_element_basic_type(this);
 9338     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9339   %}
 9340   ins_pipe( pipe_slow );
 9341 %}
 9342 
 9343 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9344   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9345   match(Set dst (ReverseBytesV src));
 9346   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9347   format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9348   ins_encode %{
 9349     int vec_enc = vector_length_encoding(this);
 9350     BasicType bt = Matcher::vector_element_basic_type(this);
 9351     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9352                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9353   %}
 9354   ins_pipe( pipe_slow );
 9355 %}
 9356 
 9357 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9358 
 9359 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9360   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9361                                               Matcher::vector_length_in_bytes(n->in(1))));
 9362   match(Set dst (CountLeadingZerosV src));
 9363   format %{ "vector_count_leading_zeros $dst, $src" %}
 9364   ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                       xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9369   %}
 9370   ins_pipe( pipe_slow );
 9371 %}
 9372 
 9373 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9374   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9375                                               Matcher::vector_length_in_bytes(n->in(1))));
 9376   match(Set dst (CountLeadingZerosV src mask));
 9377   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9378   ins_encode %{
 9379     int vlen_enc = vector_length_encoding(this, $src);
 9380     BasicType bt = Matcher::vector_element_basic_type(this, $src);
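    // Copy src first so lanes cleared in $mask pass through unchanged.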
 9381     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9382     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9383                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9384   %}
 9385   ins_pipe( pipe_slow );
 9386 %}
 9387 
 9388 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9389   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9390             VM_Version::supports_avx512cd() &&
 9391             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9392   match(Set dst (CountLeadingZerosV src));
 9393   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9394   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
 9395   ins_encode %{
 9396     int vlen_enc = vector_length_encoding(this, $src);
 9397     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9398     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9399                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9400   %}
 9401   ins_pipe( pipe_slow );
 9402 %}
 9403 
 9404 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9405   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9406   match(Set dst (CountLeadingZerosV src));
 9407   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9408   format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9409   ins_encode %{
 9410     int vlen_enc = vector_length_encoding(this, $src);
 9411     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9412     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9413                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9414                                        $rtmp$$Register, true, vlen_enc);
 9415   %}
 9416   ins_pipe( pipe_slow );
 9417 %}
 9418 
 9419 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9420   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9421             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9422   match(Set dst (CountLeadingZerosV src));
 9423   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9424   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9425   ins_encode %{
 9426     int vlen_enc = vector_length_encoding(this, $src);
 9427     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9428     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9429                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9430   %}
 9431   ins_pipe( pipe_slow );
 9432 %}
 9433 
 9434 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9435   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9436             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9437   match(Set dst (CountLeadingZerosV src));
 9438   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9439   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9440   ins_encode %{
 9441     int vlen_enc = vector_length_encoding(this, $src);
 9442     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9443     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9444                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9445   %}
 9446   ins_pipe( pipe_slow );
 9447 %}
 9448 
 9449 // ---------------------------------- Vector Masked Operations ------------------------------------
 9450 
 9451 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9452   match(Set dst (AddVB (Binary dst src2) mask));
 9453   match(Set dst (AddVS (Binary dst src2) mask));
 9454   match(Set dst (AddVI (Binary dst src2) mask));
 9455   match(Set dst (AddVL (Binary dst src2) mask));
 9456   match(Set dst (AddVF (Binary dst src2) mask));
 9457   match(Set dst (AddVD (Binary dst src2) mask));
 9458   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9459   ins_encode %{
 9460     int vlen_enc = vector_length_encoding(this);
 9461     BasicType bt = Matcher::vector_element_basic_type(this);
 9462     int opc = this->ideal_Opcode();
 9463     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9464                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9465   %}
 9466   ins_pipe( pipe_slow );
 9467 %}
 9468 
 9469 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9470   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9471   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9472   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9473   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9474   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9475   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9476   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9477   ins_encode %{
 9478     int vlen_enc = vector_length_encoding(this);
 9479     BasicType bt = Matcher::vector_element_basic_type(this);
 9480     int opc = this->ideal_Opcode();
 9481     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9482                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9483   %}
 9484   ins_pipe( pipe_slow );
 9485 %}
 9486 
 9487 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9488   match(Set dst (XorV (Binary dst src2) mask));
 9489   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9490   ins_encode %{
 9491     int vlen_enc = vector_length_encoding(this);
 9492     BasicType bt = Matcher::vector_element_basic_type(this);
 9493     int opc = this->ideal_Opcode();
 9494     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9495                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9496   %}
 9497   ins_pipe( pipe_slow );
 9498 %}
 9499 
 9500 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9501   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9502   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9503   ins_encode %{
 9504     int vlen_enc = vector_length_encoding(this);
 9505     BasicType bt = Matcher::vector_element_basic_type(this);
 9506     int opc = this->ideal_Opcode();
 9507     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9508                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9509   %}
 9510   ins_pipe( pipe_slow );
 9511 %}
 9512 
 9513 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9514   match(Set dst (OrV (Binary dst src2) mask));
 9515   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9516   ins_encode %{
 9517     int vlen_enc = vector_length_encoding(this);
 9518     BasicType bt = Matcher::vector_element_basic_type(this);
 9519     int opc = this->ideal_Opcode();
 9520     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9521                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9522   %}
 9523   ins_pipe( pipe_slow );
 9524 %}
 9525 
 9526 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9527   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9528   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9529   ins_encode %{
 9530     int vlen_enc = vector_length_encoding(this);
 9531     BasicType bt = Matcher::vector_element_basic_type(this);
 9532     int opc = this->ideal_Opcode();
 9533     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9534                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9535   %}
 9536   ins_pipe( pipe_slow );
 9537 %}
 9538 
 9539 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9540   match(Set dst (AndV (Binary dst src2) mask));
 9541   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9542   ins_encode %{
 9543     int vlen_enc = vector_length_encoding(this);
 9544     BasicType bt = Matcher::vector_element_basic_type(this);
 9545     int opc = this->ideal_Opcode();
 9546     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9547                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9548   %}
 9549   ins_pipe( pipe_slow );
 9550 %}
 9551 
 9552 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9553   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9554   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9555   ins_encode %{
 9556     int vlen_enc = vector_length_encoding(this);
 9557     BasicType bt = Matcher::vector_element_basic_type(this);
 9558     int opc = this->ideal_Opcode();
 9559     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9560                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9561   %}
 9562   ins_pipe( pipe_slow );
 9563 %}
 9564 
 9565 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9566   match(Set dst (SubVB (Binary dst src2) mask));
 9567   match(Set dst (SubVS (Binary dst src2) mask));
 9568   match(Set dst (SubVI (Binary dst src2) mask));
 9569   match(Set dst (SubVL (Binary dst src2) mask));
 9570   match(Set dst (SubVF (Binary dst src2) mask));
 9571   match(Set dst (SubVD (Binary dst src2) mask));
 9572   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9573   ins_encode %{
 9574     int vlen_enc = vector_length_encoding(this);
 9575     BasicType bt = Matcher::vector_element_basic_type(this);
 9576     int opc = this->ideal_Opcode();
 9577     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9578                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9579   %}
 9580   ins_pipe( pipe_slow );
 9581 %}
 9582 
 9583 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9584   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9585   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9586   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9587   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9588   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9589   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9590   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9591   ins_encode %{
 9592     int vlen_enc = vector_length_encoding(this);
 9593     BasicType bt = Matcher::vector_element_basic_type(this);
 9594     int opc = this->ideal_Opcode();
 9595     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9596                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9597   %}
 9598   ins_pipe( pipe_slow );
 9599 %}
 9600 
 9601 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9602   match(Set dst (MulVS (Binary dst src2) mask));
 9603   match(Set dst (MulVI (Binary dst src2) mask));
 9604   match(Set dst (MulVL (Binary dst src2) mask));
 9605   match(Set dst (MulVF (Binary dst src2) mask));
 9606   match(Set dst (MulVD (Binary dst src2) mask));
 9607   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9608   ins_encode %{
 9609     int vlen_enc = vector_length_encoding(this);
 9610     BasicType bt = Matcher::vector_element_basic_type(this);
 9611     int opc = this->ideal_Opcode();
 9612     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9613                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9614   %}
 9615   ins_pipe( pipe_slow );
 9616 %}
 9617 
 9618 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9619   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9620   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9621   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9622   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9623   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9624   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9625   ins_encode %{
 9626     int vlen_enc = vector_length_encoding(this);
 9627     BasicType bt = Matcher::vector_element_basic_type(this);
 9628     int opc = this->ideal_Opcode();
 9629     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9630                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9631   %}
 9632   ins_pipe( pipe_slow );
 9633 %}
 9634 
 9635 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9636   match(Set dst (SqrtVF dst mask));
 9637   match(Set dst (SqrtVD dst mask));
 9638   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9639   ins_encode %{
 9640     int vlen_enc = vector_length_encoding(this);
 9641     BasicType bt = Matcher::vector_element_basic_type(this);
 9642     int opc = this->ideal_Opcode();
 9643     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9644                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9650   match(Set dst (DivVF (Binary dst src2) mask));
 9651   match(Set dst (DivVD (Binary dst src2) mask));
 9652   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9653   ins_encode %{
 9654     int vlen_enc = vector_length_encoding(this);
 9655     BasicType bt = Matcher::vector_element_basic_type(this);
 9656     int opc = this->ideal_Opcode();
 9657     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9658                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9659   %}
 9660   ins_pipe( pipe_slow );
 9661 %}
 9662 
 9663 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9664   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9665   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9666   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9667   ins_encode %{
 9668     int vlen_enc = vector_length_encoding(this);
 9669     BasicType bt = Matcher::vector_element_basic_type(this);
 9670     int opc = this->ideal_Opcode();
 9671     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9672                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9673   %}
 9674   ins_pipe( pipe_slow );
 9675 %}
 9676 
 9678 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9679   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9680   match(Set dst (RotateRightV (Binary dst shift) mask));
 9681   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9682   ins_encode %{
 9683     int vlen_enc = vector_length_encoding(this);
 9684     BasicType bt = Matcher::vector_element_basic_type(this);
 9685     int opc = this->ideal_Opcode();
 9686     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9687                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9688   %}
 9689   ins_pipe( pipe_slow );
 9690 %}
 9691 
 9692 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9693   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9694   match(Set dst (RotateRightV (Binary dst src2) mask));
 9695   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9696   ins_encode %{
 9697     int vlen_enc = vector_length_encoding(this);
 9698     BasicType bt = Matcher::vector_element_basic_type(this);
 9699     int opc = this->ideal_Opcode();
 9700     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9701                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9702   %}
 9703   ins_pipe( pipe_slow );
 9704 %}
 9705 
 9706 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9707   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9708   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9709   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9710   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9711   ins_encode %{
 9712     int vlen_enc = vector_length_encoding(this);
 9713     BasicType bt = Matcher::vector_element_basic_type(this);
 9714     int opc = this->ideal_Opcode();
 9715     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9716                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9717   %}
 9718   ins_pipe( pipe_slow );
 9719 %}
 9720 
 9721 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9722   predicate(!n->as_ShiftV()->is_var_shift());
 9723   match(Set dst (LShiftVS (Binary dst src2) mask));
 9724   match(Set dst (LShiftVI (Binary dst src2) mask));
 9725   match(Set dst (LShiftVL (Binary dst src2) mask));
 9726   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9727   ins_encode %{
 9728     int vlen_enc = vector_length_encoding(this);
 9729     BasicType bt = Matcher::vector_element_basic_type(this);
 9730     int opc = this->ideal_Opcode();
 9731     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9732                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9733   %}
 9734   ins_pipe( pipe_slow );
 9735 %}
 9736 
 9737 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9738   predicate(n->as_ShiftV()->is_var_shift());
 9739   match(Set dst (LShiftVS (Binary dst src2) mask));
 9740   match(Set dst (LShiftVI (Binary dst src2) mask));
 9741   match(Set dst (LShiftVL (Binary dst src2) mask));
 9742   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9743   ins_encode %{
 9744     int vlen_enc = vector_length_encoding(this);
 9745     BasicType bt = Matcher::vector_element_basic_type(this);
 9746     int opc = this->ideal_Opcode();
 9747     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9748                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9749   %}
 9750   ins_pipe( pipe_slow );
 9751 %}
 9752 
 9753 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9754   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9755   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9756   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9757   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9758   ins_encode %{
 9759     int vlen_enc = vector_length_encoding(this);
 9760     BasicType bt = Matcher::vector_element_basic_type(this);
 9761     int opc = this->ideal_Opcode();
 9762     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9763                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9764   %}
 9765   ins_pipe( pipe_slow );
 9766 %}
 9767 
 9768 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9769   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9770   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9771   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9772   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9773   ins_encode %{
 9774     int vlen_enc = vector_length_encoding(this);
 9775     BasicType bt = Matcher::vector_element_basic_type(this);
 9776     int opc = this->ideal_Opcode();
 9777     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9778                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9779   %}
 9780   ins_pipe( pipe_slow );
 9781 %}
 9782 
 9783 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9784   predicate(!n->as_ShiftV()->is_var_shift());
 9785   match(Set dst (RShiftVS (Binary dst src2) mask));
 9786   match(Set dst (RShiftVI (Binary dst src2) mask));
 9787   match(Set dst (RShiftVL (Binary dst src2) mask));
 9788   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9789   ins_encode %{
 9790     int vlen_enc = vector_length_encoding(this);
 9791     BasicType bt = Matcher::vector_element_basic_type(this);
 9792     int opc = this->ideal_Opcode();
 9793     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9794                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9795   %}
 9796   ins_pipe( pipe_slow );
 9797 %}
 9798 
 9799 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9800   predicate(n->as_ShiftV()->is_var_shift());
 9801   match(Set dst (RShiftVS (Binary dst src2) mask));
 9802   match(Set dst (RShiftVI (Binary dst src2) mask));
 9803   match(Set dst (RShiftVL (Binary dst src2) mask));
 9804   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9805   ins_encode %{
 9806     int vlen_enc = vector_length_encoding(this);
 9807     BasicType bt = Matcher::vector_element_basic_type(this);
 9808     int opc = this->ideal_Opcode();
 9809     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9810                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9811   %}
 9812   ins_pipe( pipe_slow );
 9813 %}
 9814 
 9815 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9816   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9817   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9818   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9819   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9820   ins_encode %{
 9821     int vlen_enc = vector_length_encoding(this);
 9822     BasicType bt = Matcher::vector_element_basic_type(this);
 9823     int opc = this->ideal_Opcode();
 9824     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9825                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9826   %}
 9827   ins_pipe( pipe_slow );
 9828 %}
 9829 
 9830 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9831   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9832   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9833   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9834   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9835   ins_encode %{
 9836     int vlen_enc = vector_length_encoding(this);
 9837     BasicType bt = Matcher::vector_element_basic_type(this);
 9838     int opc = this->ideal_Opcode();
 9839     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9840                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9841   %}
 9842   ins_pipe( pipe_slow );
 9843 %}
 9844 
 9845 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9846   predicate(!n->as_ShiftV()->is_var_shift());
 9847   match(Set dst (URShiftVS (Binary dst src2) mask));
 9848   match(Set dst (URShiftVI (Binary dst src2) mask));
 9849   match(Set dst (URShiftVL (Binary dst src2) mask));
 9850   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9851   ins_encode %{
 9852     int vlen_enc = vector_length_encoding(this);
 9853     BasicType bt = Matcher::vector_element_basic_type(this);
 9854     int opc = this->ideal_Opcode();
 9855     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9856                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9857   %}
 9858   ins_pipe( pipe_slow );
 9859 %}
 9860 
 9861 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9862   predicate(n->as_ShiftV()->is_var_shift());
 9863   match(Set dst (URShiftVS (Binary dst src2) mask));
 9864   match(Set dst (URShiftVI (Binary dst src2) mask));
 9865   match(Set dst (URShiftVL (Binary dst src2) mask));
 9866   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9867   ins_encode %{
 9868     int vlen_enc = vector_length_encoding(this);
 9869     BasicType bt = Matcher::vector_element_basic_type(this);
 9870     int opc = this->ideal_Opcode();
 9871     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9872                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9873   %}
 9874   ins_pipe( pipe_slow );
 9875 %}
 9876 
 9877 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9878   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9879   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9880   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9881   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9882   ins_encode %{
 9883     int vlen_enc = vector_length_encoding(this);
 9884     BasicType bt = Matcher::vector_element_basic_type(this);
 9885     int opc = this->ideal_Opcode();
 9886     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9887                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9888   %}
 9889   ins_pipe( pipe_slow );
 9890 %}

instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
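
// The unary masked abs above passes $dst in both source slots to fit
// evmasked_op's binary operand shape; with merge masking the lanes whose mask
// bit is clear simply keep their original $dst contents.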

instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
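
// Reading the match rules above, the masked FMA patterns compute, per active
// lane and in the usual a*b+c shape,
//   dst = dst * src2 + src3
// with a single (fused) rounding step; masked-off lanes keep $dst. The memory
// variant folds the load of src3 into the FMA instruction itself.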

instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Dispatch on the comparison's per-element basic type: integral compares
    // map the BoolTest predicate to an EVEX integer predicate (with a
    // signed/unsigned flavor), FP compares to an EVEX FP predicate.
    if (is_integral_type(src1_elem_bt)) {
      bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
      Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
      switch (src1_elem_bt) {
        case T_BYTE:
          __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
          break;
        case T_SHORT:
          __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
          break;
        case T_INT:
          __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
          break;
        case T_LONG:
          __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
          break;
        default: assert(false, "%s", type2name(src1_elem_bt)); break;
      }
    } else {
      Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
      switch (src1_elem_bt) {
        case T_FLOAT:
          __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
          break;
        case T_DOUBLE:
          __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
          break;
        default: assert(false, "%s", type2name(src1_elem_bt)); break;
      }
    }
  %}
  ins_pipe( pipe_slow );
%}
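
// As a hedged illustration of the mapping above (the authoritative enum values
// live in the assembler headers): a BoolTest less-than on T_INT lanes becomes
// the EVEX integer predicate 'lt' handed to evpcmpd, with its signed flavor
// chosen via is_unsigned_booltest_pred, while the same test on T_FLOAT lanes
// becomes an ordered less-than FP predicate handed to evcmpps.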

instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
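
// Mask negation arrives here as XorVMask with MaskAll(-1), i.e. an XOR against
// an all-ones mask, which is how the ideal graph expresses a mask NOT.
// MacroAssembler::knot then selects the k-register NOT of the width implied by
// masklen; the LT8 variant needs the extra temporaries so that bits beyond the
// real mask length stay well-defined.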

instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t! long to mask evex operation" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
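
// With true predicate registers (bottom type isa_vectmask) the conversion is a
// single GPR-to-k move. As a hedged note, VectorLongToMask is the node shape
// produced for Vector API calls such as VectorMask.fromLong(species, bits);
// the AVX variants above must instead expand the bits into vector lanes.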
#endif

instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "mask types must match");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
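
// In the pattern above, masklen is widened to 16 when the mask is shorter and
// AVX512DQ is unavailable: the byte-width k-register logic instructions
// require DQ, so the word-width forms are used instead and any bits beyond the
// real mask length are don't-care.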

instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
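
// The func immediate of vpternlog above is a 3-input truth table: for each
// lane bit, the table index is (dst_bit << 2) | (src2_bit << 1) | src3_bit,
// and that bit of the immediate supplies the result. Two classic encodings,
// for illustration:
//   0xCA  ->  dst ? src2 : src3   (bitwise select)
//   0x96  ->  dst ^ src2 ^ src3   (three-way XOR)
// Merge masking keeps the masked-off lanes of $dst unchanged.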

instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}
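
// The cast patterns above emit no code (size 0, empty encoding): CastVV only
// adjusts the compile-time type of a value that already sits in the right
// register class, so the cast is free at the machine level.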

instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteF src));
  effect(TEMP ktmp, KILL cr);
  format %{ "float_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18); // 0x18: positive or negative infinity
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
%{
  match(Set dst (IsInfiniteD src));
  effect(TEMP ktmp, KILL cr);
  format %{ "double_class_check $dst, $src" %}
  ins_encode %{
    __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18); // 0x18: positive or negative infinity
    __ kmovbl($dst$$Register, $ktmp$$KRegister);
  %}
  ins_pipe(pipe_slow);
%}
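
// For reference (hedged; the authoritative table is the SDM's VFPCLASS
// description), the imm8 category bits are roughly: 0x01 QNaN, 0x02 +0,
// 0x04 -0, 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x40 finite negative,
// 0x80 SNaN; hence 0x18 selects exactly the two infinities for IsInfinite.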