//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
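//
// For example, the first definition below,
//
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
//
// marks XMM0 as Save-On-Call for both save types, spills it as a float
// (Op_RegF), and assigns it encoding 0, bound to the VMReg for xmm0.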

// XMM registers.  512-bit registers, i.e. sixteen 32-bit words each, labeled (a)-p.
// Word a in each register holds a Float; words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics, array copy stubs and
// superword operations (see the UseSSE42Intrinsics, UseXMMForArrayCopy and
// UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   no XMM registers are preserved across function calls;
//              XMM0-XMM7 might hold parameters.
// Windows ABI: XMM6-XMM31 are preserved across function calls;
//              XMM0-XMM3 might hold parameters.

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

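// A reg_class_dynamic selects between its two statically defined classes at
// code-generation time: the first (EVEX) class is used when the %{ ... %}
// predicate holds, otherwise the second (legacy) class is used.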
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

  965 // Class for all 128bit vector registers
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
// Class for evex 128bit vector registers
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
 1026 // Class for all 256bit vector registers
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
// Class for evex 256bit vector registers
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
 1087 // Class for all 512bit vector registers
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
 1124 // Class for restricted 512bit vector registers
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
 1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
 1204     // three 5 byte instructions plus one move for unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_unsigned_booltest_pred(int bt) {
 1250   return  ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
 1251 }
 1252 
 1253 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1254   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1255            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1256 }
 1257 
 1258 class Node::PD {
 1259 public:
 1260   enum NodeFlags {
 1261     Flag_intel_jcc_erratum = Node::_last_flag << 1,
 1262     _last_flag             = Flag_intel_jcc_erratum
 1263   };
 1264 };
 1265 
 1266 %} // end source_hpp
 1267 
 1268 source %{
 1269 
 1270 #include "opto/addnode.hpp"
 1271 #include "c2_intelJccErratum_x86.hpp"
 1272 
 1273 void PhaseOutput::pd_perform_mach_node_analysis() {
 1274   if (VM_Version::has_intel_jcc_erratum()) {
 1275     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1276     _buf_sizes._code += extra_padding;
 1277   }
 1278 }
 1279 
 1280 int MachNode::pd_alignment_required() const {
 1281   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1282     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1283     return IntelJccErratum::largest_jcc_size() + 1;
 1284   } else {
 1285     return 1;
 1286   }
 1287 }
 1288 
 1289 int MachNode::compute_padding(int current_offset) const {
 1290   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1291     Compile* C = Compile::current();
 1292     PhaseOutput* output = C->output();
 1293     Block* block = output->block();
 1294     int index = output->index();
 1295     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1296   } else {
 1297     return 0;
 1298   }
 1299 }
 1300 
 1301 // Emit exception handler code.
 1302 // Stuff framesize into a register and call a VM stub routine.
 1303 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1304 
 1305   // Note that the code buffer's insts_mark is always relative to insts.
 1306   // That's why we must use the macroassembler to generate a handler.
 1307   C2_MacroAssembler _masm(&cbuf);
 1308   address base = __ start_a_stub(size_exception_handler());
 1309   if (base == NULL) {
 1310     ciEnv::current()->record_failure("CodeCache is full");
 1311     return 0;  // CodeBuffer::expand failed
 1312   }
 1313   int offset = __ offset();
 1314   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1315   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1316   __ end_a_stub();
 1317   return offset;
 1318 }
 1319 
 1320 // Emit deopt handler code.
 1321 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1322 
 1323   // Note that the code buffer's insts_mark is always relative to insts.
 1324   // That's why we must use the macroassembler to generate a handler.
 1325   C2_MacroAssembler _masm(&cbuf);
 1326   address base = __ start_a_stub(size_deopt_handler());
 1327   if (base == NULL) {
 1328     ciEnv::current()->record_failure("CodeCache is full");
 1329     return 0;  // CodeBuffer::expand failed
 1330   }
 1331   int offset = __ offset();
 1332 
 1333 #ifdef _LP64
 1334   address the_pc = (address) __ pc();
 1335   Label next;
 1336   // push a "the_pc" on the stack without destroying any registers
 1337   // as they all may be live.
 1338 
 1339   // push address of "next"
 1340   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1341   __ bind(next);
 1342   // adjust it so it matches "the_pc"
 1343   __ subptr(Address(rsp, 0), __ offset() - offset);
 1344 #else
 1345   InternalAddress here(__ pc());
 1346   __ pushptr(here.addr(), noreg);
 1347 #endif
 1348 
 1349   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1350   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1351   __ end_a_stub();
 1352   return offset;
 1353 }
 1354 
 1355 Assembler::Width widthForType(BasicType bt) {
 1356   if (bt == T_BYTE) {
 1357     return Assembler::B;
 1358   } else if (bt == T_SHORT) {
 1359     return Assembler::W;
 1360   } else if (bt == T_INT) {
 1361     return Assembler::D;
 1362   } else {
 1363     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1364     return Assembler::Q;
 1365   }
 1366 }
 1367 
 1368 //=============================================================================
 1369 
 1370   // Float masks come from different places depending on platform.
 1371 #ifdef _LP64
 1372   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1373   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1374   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1375   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1376 #else
 1377   static address float_signmask()  { return (address)float_signmask_pool; }
 1378   static address float_signflip()  { return (address)float_signflip_pool; }
 1379   static address double_signmask() { return (address)double_signmask_pool; }
 1380   static address double_signflip() { return (address)double_signflip_pool; }
 1381 #endif
 1382   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1383   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1384   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1385   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1386   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1387   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1388   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1389   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1390   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1391   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1392   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1393   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1394   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1395 
 1396 //=============================================================================
 1397 const bool Matcher::match_rule_supported(int opcode) {
 1398   if (!has_match_rule(opcode)) {
 1399     return false; // no match rule present
 1400   }
 1401   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1402   switch (opcode) {
 1403     case Op_AbsVL:
 1404     case Op_StoreVectorScatter:
 1405       if (UseAVX < 3) {
 1406         return false;
 1407       }
 1408       break;
 1409     case Op_PopCountI:
 1410     case Op_PopCountL:
 1411       if (!UsePopCountInstruction) {
 1412         return false;
 1413       }
 1414       break;
 1415     case Op_PopCountVI:
 1416       if (UseAVX < 2) {
 1417         return false;
 1418       }
 1419       break;
 1420     case Op_PopCountVL:
 1421       if (UseAVX < 2) {
 1422         return false;
 1423       }
 1424       break;
 1425     case Op_MulVI:
 1426       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1427         return false;
 1428       }
 1429       break;
 1430     case Op_MulVL:
 1431       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1432         return false;
 1433       }
 1434       break;
 1435     case Op_MulReductionVL:
 1436       if (VM_Version::supports_avx512dq() == false) {
 1437         return false;
 1438       }
 1439       break;
 1440     case Op_AddReductionVL:
 1441       if (UseSSE < 2) { // requires at least SSE2
 1442         return false;
 1443       }
 1444       break;
 1445     case Op_AbsVB:
 1446     case Op_AbsVS:
 1447     case Op_AbsVI:
 1448     case Op_AddReductionVI:
 1449     case Op_AndReductionV:
 1450     case Op_OrReductionV:
 1451     case Op_XorReductionV:
 1452       if (UseSSE < 3) { // requires at least SSSE3
 1453         return false;
 1454       }
 1455       break;
 1456     case Op_VectorLoadShuffle:
 1457     case Op_VectorRearrange:
 1458     case Op_MulReductionVI:
 1459       if (UseSSE < 4) { // requires at least SSE4
 1460         return false;
 1461       }
 1462       break;
 1463     case Op_IsInfiniteF:
 1464     case Op_IsInfiniteD:
 1465       if (!VM_Version::supports_avx512dq()) {
 1466         return false;
 1467       }
 1468       break;
 1469     case Op_SqrtVD:
 1470     case Op_SqrtVF:
 1471     case Op_VectorMaskCmp:
 1472     case Op_VectorCastB2X:
 1473     case Op_VectorCastS2X:
 1474     case Op_VectorCastI2X:
 1475     case Op_VectorCastL2X:
 1476     case Op_VectorCastF2X:
 1477     case Op_VectorCastD2X:
 1478     case Op_VectorUCastB2X:
 1479     case Op_VectorUCastS2X:
 1480     case Op_VectorUCastI2X:
 1481       if (UseAVX < 1) { // enabled for AVX only
 1482         return false;
 1483       }
 1484       break;
 1485     case Op_PopulateIndex:
 1486       if (!is_LP64 || (UseAVX < 2)) {
 1487         return false;
 1488       }
 1489       break;
 1490     case Op_RoundVF:
 1491       if (UseAVX < 2) { // enabled for AVX2 only
 1492         return false;
 1493       }
 1494       break;
 1495     case Op_RoundVD:
 1496       if (UseAVX < 3) {
 1497         return false;  // enabled for AVX3 only
 1498       }
 1499       break;
 1500     case Op_CompareAndSwapL:
 1501 #ifdef _LP64
 1502     case Op_CompareAndSwapP:
 1503 #endif
 1504       if (!VM_Version::supports_cx8()) {
 1505         return false;
 1506       }
 1507       break;
 1508     case Op_CMoveVF:
 1509     case Op_CMoveVD:
 1510       if (UseAVX < 1) { // enabled for AVX only
 1511         return false;
 1512       }
 1513       break;
 1514     case Op_StrIndexOf:
 1515       if (!UseSSE42Intrinsics) {
 1516         return false;
 1517       }
 1518       break;
 1519     case Op_StrIndexOfChar:
 1520       if (!UseSSE42Intrinsics) {
 1521         return false;
 1522       }
 1523       break;
 1524     case Op_OnSpinWait:
 1525       if (VM_Version::supports_on_spin_wait() == false) {
 1526         return false;
 1527       }
 1528       break;
 1529     case Op_MulVB:
 1530     case Op_LShiftVB:
 1531     case Op_RShiftVB:
 1532     case Op_URShiftVB:
 1533     case Op_VectorInsert:
 1534     case Op_VectorLoadMask:
 1535     case Op_VectorStoreMask:
 1536     case Op_VectorBlend:
 1537       if (UseSSE < 4) {
 1538         return false;
 1539       }
 1540       break;
 1541 #ifdef _LP64
 1542     case Op_MaxD:
 1543     case Op_MaxF:
 1544     case Op_MinD:
 1545     case Op_MinF:
 1546       if (UseAVX < 1) { // enabled for AVX only
 1547         return false;
 1548       }
 1549       break;
 1550 #endif
 1551     case Op_CacheWB:
 1552     case Op_CacheWBPreSync:
 1553     case Op_CacheWBPostSync:
 1554       if (!VM_Version::supports_data_cache_line_flush()) {
 1555         return false;
 1556       }
 1557       break;
 1558     case Op_ExtractB:
 1559     case Op_ExtractL:
 1560     case Op_ExtractI:
 1561     case Op_RoundDoubleMode:
 1562       if (UseSSE < 4) {
 1563         return false;
 1564       }
 1565       break;
 1566     case Op_RoundDoubleModeV:
 1567       if (VM_Version::supports_avx() == false) {
 1568         return false; // 128bit vroundpd is not available
 1569       }
 1570       break;
 1571     case Op_LoadVectorGather:
 1572       if (UseAVX < 2) {
 1573         return false;
 1574       }
 1575       break;
 1576     case Op_FmaVD:
 1577     case Op_FmaVF:
 1578       if (!UseFMA) {
 1579         return false;
 1580       }
 1581       break;
 1582     case Op_MacroLogicV:
 1583       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1584         return false;
 1585       }
 1586       break;
 1587 
 1588     case Op_VectorCmpMasked:
 1589     case Op_VectorMaskGen:
 1590       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1591         return false;
 1592       }
 1593       break;
 1594     case Op_VectorMaskFirstTrue:
 1595     case Op_VectorMaskLastTrue:
 1596     case Op_VectorMaskTrueCount:
 1597     case Op_VectorMaskToLong:
 1598       if (!is_LP64 || UseAVX < 1) {
 1599          return false;
 1600       }
 1601       break;
 1602     case Op_RoundF:
 1603     case Op_RoundD:
 1604       if (!is_LP64) {
 1605         return false;
 1606       }
 1607       break;
 1608     case Op_CopySignD:
 1609     case Op_CopySignF:
 1610       if (UseAVX < 3 || !is_LP64)  {
 1611         return false;
 1612       }
 1613       if (!VM_Version::supports_avx512vl()) {
 1614         return false;
 1615       }
 1616       break;
 1617 #ifndef _LP64
 1618     case Op_AddReductionVF:
 1619     case Op_AddReductionVD:
 1620     case Op_MulReductionVF:
 1621     case Op_MulReductionVD:
 1622       if (UseSSE < 1) { // requires at least SSE
 1623         return false;
 1624       }
 1625       break;
 1626     case Op_MulAddVS2VI:
 1627     case Op_RShiftVL:
 1628     case Op_AbsVD:
 1629     case Op_NegVD:
 1630       if (UseSSE < 2) {
 1631         return false;
 1632       }
 1633       break;
 1634 #endif // !LP64
 1635     case Op_CompressBits:
 1636       if (!VM_Version::supports_bmi2() || (!is_LP64 && UseSSE < 2)) {
 1637         return false;
 1638       }
 1639       break;
 1640     case Op_ExpandBits:
 1641       if (!VM_Version::supports_bmi2() || (!is_LP64 && (UseSSE < 2 || !VM_Version::supports_bmi1()))) {
 1642         return false;
 1643       }
 1644       break;
 1645     case Op_SignumF:
 1646       if (UseSSE < 1) {
 1647         return false;
 1648       }
 1649       break;
 1650     case Op_SignumD:
 1651       if (UseSSE < 2) {
 1652         return false;
 1653       }
 1654       break;
 1655     case Op_CompressM:
 1656       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1657         return false;
 1658       }
 1659       break;
 1660     case Op_CompressV:
 1661     case Op_ExpandV:
 1662       if (!VM_Version::supports_avx512vl()) {
 1663         return false;
 1664       }
 1665       break;
 1666     case Op_SqrtF:
 1667       if (UseSSE < 1) {
 1668         return false;
 1669       }
 1670       break;
 1671     case Op_SqrtD:
 1672 #ifdef _LP64
 1673       if (UseSSE < 2) {
 1674         return false;
 1675       }
 1676 #else
 1677       // x86_32.ad has a special match rule for SqrtD.
 1678       // Together with common x86 rules, this handles all UseSSE cases.
 1679 #endif
 1680       break;
 1681   }
 1682   return true;  // Match rules are supported by default.
 1683 }
 1684 
 1685 //------------------------------------------------------------------------
 1686 
 1687 static inline bool is_pop_count_instr_target(BasicType bt) {
 1688   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1689          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1690 }
 1691 
 1692 const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) {
 1693   return match_rule_supported_vector(opcode, vlen, bt);
 1694 }
 1695 
// Identify extra cases in which we might want to provide match rules for vector nodes and
// other intrinsics guarded by vector length (vlen) and element type (bt).
 1698 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1699   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1700   if (!match_rule_supported(opcode)) {
 1701     return false;
 1702   }
 1703   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1704   //   * SSE2 supports 128bit vectors for all types;
 1705   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1706   //   * AVX2 supports 256bit vectors for all types;
 1707   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1708   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1709   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1710   // And MaxVectorSize is taken into account as well.
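  // Illustrative example: with UseAVX == 2, vector_width_in_bytes(T_LONG) is 32 bytes,
  // so an 8-element long vector (512 bits) is rejected here regardless of the opcode.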
 1711   if (!vector_size_supported(bt, vlen)) {
 1712     return false;
 1713   }
 1714   // Special cases which require vector length follow:
 1715   //   * implementation limitations
 1716   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1717   //   * 128bit vroundpd instruction is present only in AVX1
 1718   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1719   switch (opcode) {
 1720     case Op_AbsVF:
 1721     case Op_NegVF:
 1722       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1723         return false; // 512bit vandps and vxorps are not available
 1724       }
 1725       break;
 1726     case Op_AbsVD:
 1727     case Op_NegVD:
 1728     case Op_MulVL:
 1729       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1730         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1731       }
 1732       break;
 1733     case Op_CMoveVF:
 1734       if (vlen != 8) {
 1735         return false; // implementation limitation (only vcmov8F_reg is present)
 1736       }
 1737       break;
 1738     case Op_RotateRightV:
 1739     case Op_RotateLeftV:
 1740       if (bt != T_INT && bt != T_LONG) {
 1741         return false;
 1742       } // fallthrough
 1743     case Op_MacroLogicV:
 1744       if (!VM_Version::supports_evex() ||
 1745           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1746         return false;
 1747       }
 1748       break;
 1749     case Op_ClearArray:
 1750     case Op_VectorMaskGen:
 1751     case Op_VectorCmpMasked:
 1752       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1753         return false;
 1754       }
 1755       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1756         return false;
 1757       }
 1758       break;
 1759     case Op_LoadVectorMasked:
 1760     case Op_StoreVectorMasked:
 1761       if (!VM_Version::supports_avx512bw() && (is_subword_type(bt) || UseAVX < 1)) {
 1762         return false;
 1763       }
 1764       break;
 1765     case Op_CMoveVD:
 1766       if (vlen != 4) {
 1767         return false; // implementation limitation (only vcmov4D_reg is present)
 1768       }
 1769       break;
 1770     case Op_MaxV:
 1771     case Op_MinV:
 1772       if (UseSSE < 4 && is_integral_type(bt)) {
 1773         return false;
 1774       }
 1775       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1776           // Float/Double intrinsics are enabled for AVX family currently.
 1777           if (UseAVX == 0) {
 1778             return false;
 1779           }
 1780           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1781             return false;
 1782           }
 1783       }
 1784       break;
 1785     case Op_CallLeafVector:
 1786       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1787         return false;
 1788       }
 1789       break;
 1790     case Op_AddReductionVI:
 1791       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1792         return false;
 1793       }
 1794       // fallthrough
 1795     case Op_AndReductionV:
 1796     case Op_OrReductionV:
 1797     case Op_XorReductionV:
 1798       if (is_subword_type(bt) && (UseSSE < 4)) {
 1799         return false;
 1800       }
 1801 #ifndef _LP64
 1802       if (bt == T_BYTE || bt == T_LONG) {
 1803         return false;
 1804       }
 1805 #endif
 1806       break;
 1807 #ifndef _LP64
 1808     case Op_VectorInsert:
 1809       if (bt == T_LONG || bt == T_DOUBLE) {
 1810         return false;
 1811       }
 1812       break;
 1813 #endif
 1814     case Op_MinReductionV:
 1815     case Op_MaxReductionV:
 1816       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1817         return false;
 1818       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1819         return false;
 1820       }
 1821       // Float/Double intrinsics enabled for AVX family.
 1822       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1823         return false;
 1824       }
 1825       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1826         return false;
 1827       }
 1828 #ifndef _LP64
 1829       if (bt == T_BYTE || bt == T_LONG) {
 1830         return false;
 1831       }
 1832 #endif
 1833       break;
 1834     case Op_VectorTest:
 1835       if (UseSSE < 4) {
 1836         return false; // Implementation limitation
 1837       } else if (size_in_bits < 32) {
 1838         return false; // Implementation limitation
 1839       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
 1840         return false; // Implementation limitation
 1841       }
 1842       break;
 1843     case Op_VectorLoadShuffle:
 1844     case Op_VectorRearrange:
      if (vlen == 2) {
 1846         return false; // Implementation limitation due to how shuffle is loaded
 1847       } else if (size_in_bits == 256 && UseAVX < 2) {
 1848         return false; // Implementation limitation
 1849       }
 1850       break;
 1851     case Op_VectorLoadMask:
 1852       if (size_in_bits == 256 && UseAVX < 2) {
 1853         return false; // Implementation limitation
 1854       }
 1855       // fallthrough
 1856     case Op_VectorStoreMask:
 1857       if (vlen == 2) {
 1858         return false; // Implementation limitation
 1859       }
 1860       break;
 1861     case Op_PopulateIndex:
 1862       if (size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
 1863         return false;
 1864       }
 1865       break;
 1866     case Op_VectorCastB2X:
 1867     case Op_VectorCastS2X:
 1868     case Op_VectorCastI2X:
 1869       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1870         return false;
 1871       }
 1872       break;
 1873     case Op_VectorCastL2X:
 1874       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1875         return false;
 1876       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1877         return false;
 1878       }
 1879       break;
 1880     case Op_VectorCastD2X:
 1881       // Conversion to integral type is only supported on AVX-512 platforms with avx512dq.
 1882       // Need avx512vl for size_in_bits < 512
 1883       if (is_integral_type(bt)) {
 1884         if (!VM_Version::supports_avx512dq()) {
 1885           return false;
 1886         }
 1887         if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1888           return false;
 1889         }
 1890       }
 1891       break;
 1892     case Op_RoundVD:
 1893       if (!VM_Version::supports_avx512dq()) {
 1894         return false;
 1895       }
 1896       break;
 1897     case Op_VectorCastF2X:
      // F2I is supported on all AVX and above platforms.
      // Conversion to other integral types needs AVX512:
      //     conversion to long additionally needs avx512dq;
      //     avx512vl is needed for size_in_bits < 512.
 1902       if (is_integral_type(bt) && (bt != T_INT)) {
 1903         if (UseAVX <= 2) {
 1904           return false;
 1905         }
 1906         if ((bt == T_LONG) && !VM_Version::supports_avx512dq()) {
 1907           return false;
 1908         }
 1909         if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1910           return false;
 1911         }
 1912       }
 1913       break;
 1914     case Op_MulReductionVI:
 1915       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1916         return false;
 1917       }
 1918       break;
 1919     case Op_LoadVectorGatherMasked:
 1920     case Op_StoreVectorScatterMasked:
 1921     case Op_StoreVectorScatter:
 1922       if (is_subword_type(bt)) {
 1923         return false;
 1924       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1925         return false;
 1926       }
 1927       // fallthrough
 1928     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1930         return false;
 1931       }
 1932       break;
 1933     case Op_MaskAll:
 1934       if (!VM_Version::supports_evex()) {
 1935         return false;
 1936       }
 1937       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1938         return false;
 1939       }
 1940       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1941         return false;
 1942       }
 1943       break;
 1944     case Op_VectorMaskCmp:
 1945       if (vlen < 2 || size_in_bits < 32) {
 1946         return false;
 1947       }
 1948       break;
 1949     case Op_CompressM:
 1950       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1951         return false;
 1952       }
 1953       break;
 1954     case Op_CompressV:
 1955     case Op_ExpandV:
 1956       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1957         return false;
 1958       }
      if (size_in_bits < 128) {
 1960         return false;
 1961       }
 1962       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1963         return false;
 1964       }
 1965       break;
 1966     case Op_VectorLongToMask:
 1967       if (UseAVX < 1 || !is_LP64) {
 1968         return false;
 1969       }
 1970       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1971         return false;
 1972       }
 1973       break;
 1974     case Op_SignumVD:
 1975     case Op_SignumVF:
 1976       if (UseAVX < 1) {
 1977         return false;
 1978       }
 1979       break;
 1980     case Op_PopCountVI:
 1981     case Op_PopCountVL: {
 1982         if (!is_pop_count_instr_target(bt) &&
 1983             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1984           return false;
 1985         }
 1986       }
 1987       break;
 1988     case Op_ReverseV:
 1989     case Op_ReverseBytesV:
 1990       if (UseAVX < 2) {
 1991         return false;
 1992       }
 1993       break;
 1994     case Op_CountTrailingZerosV:
 1995     case Op_CountLeadingZerosV:
 1996       if (UseAVX < 2) {
 1997         return false;
 1998       }
 1999       break;
 2000   }
  return true;  // By default, match rules are supported.
 2002 }
 2003 
 2004 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern based
  // on the IR opcode. Most of the unary/binary/ternary masked operations share the IR nodes
  // of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine does a strict check on the existence of masked operation patterns
  // by returning a default false value for all opcodes apart from the
  // ones whose masked instruction patterns are defined in this file.
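  // Illustrative example: Op_AddVI appears in the switch below, so a masked vector
  // add of ints is supported (subject to the AVX512VL check that follows), whereas
  // an opcode with no masked pattern in this file falls through to the default
  // 'return false'.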
 2011   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 2012     return false;
 2013   }
 2014 
 2015   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 2016   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 2017   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 2018     return false;
 2019   }
 2020   switch(opcode) {
 2021     // Unary masked operations
    case Op_AbsVB:
    case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
    case Op_AbsVI:
 2028     case Op_AbsVL:
 2029       return true;
 2030 
 2031     // Ternary masked operations
 2032     case Op_FmaVF:
 2033     case Op_FmaVD:
 2034       return true;
 2035 
 2036     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 2038         return false;
 2039       }
 2040       return true;
 2041 
 2042     // Binary masked operations
 2043     case Op_AddVB:
 2044     case Op_AddVS:
 2045     case Op_SubVB:
 2046     case Op_SubVS:
 2047     case Op_MulVS:
 2048     case Op_LShiftVS:
 2049     case Op_RShiftVS:
 2050     case Op_URShiftVS:
 2051       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2052       if (!VM_Version::supports_avx512bw()) {
 2053         return false;  // Implementation limitation
 2054       }
 2055       return true;
 2056 
 2057     case Op_MulVL:
 2058       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2059       if (!VM_Version::supports_avx512dq()) {
 2060         return false;  // Implementation limitation
 2061       }
 2062       return true;
 2063 
 2064     case Op_AndV:
 2065     case Op_OrV:
 2066     case Op_XorV:
 2067     case Op_RotateRightV:
 2068     case Op_RotateLeftV:
 2069       if (bt != T_INT && bt != T_LONG) {
 2070         return false; // Implementation limitation
 2071       }
 2072       return true;
 2073 
 2074     case Op_VectorLoadMask:
 2075       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2076       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2077         return false;
 2078       }
 2079       return true;
 2080 
 2081     case Op_AddVI:
 2082     case Op_AddVL:
 2083     case Op_AddVF:
 2084     case Op_AddVD:
 2085     case Op_SubVI:
 2086     case Op_SubVL:
 2087     case Op_SubVF:
 2088     case Op_SubVD:
 2089     case Op_MulVI:
 2090     case Op_MulVF:
 2091     case Op_MulVD:
 2092     case Op_DivVF:
 2093     case Op_DivVD:
 2094     case Op_SqrtVF:
 2095     case Op_SqrtVD:
 2096     case Op_LShiftVI:
 2097     case Op_LShiftVL:
 2098     case Op_RShiftVI:
 2099     case Op_RShiftVL:
 2100     case Op_URShiftVI:
 2101     case Op_URShiftVL:
 2102     case Op_LoadVectorMasked:
 2103     case Op_StoreVectorMasked:
 2104     case Op_LoadVectorGatherMasked:
 2105     case Op_StoreVectorScatterMasked:
 2106       return true;
 2107 
 2108     case Op_MaxV:
 2109     case Op_MinV:
 2110       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2111         return false; // Implementation limitation
 2112       }
 2113       if (is_floating_point_type(bt)) {
 2114         return false; // Implementation limitation
 2115       }
 2116       return true;
 2117 
 2118     case Op_VectorMaskCmp:
 2119       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2120         return false; // Implementation limitation
 2121       }
 2122       return true;
 2123 
 2124     case Op_VectorRearrange:
 2125       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2126         return false; // Implementation limitation
 2127       }
 2128       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2129         return false; // Implementation limitation
 2130       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2131         return false; // Implementation limitation
 2132       }
 2133       return true;
 2134 
 2135     // Binary Logical operations
 2136     case Op_AndVMask:
 2137     case Op_OrVMask:
 2138     case Op_XorVMask:
 2139       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2140         return false; // Implementation limitation
 2141       }
 2142       return true;
 2143 
 2144     case Op_PopCountVI:
 2145     case Op_PopCountVL:
 2146       if (!is_pop_count_instr_target(bt)) {
 2147         return false;
 2148       }
 2149       return true;
 2150 
 2151     case Op_MaskAll:
 2152       return true;
 2153 
 2154     case Op_CountLeadingZerosV:
 2155       if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
 2156         return true;
 2157       }
 2158     default:
 2159       return false;
 2160   }
 2161 }
 2162 
 2163 const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
 2164   return false;
 2165 }
 2166 
 2167 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2168   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2169   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2170   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2171       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2172     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2173     return new legVecZOper();
 2174   }
 2175   if (legacy) {
 2176     switch (ideal_reg) {
 2177       case Op_VecS: return new legVecSOper();
 2178       case Op_VecD: return new legVecDOper();
 2179       case Op_VecX: return new legVecXOper();
 2180       case Op_VecY: return new legVecYOper();
 2181       case Op_VecZ: return new legVecZOper();
 2182     }
 2183   } else {
 2184     switch (ideal_reg) {
 2185       case Op_VecS: return new vecSOper();
 2186       case Op_VecD: return new vecDOper();
 2187       case Op_VecX: return new vecXOper();
 2188       case Op_VecY: return new vecYOper();
 2189       case Op_VecZ: return new vecZOper();
 2190     }
 2191   }
 2192   ShouldNotReachHere();
 2193   return NULL;
 2194 }
 2195 
 2196 bool Matcher::is_reg2reg_move(MachNode* m) {
 2197   switch (m->rule()) {
 2198     case MoveVec2Leg_rule:
 2199     case MoveLeg2Vec_rule:
 2200     case MoveF2VL_rule:
 2201     case MoveF2LEG_rule:
 2202     case MoveVL2F_rule:
 2203     case MoveLEG2F_rule:
 2204     case MoveD2VL_rule:
 2205     case MoveD2LEG_rule:
 2206     case MoveVL2D_rule:
 2207     case MoveLEG2D_rule:
 2208       return true;
 2209     default:
 2210       return false;
 2211   }
 2212 }
 2213 
 2214 bool Matcher::is_generic_vector(MachOper* opnd) {
 2215   switch (opnd->opcode()) {
 2216     case VEC:
 2217     case LEGVEC:
 2218       return true;
 2219     default:
 2220       return false;
 2221   }
 2222 }
 2223 
 2224 //------------------------------------------------------------------------
 2225 
 2226 const RegMask* Matcher::predicate_reg_mask(void) {
 2227   return &_VECTMASK_REG_mask;
 2228 }
 2229 
 2230 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2231   return new TypeVectMask(elemTy, length);
 2232 }
 2233 
 2234 // Max vector size in bytes. 0 if not supported.
 2235 const int Matcher::vector_width_in_bytes(BasicType bt) {
 2236   assert(is_java_primitive(bt), "only primitive type vectors");
 2237   if (UseSSE < 2) return 0;
 2238   // SSE2 supports 128bit vectors for all types.
 2239   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
 2241   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
 2242   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2243   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2244     size = (UseAVX > 2) ? 64 : 32;
 2245   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2246     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2247   // Use flag to limit vector size.
 2248   size = MIN2(size,(int)MaxVectorSize);
 2249   // Minimum 2 values in vector (or 4 for bytes).
 2250   switch (bt) {
 2251   case T_DOUBLE:
 2252   case T_LONG:
 2253     if (size < 16) return 0;
 2254     break;
 2255   case T_FLOAT:
 2256   case T_INT:
 2257     if (size < 8) return 0;
 2258     break;
 2259   case T_BOOLEAN:
 2260     if (size < 4) return 0;
 2261     break;
 2262   case T_CHAR:
 2263     if (size < 4) return 0;
 2264     break;
 2265   case T_BYTE:
 2266     if (size < 4) return 0;
 2267     break;
 2268   case T_SHORT:
 2269     if (size < 4) return 0;
 2270     break;
 2271   default:
 2272     ShouldNotReachHere();
 2273   }
 2274   return size;
 2275 }
 2276 
 2277 // Limits on vector size (number of elements) loaded into vector.
 2278 const int Matcher::max_vector_size(const BasicType bt) {
 2279   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2280 }
 2281 const int Matcher::min_vector_size(const BasicType bt) {
 2282   int max_size = max_vector_size(bt);
  // The smallest vector that can be loaded is 4 bytes: 4 byte-sized elements or 2 wider elements.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Support calling SVML routines on double64 (single-element double) vectors.
 2286   if (bt == T_DOUBLE) {
 2287     size = 1;
 2288   }
 2289   return MIN2(size,max_size);
 2290 }
 2291 
 2292 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2293   return -1;
 2294 }
 2295 
 2296 // Vector ideal reg corresponding to specified size in bytes
 2297 const uint Matcher::vector_ideal_reg(int size) {
 2298   assert(MaxVectorSize >= size, "");
 2299   switch(size) {
 2300     case  4: return Op_VecS;
 2301     case  8: return Op_VecD;
 2302     case 16: return Op_VecX;
 2303     case 32: return Op_VecY;
 2304     case 64: return Op_VecZ;
 2305   }
 2306   ShouldNotReachHere();
 2307   return 0;
 2308 }
 2309 
 2310 // Check for shift by small constant as well
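// On x86, a left shift by 0..3 corresponds to the 1/2/4/8 scale factors available
// in addressing modes, so such shifts can be folded into the address computation.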
 2311 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2312   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2313       shift->in(2)->get_int() <= 3 &&
 2314       // Are there other uses besides address expressions?
 2315       !matcher->is_visited(shift)) {
 2316     address_visited.set(shift->_idx); // Flag as address_visited
 2317     mstack.push(shift->in(2), Matcher::Visit);
 2318     Node *conv = shift->in(1);
 2319 #ifdef _LP64
    // Allow the Matcher to match a rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // when the index value is known to be positive.
 2323     if (conv->Opcode() == Op_ConvI2L &&
 2324         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2325         // Are there other uses besides address expressions?
 2326         !matcher->is_visited(conv)) {
 2327       address_visited.set(conv->_idx); // Flag as address_visited
 2328       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2329     } else
 2330 #endif
 2331       mstack.push(conv, Matcher::Pre_Visit);
 2332     return true;
 2333   }
 2334   return false;
 2335 }
 2336 
// This function identifies sub-graphs in which a 'load' node is
// an input to two different nodes, such that the whole pattern can be
// matched with BMI instructions like blsi, blsr, etc.
// For example, b = -a[i] & a[i] can be matched to blsi r32, m32.
// The corresponding graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same load node.
 2343 //
 2344 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2345 // This is a temporary solution until we make DAGs expressible in ADL.
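//
// A minimal usage sketch (mirroring is_bmi_pattern() below): to recognize
// b = -a[i] & a[i], i.e. (AndI (SubI 0 load) load), one would write
//   FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
//   bmii.match(Op_AndI, -1,   // AndI is commutative: try both inputs for SubI
//              Op_SubI,  1,   // SubI's constant must be its input 1 ...
//              0);            // ... with value 0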
 2346 template<typename ConType>
 2347 class FusedPatternMatcher {
 2348   Node* _op1_node;
 2349   Node* _mop_node;
 2350   int _con_op;
 2351 
 2352   static int match_next(Node* n, int next_op, int next_op_idx) {
 2353     if (n->in(1) == NULL || n->in(2) == NULL) {
 2354       return -1;
 2355     }
 2356 
 2357     if (next_op_idx == -1) { // n is commutative, try rotations
 2358       if (n->in(1)->Opcode() == next_op) {
 2359         return 1;
 2360       } else if (n->in(2)->Opcode() == next_op) {
 2361         return 2;
 2362       }
 2363     } else {
 2364       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2365       if (n->in(next_op_idx)->Opcode() == next_op) {
 2366         return next_op_idx;
 2367       }
 2368     }
 2369     return -1;
 2370   }
 2371 
 2372  public:
 2373   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2374     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2375 
 2376   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2377              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2378              typename ConType::NativeType con_value) {
 2379     if (_op1_node->Opcode() != op1) {
 2380       return false;
 2381     }
 2382     if (_mop_node->outcnt() > 2) {
 2383       return false;
 2384     }
 2385     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2386     if (op1_op2_idx == -1) {
 2387       return false;
 2388     }
 2389     // Memory operation must be the other edge
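    // ((idx & 1) + 1) maps 1 -> 2 and 2 -> 1, i.e. selects the other input.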
 2390     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2391 
 2392     // Check that the mop node is really what we want
 2393     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2394       Node* op2_node = _op1_node->in(op1_op2_idx);
 2395       if (op2_node->outcnt() > 1) {
 2396         return false;
 2397       }
 2398       assert(op2_node->Opcode() == op2, "Should be");
 2399       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2400       if (op2_con_idx == -1) {
 2401         return false;
 2402       }
 2403       // Memory operation must be the other edge
 2404       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2405       // Check that the memory operation is the same node
 2406       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2407         // Now check the constant
 2408         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2409         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2410           return true;
 2411         }
 2412       }
 2413     }
 2414     return false;
 2415   }
 2416 };
 2417 
 2418 static bool is_bmi_pattern(Node* n, Node* m) {
 2419   assert(UseBMI1Instructions, "sanity");
 2420   if (n != NULL && m != NULL) {
 2421     if (m->Opcode() == Op_LoadI) {
 2422       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2423       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2424              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2425              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2426     } else if (m->Opcode() == Op_LoadL) {
 2427       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2428       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2429              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2430              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2431     }
 2432   }
 2433   return false;
 2434 }
 2435 
 2436 // Should the matcher clone input 'm' of node 'n'?
 2437 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2438   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2439   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2440     mstack.push(m, Visit);
 2441     return true;
 2442   }
 2443   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2444     mstack.push(m, Visit);           // m = ShiftCntV
 2445     return true;
 2446   }
 2447   return false;
 2448 }
 2449 
 2450 // Should the Matcher clone shifts on addressing modes, expecting them
 2451 // to be subsumed into complex addressing expressions or compute them
 2452 // into registers?
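// For example, an address of the form base + (index << 2) + offset can be folded
// into a single [base + index*4 + disp] x86 addressing mode rather than computing
// the scaled index into a separate register first.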
 2453 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2454   Node *off = m->in(AddPNode::Offset);
 2455   if (off->is_Con()) {
 2456     address_visited.test_set(m->_idx); // Flag as address_visited
 2457     Node *adr = m->in(AddPNode::Address);
 2458 
 2459     // Intel can handle 2 adds in addressing mode
 2460     // AtomicAdd is not an addressing expression.
 2461     // Cheap to find it by looking for screwy base.
 2462     if (adr->is_AddP() &&
 2463         !adr->in(AddPNode::Base)->is_top() &&
 2464         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2465         // Are there other uses besides address expressions?
 2466         !is_visited(adr)) {
 2467       address_visited.set(adr->_idx); // Flag as address_visited
 2468       Node *shift = adr->in(AddPNode::Offset);
 2469       if (!clone_shift(shift, this, mstack, address_visited)) {
 2470         mstack.push(shift, Pre_Visit);
 2471       }
 2472       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2473       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2474     } else {
 2475       mstack.push(adr, Pre_Visit);
 2476     }
 2477 
 2478     // Clone X+offset as it also folds into most addressing expressions
 2479     mstack.push(off, Visit);
 2480     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2481     return true;
 2482   } else if (clone_shift(off, this, mstack, address_visited)) {
 2483     address_visited.test_set(m->_idx); // Flag as address_visited
 2484     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2485     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2486     return true;
 2487   }
 2488   return false;
 2489 }
 2490 
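// Map a BoolTest condition code to the Assembler comparison predicate used for
// vector compares. Signed and unsigned tests map to the same predicate; the
// signedness is selected by the compare instruction itself.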
 2491 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2492   switch (bt) {
 2493     case BoolTest::eq:
 2494       return Assembler::eq;
 2495     case BoolTest::ne:
 2496       return Assembler::neq;
 2497     case BoolTest::le:
 2498     case BoolTest::ule:
 2499       return Assembler::le;
 2500     case BoolTest::ge:
 2501     case BoolTest::uge:
 2502       return Assembler::nlt;
 2503     case BoolTest::lt:
 2504     case BoolTest::ult:
 2505       return Assembler::lt;
 2506     case BoolTest::gt:
 2507     case BoolTest::ugt:
 2508       return Assembler::nle;
 2509     default : ShouldNotReachHere(); return Assembler::_false;
 2510   }
 2511 }
 2512 
 2513 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2514   switch (bt) {
 2515   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2516   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2517   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2518   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2519   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2520   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2521   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2522   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2523   }
 2524 }
 2525 
 2526 // Helper methods for MachSpillCopyNode::implementation().
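// vec_mov_helper emits a register-to-register vector copy when a CodeBuffer is
// supplied, and only formats the instruction for the non-PRODUCT printout otherwise;
// vec_spill_helper below does the same for vector loads/stores to a stack slot.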
 2527 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2528                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 2529   assert(ireg == Op_VecS || // 32bit vector
 2530          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 2531          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 2532          "no non-adjacent vector moves" );
 2533   if (cbuf) {
 2534     C2_MacroAssembler _masm(cbuf);
 2535     switch (ireg) {
 2536     case Op_VecS: // copy whole register
 2537     case Op_VecD:
 2538     case Op_VecX:
 2539 #ifndef _LP64
 2540       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2541 #else
 2542       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2543         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2544       } else {
 2545         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2547 #endif
 2548       break;
 2549     case Op_VecY:
 2550 #ifndef _LP64
 2551       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2552 #else
 2553       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2554         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2555       } else {
 2556         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
 2558 #endif
 2559       break;
 2560     case Op_VecZ:
 2561       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2562       break;
 2563     default:
 2564       ShouldNotReachHere();
 2565     }
 2566 #ifndef PRODUCT
 2567   } else {
 2568     switch (ireg) {
 2569     case Op_VecS:
 2570     case Op_VecD:
 2571     case Op_VecX:
 2572       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2573       break;
 2574     case Op_VecY:
 2575     case Op_VecZ:
 2576       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2577       break;
 2578     default:
 2579       ShouldNotReachHere();
 2580     }
 2581 #endif
 2582   }
 2583 }
 2584 
 2585 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2586                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2587   if (cbuf) {
 2588     C2_MacroAssembler _masm(cbuf);
 2589     if (is_load) {
 2590       switch (ireg) {
 2591       case Op_VecS:
 2592         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2593         break;
 2594       case Op_VecD:
 2595         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2596         break;
 2597       case Op_VecX:
 2598 #ifndef _LP64
 2599         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2600 #else
 2601         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2602           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2603         } else {
 2604           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2605           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2606         }
 2607 #endif
 2608         break;
 2609       case Op_VecY:
 2610 #ifndef _LP64
 2611         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2612 #else
 2613         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2614           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2615         } else {
 2616           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2617           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
 2618         }
 2619 #endif
 2620         break;
 2621       case Op_VecZ:
 2622         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2623         break;
 2624       default:
 2625         ShouldNotReachHere();
 2626       }
 2627     } else { // store
 2628       switch (ireg) {
 2629       case Op_VecS:
 2630         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2631         break;
 2632       case Op_VecD:
 2633         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2634         break;
 2635       case Op_VecX:
 2636 #ifndef _LP64
 2637         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2638 #else
 2639         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2640           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        } else {
 2643           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2644         }
 2645 #endif
 2646         break;
 2647       case Op_VecY:
 2648 #ifndef _LP64
 2649         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2650 #else
 2651         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2652           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        } else {
 2655           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2656         }
 2657 #endif
 2658         break;
 2659       case Op_VecZ:
 2660         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2661         break;
 2662       default:
 2663         ShouldNotReachHere();
 2664       }
 2665     }
 2666 #ifndef PRODUCT
 2667   } else {
 2668     if (is_load) {
 2669       switch (ireg) {
 2670       case Op_VecS:
 2671         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2672         break;
 2673       case Op_VecD:
 2674         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2675         break;
      case Op_VecX:
 2677         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2678         break;
 2679       case Op_VecY:
 2680       case Op_VecZ:
 2681         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2682         break;
 2683       default:
 2684         ShouldNotReachHere();
 2685       }
 2686     } else { // store
 2687       switch (ireg) {
 2688       case Op_VecS:
 2689         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2690         break;
 2691       case Op_VecD:
 2692         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2693         break;
      case Op_VecX:
 2695         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2696         break;
 2697       case Op_VecY:
 2698       case Op_VecZ:
 2699         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2700         break;
 2701       default:
 2702         ShouldNotReachHere();
 2703       }
 2704     }
 2705 #endif
 2706   }
 2707 }
 2708 
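// Replicate the immediate 'con' of element type 'bt' into a GrowableArray of 'len'
// jvalues, one per vector lane (used when materializing replicated vector constants).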
 2709 template <class T>
 2710 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2711   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2712   jvalue ele;
 2713   switch (bt) {
 2714     case T_BYTE:   ele.b = con; break;
 2715     case T_SHORT:  ele.s = con; break;
 2716     case T_INT:    ele.i = con; break;
 2717     case T_LONG:   ele.j = con; break;
 2718     case T_FLOAT:  ele.f = con; break;
 2719     case T_DOUBLE: ele.d = con; break;
 2720     default: ShouldNotReachHere();
 2721   }
 2722   for (int i = 0; i < len; i++) {
 2723     val->append(ele);
 2724   }
 2725   return val;
 2726 }
 2727 
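// Return a 64-bit pattern with the sign (highest) bit of every element of type 'bt'
// set, e.g. 0x80 repeated in every byte for T_BYTE.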
 2728 static inline jlong high_bit_set(BasicType bt) {
 2729   switch (bt) {
 2730     case T_BYTE:  return 0x8080808080808080;
 2731     case T_SHORT: return 0x8000800080008000;
 2732     case T_INT:   return 0x8000000080000000;
 2733     case T_LONG:  return 0x8000000000000000;
 2734     default:
 2735       ShouldNotReachHere();
 2736       return 0;
 2737   }
 2738 }
 2739 
 2740 #ifndef PRODUCT
 2741   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2742     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2743   }
 2744 #endif
 2745 
 2746   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2747     C2_MacroAssembler _masm(&cbuf);
 2748     __ nop(_count);
 2749   }
 2750 
 2751   uint MachNopNode::size(PhaseRegAlloc*) const {
 2752     return _count;
 2753   }
 2754 
 2755 #ifndef PRODUCT
 2756   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2757     st->print("# breakpoint");
 2758   }
 2759 #endif
 2760 
 2761   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2762     C2_MacroAssembler _masm(&cbuf);
 2763     __ int3();
 2764   }
 2765 
 2766   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2767     return MachNode::size(ra_);
 2768   }
 2769 
 2770 %}
 2771 
 2772 encode %{
 2773 
 2774   enc_class call_epilog %{
 2775     C2_MacroAssembler _masm(&cbuf);
 2776     if (VerifyStackAtCalls) {
 2777       // Check that stack depth is unchanged: find majik cookie on stack
 2778       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2779       Label L;
 2780       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2781       __ jccb(Assembler::equal, L);
 2782       // Die if stack mismatch
 2783       __ int3();
 2784       __ bind(L);
 2785     }
 2786     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
 2787       C2_MacroAssembler _masm(&cbuf);
 2788       if (!_method->signature()->returns_null_free_inline_type()) {
        // The last return value is not set by the callee but is used to pass IsInit information to compiled code.
        // Search for the corresponding projection, get the register and emit code that initializes it.
 2791         uint con = (tf()->range_cc()->cnt() - 1);
 2792         for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
 2793           ProjNode* proj = fast_out(i)->as_Proj();
 2794           if (proj->_con == con) {
 2795             // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
 2796             OptoReg::Name optoReg = ra_->get_reg_first(proj);
 2797             VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
 2798             Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
 2799             __ testq(rax, rax);
 2800             __ set_byte_if_not_zero(toReg);
 2801             __ movzbl(toReg, toReg);
 2802             if (reg->is_stack()) {
 2803               int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
 2804               __ movq(Address(rsp, st_off), toReg);
 2805             }
 2806             break;
 2807           }
 2808         }
 2809       }
 2810       if (return_value_is_used()) {
 2811         // An inline type is returned as fields in multiple registers.
 2812         // Rax either contains an oop if the inline type is buffered or a pointer
 2813         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
 2814         // if the lowest bit is set to allow C2 to use the oop after null checking.
 2815         // rax &= (rax & 1) - 1
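        // If the lowest bit is set, (rax & 1) - 1 == 0 and the 'and' clears rax;
        // otherwise (rax & 1) - 1 == -1 (all ones) and rax is left unchanged.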
 2816         __ movptr(rscratch1, rax);
 2817         __ andptr(rscratch1, 0x1);
 2818         __ subptr(rscratch1, 0x1);
 2819         __ andptr(rax, rscratch1);
 2820       }
 2821     }
 2822   %}
 2823 
 2824 %}
 2825 
// Operands for bound floating point register arguments
 2827 operand rxmm0() %{
 2828   constraint(ALLOC_IN_RC(xmm0_reg));
 2829   match(VecX);
 2830   format%{%}
 2831   interface(REG_INTER);
 2832 %}
 2833 
 2834 //----------OPERANDS-----------------------------------------------------------
 2835 // Operand definitions must precede instruction definitions for correct parsing
 2836 // in the ADLC because operands constitute user defined types which are used in
 2837 // instruction definitions.
 2838 
 2839 // Vectors
 2840 
 2841 // Dummy generic vector class. Should be used for all vector operands.
 2842 // Replaced with vec[SDXYZ] during post-selection pass.
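// (The specialization is performed by Matcher::pd_specialize_generic_vector_operand above.)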
 2843 operand vec() %{
 2844   constraint(ALLOC_IN_RC(dynamic));
 2845   match(VecX);
 2846   match(VecY);
 2847   match(VecZ);
 2848   match(VecS);
 2849   match(VecD);
 2850 
 2851   format %{ %}
 2852   interface(REG_INTER);
 2853 %}
 2854 
 2855 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2856 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2857 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2858 // runtime code generation via reg_class_dynamic.
 2859 operand legVec() %{
 2860   constraint(ALLOC_IN_RC(dynamic));
 2861   match(VecX);
 2862   match(VecY);
 2863   match(VecZ);
 2864   match(VecS);
 2865   match(VecD);
 2866 
 2867   format %{ %}
 2868   interface(REG_INTER);
 2869 %}
 2870 
 2871 // Replaces vec during post-selection cleanup. See above.
 2872 operand vecS() %{
 2873   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2874   match(VecS);
 2875 
 2876   format %{ %}
 2877   interface(REG_INTER);
 2878 %}
 2879 
 2880 // Replaces legVec during post-selection cleanup. See above.
 2881 operand legVecS() %{
 2882   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2883   match(VecS);
 2884 
 2885   format %{ %}
 2886   interface(REG_INTER);
 2887 %}
 2888 
 2889 // Replaces vec during post-selection cleanup. See above.
 2890 operand vecD() %{
 2891   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2892   match(VecD);
 2893 
 2894   format %{ %}
 2895   interface(REG_INTER);
 2896 %}
 2897 
 2898 // Replaces legVec during post-selection cleanup. See above.
 2899 operand legVecD() %{
 2900   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2901   match(VecD);
 2902 
 2903   format %{ %}
 2904   interface(REG_INTER);
 2905 %}
 2906 
 2907 // Replaces vec during post-selection cleanup. See above.
 2908 operand vecX() %{
 2909   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2910   match(VecX);
 2911 
 2912   format %{ %}
 2913   interface(REG_INTER);
 2914 %}
 2915 
 2916 // Replaces legVec during post-selection cleanup. See above.
 2917 operand legVecX() %{
 2918   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2919   match(VecX);
 2920 
 2921   format %{ %}
 2922   interface(REG_INTER);
 2923 %}
 2924 
 2925 // Replaces vec during post-selection cleanup. See above.
 2926 operand vecY() %{
 2927   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2928   match(VecY);
 2929 
 2930   format %{ %}
 2931   interface(REG_INTER);
 2932 %}
 2933 
 2934 // Replaces legVec during post-selection cleanup. See above.
 2935 operand legVecY() %{
 2936   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2937   match(VecY);
 2938 
 2939   format %{ %}
 2940   interface(REG_INTER);
 2941 %}
 2942 
 2943 // Replaces vec during post-selection cleanup. See above.
 2944 operand vecZ() %{
 2945   constraint(ALLOC_IN_RC(vectorz_reg));
 2946   match(VecZ);
 2947 
 2948   format %{ %}
 2949   interface(REG_INTER);
 2950 %}
 2951 
 2952 // Replaces legVec during post-selection cleanup. See above.
 2953 operand legVecZ() %{
 2954   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2955   match(VecZ);
 2956 
 2957   format %{ %}
 2958   interface(REG_INTER);
 2959 %}
 2960 
 2961 // Comparison Code for FP conditional move
 2962 operand cmpOp_vcmppd() %{
 2963   match(Bool);
 2964 
 2965   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
 2966             n->as_Bool()->_test._test != BoolTest::no_overflow);
 2967   format %{ "" %}
 2968   interface(COND_INTER) %{
 2969     equal        (0x0, "eq");
 2970     less         (0x1, "lt");
 2971     less_equal   (0x2, "le");
 2972     not_equal    (0xC, "ne");
 2973     greater_equal(0xD, "ge");
 2974     greater      (0xE, "gt");
    // TODO: adlc cannot compile this operand without the next two lines; it fails with:
    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
    // equal' for overflow.
 2978     overflow     (0x20, "o");  // not really supported by the instruction
 2979     no_overflow  (0x21, "no"); // not really supported by the instruction
 2980   %}
 2981 %}
 2982 
 2983 
 2984 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2985 
 2986 // ============================================================================
 2987 
 2988 instruct ShouldNotReachHere() %{
 2989   match(Halt);
 2990   format %{ "stop\t# ShouldNotReachHere" %}
 2991   ins_encode %{
 2992     if (is_reachable()) {
 2993       __ stop(_halt_reason);
 2994     }
 2995   %}
 2996   ins_pipe(pipe_slow);
 2997 %}
 2998 
 2999 // ============================================================================
 3000 
 3001 instruct addF_reg(regF dst, regF src) %{
 3002   predicate((UseSSE>=1) && (UseAVX == 0));
 3003   match(Set dst (AddF dst src));
 3004 
 3005   format %{ "addss   $dst, $src" %}
 3006   ins_cost(150);
 3007   ins_encode %{
 3008     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 3009   %}
 3010   ins_pipe(pipe_slow);
 3011 %}
 3012 
 3013 instruct addF_mem(regF dst, memory src) %{
 3014   predicate((UseSSE>=1) && (UseAVX == 0));
 3015   match(Set dst (AddF dst (LoadF src)));
 3016 
 3017   format %{ "addss   $dst, $src" %}
 3018   ins_cost(150);
 3019   ins_encode %{
 3020     __ addss($dst$$XMMRegister, $src$$Address);
 3021   %}
 3022   ins_pipe(pipe_slow);
 3023 %}
 3024 
 3025 instruct addF_imm(regF dst, immF con) %{
 3026   predicate((UseSSE>=1) && (UseAVX == 0));
 3027   match(Set dst (AddF dst con));
 3028   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3029   ins_cost(150);
 3030   ins_encode %{
 3031     __ addss($dst$$XMMRegister, $constantaddress($con));
 3032   %}
 3033   ins_pipe(pipe_slow);
 3034 %}
 3035 
 3036 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 3037   predicate(UseAVX > 0);
 3038   match(Set dst (AddF src1 src2));
 3039 
 3040   format %{ "vaddss  $dst, $src1, $src2" %}
 3041   ins_cost(150);
 3042   ins_encode %{
 3043     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3044   %}
 3045   ins_pipe(pipe_slow);
 3046 %}
 3047 
 3048 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 3049   predicate(UseAVX > 0);
 3050   match(Set dst (AddF src1 (LoadF src2)));
 3051 
 3052   format %{ "vaddss  $dst, $src1, $src2" %}
 3053   ins_cost(150);
 3054   ins_encode %{
 3055     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3056   %}
 3057   ins_pipe(pipe_slow);
 3058 %}
 3059 
 3060 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 3061   predicate(UseAVX > 0);
 3062   match(Set dst (AddF src con));
 3063 
 3064   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3065   ins_cost(150);
 3066   ins_encode %{
 3067     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3068   %}
 3069   ins_pipe(pipe_slow);
 3070 %}
 3071 
 3072 instruct addD_reg(regD dst, regD src) %{
 3073   predicate((UseSSE>=2) && (UseAVX == 0));
 3074   match(Set dst (AddD dst src));
 3075 
 3076   format %{ "addsd   $dst, $src" %}
 3077   ins_cost(150);
 3078   ins_encode %{
 3079     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 3080   %}
 3081   ins_pipe(pipe_slow);
 3082 %}
 3083 
 3084 instruct addD_mem(regD dst, memory src) %{
 3085   predicate((UseSSE>=2) && (UseAVX == 0));
 3086   match(Set dst (AddD dst (LoadD src)));
 3087 
 3088   format %{ "addsd   $dst, $src" %}
 3089   ins_cost(150);
 3090   ins_encode %{
 3091     __ addsd($dst$$XMMRegister, $src$$Address);
 3092   %}
 3093   ins_pipe(pipe_slow);
 3094 %}
 3095 
 3096 instruct addD_imm(regD dst, immD con) %{
 3097   predicate((UseSSE>=2) && (UseAVX == 0));
 3098   match(Set dst (AddD dst con));
 3099   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3100   ins_cost(150);
 3101   ins_encode %{
 3102     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3103   %}
 3104   ins_pipe(pipe_slow);
 3105 %}
 3106 
 3107 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3108   predicate(UseAVX > 0);
 3109   match(Set dst (AddD src1 src2));
 3110 
 3111   format %{ "vaddsd  $dst, $src1, $src2" %}
 3112   ins_cost(150);
 3113   ins_encode %{
 3114     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3115   %}
 3116   ins_pipe(pipe_slow);
 3117 %}
 3118 
 3119 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3120   predicate(UseAVX > 0);
 3121   match(Set dst (AddD src1 (LoadD src2)));
 3122 
 3123   format %{ "vaddsd  $dst, $src1, $src2" %}
 3124   ins_cost(150);
 3125   ins_encode %{
 3126     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3127   %}
 3128   ins_pipe(pipe_slow);
 3129 %}
 3130 
 3131 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3132   predicate(UseAVX > 0);
 3133   match(Set dst (AddD src con));
 3134 
 3135   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3136   ins_cost(150);
 3137   ins_encode %{
 3138     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3139   %}
 3140   ins_pipe(pipe_slow);
 3141 %}
 3142 
 3143 instruct subF_reg(regF dst, regF src) %{
 3144   predicate((UseSSE>=1) && (UseAVX == 0));
 3145   match(Set dst (SubF dst src));
 3146 
 3147   format %{ "subss   $dst, $src" %}
 3148   ins_cost(150);
 3149   ins_encode %{
 3150     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3151   %}
 3152   ins_pipe(pipe_slow);
 3153 %}
 3154 
 3155 instruct subF_mem(regF dst, memory src) %{
 3156   predicate((UseSSE>=1) && (UseAVX == 0));
 3157   match(Set dst (SubF dst (LoadF src)));
 3158 
 3159   format %{ "subss   $dst, $src" %}
 3160   ins_cost(150);
 3161   ins_encode %{
 3162     __ subss($dst$$XMMRegister, $src$$Address);
 3163   %}
 3164   ins_pipe(pipe_slow);
 3165 %}
 3166 
 3167 instruct subF_imm(regF dst, immF con) %{
 3168   predicate((UseSSE>=1) && (UseAVX == 0));
 3169   match(Set dst (SubF dst con));
 3170   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3171   ins_cost(150);
 3172   ins_encode %{
 3173     __ subss($dst$$XMMRegister, $constantaddress($con));
 3174   %}
 3175   ins_pipe(pipe_slow);
 3176 %}
 3177 
 3178 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3179   predicate(UseAVX > 0);
 3180   match(Set dst (SubF src1 src2));
 3181 
 3182   format %{ "vsubss  $dst, $src1, $src2" %}
 3183   ins_cost(150);
 3184   ins_encode %{
 3185     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3186   %}
 3187   ins_pipe(pipe_slow);
 3188 %}
 3189 
 3190 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3191   predicate(UseAVX > 0);
 3192   match(Set dst (SubF src1 (LoadF src2)));
 3193 
 3194   format %{ "vsubss  $dst, $src1, $src2" %}
 3195   ins_cost(150);
 3196   ins_encode %{
 3197     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3198   %}
 3199   ins_pipe(pipe_slow);
 3200 %}
 3201 
 3202 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3203   predicate(UseAVX > 0);
 3204   match(Set dst (SubF src con));
 3205 
 3206   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3207   ins_cost(150);
 3208   ins_encode %{
 3209     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3210   %}
 3211   ins_pipe(pipe_slow);
 3212 %}
 3213 
 3214 instruct subD_reg(regD dst, regD src) %{
 3215   predicate((UseSSE>=2) && (UseAVX == 0));
 3216   match(Set dst (SubD dst src));
 3217 
 3218   format %{ "subsd   $dst, $src" %}
 3219   ins_cost(150);
 3220   ins_encode %{
 3221     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3222   %}
 3223   ins_pipe(pipe_slow);
 3224 %}
 3225 
 3226 instruct subD_mem(regD dst, memory src) %{
 3227   predicate((UseSSE>=2) && (UseAVX == 0));
 3228   match(Set dst (SubD dst (LoadD src)));
 3229 
 3230   format %{ "subsd   $dst, $src" %}
 3231   ins_cost(150);
 3232   ins_encode %{
 3233     __ subsd($dst$$XMMRegister, $src$$Address);
 3234   %}
 3235   ins_pipe(pipe_slow);
 3236 %}
 3237 
 3238 instruct subD_imm(regD dst, immD con) %{
 3239   predicate((UseSSE>=2) && (UseAVX == 0));
 3240   match(Set dst (SubD dst con));
 3241   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3242   ins_cost(150);
 3243   ins_encode %{
 3244     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3245   %}
 3246   ins_pipe(pipe_slow);
 3247 %}
 3248 
 3249 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3250   predicate(UseAVX > 0);
 3251   match(Set dst (SubD src1 src2));
 3252 
 3253   format %{ "vsubsd  $dst, $src1, $src2" %}
 3254   ins_cost(150);
 3255   ins_encode %{
 3256     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3257   %}
 3258   ins_pipe(pipe_slow);
 3259 %}
 3260 
 3261 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3262   predicate(UseAVX > 0);
 3263   match(Set dst (SubD src1 (LoadD src2)));
 3264 
 3265   format %{ "vsubsd  $dst, $src1, $src2" %}
 3266   ins_cost(150);
 3267   ins_encode %{
 3268     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3269   %}
 3270   ins_pipe(pipe_slow);
 3271 %}
 3272 
 3273 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3274   predicate(UseAVX > 0);
 3275   match(Set dst (SubD src con));
 3276 
 3277   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3278   ins_cost(150);
 3279   ins_encode %{
 3280     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3281   %}
 3282   ins_pipe(pipe_slow);
 3283 %}
 3284 
 3285 instruct mulF_reg(regF dst, regF src) %{
 3286   predicate((UseSSE>=1) && (UseAVX == 0));
 3287   match(Set dst (MulF dst src));
 3288 
 3289   format %{ "mulss   $dst, $src" %}
 3290   ins_cost(150);
 3291   ins_encode %{
 3292     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3293   %}
 3294   ins_pipe(pipe_slow);
 3295 %}
 3296 
 3297 instruct mulF_mem(regF dst, memory src) %{
 3298   predicate((UseSSE>=1) && (UseAVX == 0));
 3299   match(Set dst (MulF dst (LoadF src)));
 3300 
 3301   format %{ "mulss   $dst, $src" %}
 3302   ins_cost(150);
 3303   ins_encode %{
 3304     __ mulss($dst$$XMMRegister, $src$$Address);
 3305   %}
 3306   ins_pipe(pipe_slow);
 3307 %}
 3308 
 3309 instruct mulF_imm(regF dst, immF con) %{
 3310   predicate((UseSSE>=1) && (UseAVX == 0));
 3311   match(Set dst (MulF dst con));
 3312   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3313   ins_cost(150);
 3314   ins_encode %{
 3315     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3316   %}
 3317   ins_pipe(pipe_slow);
 3318 %}
 3319 
 3320 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3321   predicate(UseAVX > 0);
 3322   match(Set dst (MulF src1 src2));
 3323 
 3324   format %{ "vmulss  $dst, $src1, $src2" %}
 3325   ins_cost(150);
 3326   ins_encode %{
 3327     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3328   %}
 3329   ins_pipe(pipe_slow);
 3330 %}
 3331 
 3332 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3333   predicate(UseAVX > 0);
 3334   match(Set dst (MulF src1 (LoadF src2)));
 3335 
 3336   format %{ "vmulss  $dst, $src1, $src2" %}
 3337   ins_cost(150);
 3338   ins_encode %{
 3339     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3340   %}
 3341   ins_pipe(pipe_slow);
 3342 %}
 3343 
 3344 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3345   predicate(UseAVX > 0);
 3346   match(Set dst (MulF src con));
 3347 
 3348   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3349   ins_cost(150);
 3350   ins_encode %{
 3351     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3352   %}
 3353   ins_pipe(pipe_slow);
 3354 %}
 3355 
 3356 instruct mulD_reg(regD dst, regD src) %{
 3357   predicate((UseSSE>=2) && (UseAVX == 0));
 3358   match(Set dst (MulD dst src));
 3359 
 3360   format %{ "mulsd   $dst, $src" %}
 3361   ins_cost(150);
 3362   ins_encode %{
 3363     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3364   %}
 3365   ins_pipe(pipe_slow);
 3366 %}
 3367 
 3368 instruct mulD_mem(regD dst, memory src) %{
 3369   predicate((UseSSE>=2) && (UseAVX == 0));
 3370   match(Set dst (MulD dst (LoadD src)));
 3371 
 3372   format %{ "mulsd   $dst, $src" %}
 3373   ins_cost(150);
 3374   ins_encode %{
 3375     __ mulsd($dst$$XMMRegister, $src$$Address);
 3376   %}
 3377   ins_pipe(pipe_slow);
 3378 %}
 3379 
 3380 instruct mulD_imm(regD dst, immD con) %{
 3381   predicate((UseSSE>=2) && (UseAVX == 0));
 3382   match(Set dst (MulD dst con));
 3383   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3384   ins_cost(150);
 3385   ins_encode %{
 3386     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3387   %}
 3388   ins_pipe(pipe_slow);
 3389 %}
 3390 
 3391 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3392   predicate(UseAVX > 0);
 3393   match(Set dst (MulD src1 src2));
 3394 
 3395   format %{ "vmulsd  $dst, $src1, $src2" %}
 3396   ins_cost(150);
 3397   ins_encode %{
 3398     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3399   %}
 3400   ins_pipe(pipe_slow);
 3401 %}
 3402 
 3403 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3404   predicate(UseAVX > 0);
 3405   match(Set dst (MulD src1 (LoadD src2)));
 3406 
 3407   format %{ "vmulsd  $dst, $src1, $src2" %}
 3408   ins_cost(150);
 3409   ins_encode %{
 3410     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3411   %}
 3412   ins_pipe(pipe_slow);
 3413 %}
 3414 
 3415 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3416   predicate(UseAVX > 0);
 3417   match(Set dst (MulD src con));
 3418 
 3419   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3420   ins_cost(150);
 3421   ins_encode %{
 3422     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3423   %}
 3424   ins_pipe(pipe_slow);
 3425 %}
 3426 
 3427 instruct divF_reg(regF dst, regF src) %{
 3428   predicate((UseSSE>=1) && (UseAVX == 0));
 3429   match(Set dst (DivF dst src));
 3430 
 3431   format %{ "divss   $dst, $src" %}
 3432   ins_cost(150);
 3433   ins_encode %{
 3434     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3435   %}
 3436   ins_pipe(pipe_slow);
 3437 %}
 3438 
 3439 instruct divF_mem(regF dst, memory src) %{
 3440   predicate((UseSSE>=1) && (UseAVX == 0));
 3441   match(Set dst (DivF dst (LoadF src)));
 3442 
 3443   format %{ "divss   $dst, $src" %}
 3444   ins_cost(150);
 3445   ins_encode %{
 3446     __ divss($dst$$XMMRegister, $src$$Address);
 3447   %}
 3448   ins_pipe(pipe_slow);
 3449 %}
 3450 
 3451 instruct divF_imm(regF dst, immF con) %{
 3452   predicate((UseSSE>=1) && (UseAVX == 0));
 3453   match(Set dst (DivF dst con));
 3454   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3455   ins_cost(150);
 3456   ins_encode %{
 3457     __ divss($dst$$XMMRegister, $constantaddress($con));
 3458   %}
 3459   ins_pipe(pipe_slow);
 3460 %}
 3461 
 3462 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3463   predicate(UseAVX > 0);
 3464   match(Set dst (DivF src1 src2));
 3465 
 3466   format %{ "vdivss  $dst, $src1, $src2" %}
 3467   ins_cost(150);
 3468   ins_encode %{
 3469     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3470   %}
 3471   ins_pipe(pipe_slow);
 3472 %}
 3473 
 3474 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3475   predicate(UseAVX > 0);
 3476   match(Set dst (DivF src1 (LoadF src2)));
 3477 
 3478   format %{ "vdivss  $dst, $src1, $src2" %}
 3479   ins_cost(150);
 3480   ins_encode %{
 3481     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3482   %}
 3483   ins_pipe(pipe_slow);
 3484 %}
 3485 
 3486 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3487   predicate(UseAVX > 0);
 3488   match(Set dst (DivF src con));
 3489 
 3490   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3491   ins_cost(150);
 3492   ins_encode %{
 3493     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3494   %}
 3495   ins_pipe(pipe_slow);
 3496 %}
 3497 
 3498 instruct divD_reg(regD dst, regD src) %{
 3499   predicate((UseSSE>=2) && (UseAVX == 0));
 3500   match(Set dst (DivD dst src));
 3501 
 3502   format %{ "divsd   $dst, $src" %}
 3503   ins_cost(150);
 3504   ins_encode %{
 3505     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3506   %}
 3507   ins_pipe(pipe_slow);
 3508 %}
 3509 
 3510 instruct divD_mem(regD dst, memory src) %{
 3511   predicate((UseSSE>=2) && (UseAVX == 0));
 3512   match(Set dst (DivD dst (LoadD src)));
 3513 
 3514   format %{ "divsd   $dst, $src" %}
 3515   ins_cost(150);
 3516   ins_encode %{
 3517     __ divsd($dst$$XMMRegister, $src$$Address);
 3518   %}
 3519   ins_pipe(pipe_slow);
 3520 %}
 3521 
 3522 instruct divD_imm(regD dst, immD con) %{
 3523   predicate((UseSSE>=2) && (UseAVX == 0));
 3524   match(Set dst (DivD dst con));
 3525   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3526   ins_cost(150);
 3527   ins_encode %{
 3528     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3529   %}
 3530   ins_pipe(pipe_slow);
 3531 %}
 3532 
 3533 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3534   predicate(UseAVX > 0);
 3535   match(Set dst (DivD src1 src2));
 3536 
 3537   format %{ "vdivsd  $dst, $src1, $src2" %}
 3538   ins_cost(150);
 3539   ins_encode %{
 3540     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3541   %}
 3542   ins_pipe(pipe_slow);
 3543 %}
 3544 
 3545 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3546   predicate(UseAVX > 0);
 3547   match(Set dst (DivD src1 (LoadD src2)));
 3548 
 3549   format %{ "vdivsd  $dst, $src1, $src2" %}
 3550   ins_cost(150);
 3551   ins_encode %{
 3552     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3553   %}
 3554   ins_pipe(pipe_slow);
 3555 %}
 3556 
 3557 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3558   predicate(UseAVX > 0);
 3559   match(Set dst (DivD src con));
 3560 
 3561   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3562   ins_cost(150);
 3563   ins_encode %{
 3564     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3565   %}
 3566   ins_pipe(pipe_slow);
 3567 %}
 3568 
 3569 instruct absF_reg(regF dst) %{
 3570   predicate((UseSSE>=1) && (UseAVX == 0));
 3571   match(Set dst (AbsF dst));
 3572   ins_cost(150);
 3573   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3574   ins_encode %{
 3575     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3576   %}
 3577   ins_pipe(pipe_slow);
 3578 %}
 3579 
 3580 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3581   predicate(UseAVX > 0);
 3582   match(Set dst (AbsF src));
 3583   ins_cost(150);
 3584   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3585   ins_encode %{
 3586     int vlen_enc = Assembler::AVX_128bit;
 3587     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3588               ExternalAddress(float_signmask()), vlen_enc);
 3589   %}
 3590   ins_pipe(pipe_slow);
 3591 %}
 3592 
 3593 instruct absD_reg(regD dst) %{
 3594   predicate((UseSSE>=2) && (UseAVX == 0));
 3595   match(Set dst (AbsD dst));
 3596   ins_cost(150);
 3597   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3598             "# abs double by sign masking" %}
 3599   ins_encode %{
 3600     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3601   %}
 3602   ins_pipe(pipe_slow);
 3603 %}
 3604 
 3605 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3606   predicate(UseAVX > 0);
 3607   match(Set dst (AbsD src));
 3608   ins_cost(150);
 3609   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3610             "# abs double by sign masking" %}
 3611   ins_encode %{
 3612     int vlen_enc = Assembler::AVX_128bit;
 3613     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3614               ExternalAddress(double_signmask()), vlen_enc);
 3615   %}
 3616   ins_pipe(pipe_slow);
 3617 %}
 3618 
 3619 instruct negF_reg(regF dst) %{
 3620   predicate((UseSSE>=1) && (UseAVX == 0));
 3621   match(Set dst (NegF dst));
 3622   ins_cost(150);
 3623   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3624   ins_encode %{
 3625     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3626   %}
 3627   ins_pipe(pipe_slow);
 3628 %}
 3629 
 3630 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3631   predicate(UseAVX > 0);
 3632   match(Set dst (NegF src));
 3633   ins_cost(150);
 3634   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3635   ins_encode %{
 3636     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3637                  ExternalAddress(float_signflip()));
 3638   %}
 3639   ins_pipe(pipe_slow);
 3640 %}
 3641 
 3642 instruct negD_reg(regD dst) %{
 3643   predicate((UseSSE>=2) && (UseAVX == 0));
 3644   match(Set dst (NegD dst));
 3645   ins_cost(150);
 3646   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3647             "# neg double by sign flipping" %}
 3648   ins_encode %{
 3649     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3650   %}
 3651   ins_pipe(pipe_slow);
 3652 %}
 3653 
 3654 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3655   predicate(UseAVX > 0);
 3656   match(Set dst (NegD src));
 3657   ins_cost(150);
 3658   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3659             "# neg double by sign flipping" %}
 3660   ins_encode %{
 3661     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3662                  ExternalAddress(double_signflip()));
 3663   %}
 3664   ins_pipe(pipe_slow);
 3665 %}
 3666 
// The sqrtss instruction needs the destination register to be pre-initialized for best performance.
// Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3669 instruct sqrtF_reg(regF dst) %{
 3670   predicate(UseSSE>=1);
 3671   match(Set dst (SqrtF dst));
 3672   format %{ "sqrtss  $dst, $dst" %}
 3673   ins_encode %{
 3674     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3675   %}
 3676   ins_pipe(pipe_slow);
 3677 %}
 3678 
// The sqrtsd instruction needs the destination register to be pre-initialized for best performance.
// Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3681 instruct sqrtD_reg(regD dst) %{
 3682   predicate(UseSSE>=2);
 3683   match(Set dst (SqrtD dst));
 3684   format %{ "sqrtsd  $dst, $dst" %}
 3685   ins_encode %{
 3686     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3687   %}
 3688   ins_pipe(pipe_slow);
 3689 %}
 3690 
 3691 
 3692 // ---------------------------------------- VectorReinterpret ------------------------------------
 3693 instruct reinterpret_mask(kReg dst) %{
 3694   predicate(n->bottom_type()->isa_vectmask() &&
 3695             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3696   match(Set dst (VectorReinterpret dst));
 3697   ins_cost(125);
 3698   format %{ "vector_reinterpret $dst\t!" %}
 3699   ins_encode %{
 3700     // empty
 3701   %}
 3702   ins_pipe( pipe_slow );
 3703 %}
 3704 
 3705 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3706   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3707             n->bottom_type()->isa_vectmask() &&
 3708             n->in(1)->bottom_type()->isa_vectmask() &&
 3709             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
 3710             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3711   match(Set dst (VectorReinterpret src));
 3712   effect(TEMP xtmp);
 3713   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3714   ins_encode %{
 3715      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
 3716      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3717      assert(src_sz == dst_sz , "src and dst size mismatch");
 3718      int vlen_enc = vector_length_encoding(src_sz);
 3719      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3720      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3721   %}
 3722   ins_pipe( pipe_slow );
 3723 %}
 3724 
 3725 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3726   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3727             n->bottom_type()->isa_vectmask() &&
 3728             n->in(1)->bottom_type()->isa_vectmask() &&
 3729             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3730              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
 3731             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3732   match(Set dst (VectorReinterpret src));
 3733   effect(TEMP xtmp);
 3734   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3735   ins_encode %{
 3736      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
 3737      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3738      assert(src_sz == dst_sz , "src and dst size mismatch");
 3739      int vlen_enc = vector_length_encoding(src_sz);
 3740      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3741      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3742   %}
 3743   ins_pipe( pipe_slow );
 3744 %}
 3745 
 3746 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3747   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3748             n->bottom_type()->isa_vectmask() &&
 3749             n->in(1)->bottom_type()->isa_vectmask() &&
 3750             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3751              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
 3752             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
 3753   match(Set dst (VectorReinterpret src));
 3754   effect(TEMP xtmp);
 3755   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3756   ins_encode %{
 3757      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
 3758      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
 3759      assert(src_sz == dst_sz , "src and dst size mismatch");
 3760      int vlen_enc = vector_length_encoding(src_sz);
 3761      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
 3762      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3763   %}
 3764   ins_pipe( pipe_slow );
 3765 %}
 3766 
 3767 instruct reinterpret(vec dst) %{
 3768   predicate(!n->bottom_type()->isa_vectmask() &&
 3769             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3770   match(Set dst (VectorReinterpret dst));
 3771   ins_cost(125);
 3772   format %{ "vector_reinterpret $dst\t!" %}
 3773   ins_encode %{
 3774     // empty
 3775   %}
 3776   ins_pipe( pipe_slow );
 3777 %}
 3778 
 3779 instruct reinterpret_expand(vec dst, vec src) %{
 3780   predicate(UseAVX == 0 &&
 3781             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3782   match(Set dst (VectorReinterpret src));
 3783   ins_cost(125);
 3784   effect(TEMP dst);
 3785   format %{ "vector_reinterpret_expand $dst,$src" %}
 3786   ins_encode %{
 3787     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3788     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3789 
 3790     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3791     if (src_vlen_in_bytes == 4) {
 3792       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), noreg);
 3793     } else {
 3794       assert(src_vlen_in_bytes == 8, "");
 3795       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), noreg);
 3796     }
 3797     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3798   %}
 3799   ins_pipe( pipe_slow );
 3800 %}
 3801 
 3802 instruct vreinterpret_expand4(legVec dst, vec src) %{
 3803   predicate(UseAVX > 0 &&
 3804             !n->bottom_type()->isa_vectmask() &&
 3805             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3806             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3807   match(Set dst (VectorReinterpret src));
 3808   ins_cost(125);
 3809   format %{ "vector_reinterpret_expand $dst,$src" %}
 3810   ins_encode %{
 3811     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, noreg);
 3812   %}
 3813   ins_pipe( pipe_slow );
 3814 %}
 3815 
 3816 
 3817 instruct vreinterpret_expand(legVec dst, vec src) %{
 3818   predicate(UseAVX > 0 &&
 3819             !n->bottom_type()->isa_vectmask() &&
 3820             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3821             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3822   match(Set dst (VectorReinterpret src));
 3823   ins_cost(125);
 3824   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3825   ins_encode %{
 3826     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3827       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3828       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3829       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3830       default: ShouldNotReachHere();
 3831     }
 3832   %}
 3833   ins_pipe( pipe_slow );
 3834 %}
 3835 
 3836 instruct reinterpret_shrink(vec dst, legVec src) %{
 3837   predicate(!n->bottom_type()->isa_vectmask() &&
 3838             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3839   match(Set dst (VectorReinterpret src));
 3840   ins_cost(125);
 3841   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3842   ins_encode %{
 3843     switch (Matcher::vector_length_in_bytes(this)) {
 3844       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3845       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3846       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3847       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3848       default: ShouldNotReachHere();
 3849     }
 3850   %}
 3851   ins_pipe( pipe_slow );
 3852 %}
 3853 
 3854 // ----------------------------------------------------------------------------------------------------
 3855 
 3856 #ifdef _LP64
 3857 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3858   match(Set dst (RoundDoubleMode src rmode));
 3859   format %{ "roundsd $dst,$src" %}
 3860   ins_cost(150);
 3861   ins_encode %{
 3862     assert(UseSSE >= 4, "required");
 3863     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3864   %}
 3865   ins_pipe(pipe_slow);
 3866 %}
 3867 
 3868 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3869   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3870   format %{ "roundsd $dst,$src" %}
 3871   ins_cost(150);
 3872   ins_encode %{
 3873     assert(UseSSE >= 4, "required");
 3874     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3875   %}
 3876   ins_pipe(pipe_slow);
 3877 %}
 3878 
 3879 instruct roundD_imm(legRegD dst, immD con, immU8 rmode) %{
 3880   match(Set dst (RoundDoubleMode con rmode));
 3881   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3882   ins_cost(150);
 3883   ins_encode %{
 3884     assert(UseSSE >= 4, "required");
 3885     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, noreg);
 3886   %}
 3887   ins_pipe(pipe_slow);
 3888 %}
 3889 
 3890 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3891   predicate(Matcher::vector_length(n) < 8);
 3892   match(Set dst (RoundDoubleModeV src rmode));
 3893   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3894   ins_encode %{
 3895     assert(UseAVX > 0, "required");
 3896     int vlen_enc = vector_length_encoding(this);
 3897     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3898   %}
 3899   ins_pipe( pipe_slow );
 3900 %}
 3901 
 3902 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3903   predicate(Matcher::vector_length(n) == 8);
 3904   match(Set dst (RoundDoubleModeV src rmode));
 3905   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3906   ins_encode %{
 3907     assert(UseAVX > 2, "required");
 3908     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3909   %}
 3910   ins_pipe( pipe_slow );
 3911 %}
 3912 
 3913 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3914   predicate(Matcher::vector_length(n) < 8);
 3915   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3916   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3917   ins_encode %{
 3918     assert(UseAVX > 0, "required");
 3919     int vlen_enc = vector_length_encoding(this);
 3920     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3921   %}
 3922   ins_pipe( pipe_slow );
 3923 %}
 3924 
 3925 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3926   predicate(Matcher::vector_length(n) == 8);
 3927   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3928   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3929   ins_encode %{
 3930     assert(UseAVX > 2, "required");
 3931     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3932   %}
 3933   ins_pipe( pipe_slow );
 3934 %}
 3935 #endif // _LP64
 3936 
 3937 instruct onspinwait() %{
 3938   match(OnSpinWait);
 3939   ins_cost(200);
 3940 
 3941   format %{
 3942     $$template
 3943     $$emit$$"pause\t! membar_onspinwait"
 3944   %}
 3945   ins_encode %{
 3946     __ pause();
 3947   %}
 3948   ins_pipe(pipe_slow);
 3949 %}
 3950 
 3951 // a * b + c
 3952 instruct fmaD_reg(regD a, regD b, regD c) %{
 3953   predicate(UseFMA);
 3954   match(Set c (FmaD  c (Binary a b)));
 3955   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3956   ins_cost(150);
 3957   ins_encode %{
 3958     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3959   %}
 3960   ins_pipe( pipe_slow );
 3961 %}
 3962 
 3963 // a * b + c
 3964 instruct fmaF_reg(regF a, regF b, regF c) %{
 3965   predicate(UseFMA);
 3966   match(Set c (FmaF  c (Binary a b)));
 3967   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3968   ins_cost(150);
 3969   ins_encode %{
 3970     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3971   %}
 3972   ins_pipe( pipe_slow );
 3973 %}
 3974 
 3975 // ====================VECTOR INSTRUCTIONS=====================================
 3976 
 3977 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3978 instruct MoveVec2Leg(legVec dst, vec src) %{
 3979   match(Set dst src);
 3980   format %{ "" %}
 3981   ins_encode %{
 3982     ShouldNotReachHere();
 3983   %}
 3984   ins_pipe( fpu_reg_reg );
 3985 %}
 3986 
 3987 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3988   match(Set dst src);
 3989   format %{ "" %}
 3990   ins_encode %{
 3991     ShouldNotReachHere();
 3992   %}
 3993   ins_pipe( fpu_reg_reg );
 3994 %}
 3995 
 3996 // ============================================================================
 3997 
// Load vector (generic operand pattern)
 3999 instruct loadV(vec dst, memory mem) %{
 4000   match(Set dst (LoadVector mem));
 4001   ins_cost(125);
 4002   format %{ "load_vector $dst,$mem" %}
 4003   ins_encode %{
 4004     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 4005   %}
 4006   ins_pipe( pipe_slow );
 4007 %}
 4008 
// Store vector (generic operand pattern).
 4010 instruct storeV(memory mem, vec src) %{
 4011   match(Set mem (StoreVector mem src));
 4012   ins_cost(145);
 4013   format %{ "store_vector $mem,$src\n\t" %}
 4014   ins_encode %{
 4015     switch (Matcher::vector_length_in_bytes(this, $src)) {
 4016       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 4017       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 4018       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 4019       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 4020       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 4021       default: ShouldNotReachHere();
 4022     }
 4023   %}
 4024   ins_pipe( pipe_slow );
 4025 %}
 4026 
 4027 // ---------------------------------------- Gather ------------------------------------
 4028 
 4029 // Gather INT, LONG, FLOAT, DOUBLE
 4030 
 4031 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 4032   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 4033   match(Set dst (LoadVectorGather mem idx));
 4034   effect(TEMP dst, TEMP tmp, TEMP mask);
 4035   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 4036   ins_encode %{
 4037     assert(UseAVX >= 2, "sanity");
 4038 
 4039     int vlen_enc = vector_length_encoding(this);
 4040     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4041 
 4042     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 4043     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4044 
 4045     if (vlen_enc == Assembler::AVX_128bit) {
 4046       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4047     } else {
 4048       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4049     }
 4050     __ lea($tmp$$Register, $mem$$Address);
 4051     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 4052   %}
 4053   ins_pipe( pipe_slow );
 4054 %}
 4055 
 4056 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 4057   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 4058   match(Set dst (LoadVectorGather mem idx));
 4059   effect(TEMP dst, TEMP tmp, TEMP ktmp);
 4060   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
 4061   ins_encode %{
 4062     assert(UseAVX > 2, "sanity");
 4063 
 4064     int vlen_enc = vector_length_encoding(this);
 4065     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4066 
 4067     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4068 
 4069     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4070     __ lea($tmp$$Register, $mem$$Address);
 4071     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4072   %}
 4073   ins_pipe( pipe_slow );
 4074 %}
 4075 
 4076 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4077   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 4078   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
 4079   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
 4080   ins_encode %{
 4081     assert(UseAVX > 2, "sanity");
 4082     int vlen_enc = vector_length_encoding(this);
 4083     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4084     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, the mask operand is copied to a temporary opmask register.
 4087     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4088     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4089     __ lea($tmp$$Register, $mem$$Address);
 4090     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4091   %}
 4092   ins_pipe( pipe_slow );
 4093 %}

// ====================Scatter=======================================
 4095 
 4096 // Scatter INT, LONG, FLOAT, DOUBLE
 4097 
 4098 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4099   predicate(UseAVX > 2);
 4100   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4101   effect(TEMP tmp, TEMP ktmp);
 4102   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
 4103   ins_encode %{
 4104     int vlen_enc = vector_length_encoding(this, $src);
 4105     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4106 
 4107     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4108     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4109 
 4110     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), noreg);
 4111     __ lea($tmp$$Register, $mem$$Address);
 4112     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4113   %}
 4114   ins_pipe( pipe_slow );
 4115 %}
 4116 
 4117 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4118   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4119   effect(TEMP tmp, TEMP ktmp);
 4120   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
 4121   ins_encode %{
 4122     int vlen_enc = vector_length_encoding(this, $src);
 4123     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4124     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4125     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, the mask operand is copied to a temporary opmask register.
 4128     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4129     __ lea($tmp$$Register, $mem$$Address);
 4130     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4131   %}
 4132   ins_pipe( pipe_slow );
 4133 %}
 4134 
 4135 // ====================REPLICATE=======================================
 4136 
// Replicate a byte scalar across a vector
 4138 instruct vReplB_reg(vec dst, rRegI src) %{
 4139   predicate(UseAVX >= 2);
 4140   match(Set dst (ReplicateB src));
 4141   format %{ "replicateB $dst,$src" %}
 4142   ins_encode %{
 4143     uint vlen = Matcher::vector_length(this);
 4144     int vlen_enc = vector_length_encoding(this);
 4145     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4146       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4147       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4148     } else {
 4149       __ movdl($dst$$XMMRegister, $src$$Register);
 4150       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4151     }
 4152   %}
 4153   ins_pipe( pipe_slow );
 4154 %}
 4155 
 4156 instruct ReplB_reg(vec dst, rRegI src) %{
 4157   predicate(UseAVX < 2);
 4158   match(Set dst (ReplicateB src));
 4159   format %{ "replicateB $dst,$src" %}
 4160   ins_encode %{
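    // SSE-only path: smear the byte across the low 8 bytes (movdl/punpcklbw/pshuflw),
    // then mirror it into the high 8 bytes for 16-byte vectors.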
 4161     uint vlen = Matcher::vector_length(this);
 4162     __ movdl($dst$$XMMRegister, $src$$Register);
 4163     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4164     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4165     if (vlen >= 16) {
 4166       assert(vlen == 16, "");
 4167       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4168     }
 4169   %}
 4170   ins_pipe( pipe_slow );
 4171 %}
 4172 
 4173 instruct ReplB_mem(vec dst, memory mem) %{
 4174   predicate(UseAVX >= 2);
 4175   match(Set dst (ReplicateB (LoadB mem)));
 4176   format %{ "replicateB $dst,$mem" %}
 4177   ins_encode %{
 4178     int vlen_enc = vector_length_encoding(this);
 4179     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4180   %}
 4181   ins_pipe( pipe_slow );
 4182 %}
 4183 
 4184 // ====================ReplicateS=======================================
 4185 
 4186 instruct vReplS_reg(vec dst, rRegI src) %{
 4187   predicate(UseAVX >= 2);
 4188   match(Set dst (ReplicateS src));
 4189   format %{ "replicateS $dst,$src" %}
 4190   ins_encode %{
 4191     uint vlen = Matcher::vector_length(this);
 4192     int vlen_enc = vector_length_encoding(this);
 4193     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4194       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4195       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4196     } else {
 4197       __ movdl($dst$$XMMRegister, $src$$Register);
 4198       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4199     }
 4200   %}
 4201   ins_pipe( pipe_slow );
 4202 %}
 4203 
 4204 instruct ReplS_reg(vec dst, rRegI src) %{
 4205   predicate(UseAVX < 2);
 4206   match(Set dst (ReplicateS src));
 4207   format %{ "replicateS $dst,$src" %}
 4208   ins_encode %{
 4209     uint vlen = Matcher::vector_length(this);
 4210     int vlen_enc = vector_length_encoding(this);
 4211     __ movdl($dst$$XMMRegister, $src$$Register);
 4212     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4213     if (vlen >= 8) {
 4214       assert(vlen == 8, "");
 4215       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4216     }
 4217   %}
 4218   ins_pipe( pipe_slow );
 4219 %}
 4220 
 4221 instruct ReplS_mem(vec dst, memory mem) %{
 4222   predicate(UseAVX >= 2);
 4223   match(Set dst (ReplicateS (LoadS mem)));
 4224   format %{ "replicateS $dst,$mem" %}
 4225   ins_encode %{
 4226     int vlen_enc = vector_length_encoding(this);
 4227     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4228   %}
 4229   ins_pipe( pipe_slow );
 4230 %}
 4231 
 4232 // ====================ReplicateI=======================================
 4233 
 4234 instruct ReplI_reg(vec dst, rRegI src) %{
 4235   match(Set dst (ReplicateI src));
 4236   format %{ "replicateI $dst,$src" %}
 4237   ins_encode %{
 4238     uint vlen = Matcher::vector_length(this);
 4239     int vlen_enc = vector_length_encoding(this);
 4240     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4241       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4242     } else if (VM_Version::supports_avx2()) {
 4243       __ movdl($dst$$XMMRegister, $src$$Register);
 4244       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4245     } else {
 4246       __ movdl($dst$$XMMRegister, $src$$Register);
 4247       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4248     }
 4249   %}
 4250   ins_pipe( pipe_slow );
 4251 %}
 4252 
 4253 instruct ReplI_mem(vec dst, memory mem) %{
 4254   match(Set dst (ReplicateI (LoadI mem)));
 4255   format %{ "replicateI $dst,$mem" %}
 4256   ins_encode %{
 4257     int vlen_enc = vector_length_encoding(this);
 4258     if (VM_Version::supports_avx2()) {
 4259       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4260     } else if (VM_Version::supports_avx()) {
 4261       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4262     } else {
 4263       __ movdl($dst$$XMMRegister, $mem$$Address);
 4264       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4265     }
 4266   %}
 4267   ins_pipe( pipe_slow );
 4268 %}
 4269 
 4270 instruct ReplI_imm(vec dst, immI con) %{
 4271   match(Set dst (ReplicateB con));
 4272   match(Set dst (ReplicateS con));
 4273   match(Set dst (ReplicateI con));
 4274   format %{ "replicateI $dst,$con" %}
 4275   ins_encode %{
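    // Replication count for the constant table entry: a 4-byte entry is enough when AVX
    // can broadcast from memory; otherwise an 8-byte entry is emitted for the non-AVX load path.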
 4276     InternalAddress addr = $constantaddress(Matcher::vector_element_basic_type(this),
 4277         vreplicate_imm(Matcher::vector_element_basic_type(this), $con$$constant,
 4278             (VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 4 : 8) : 8) /
 4279                 type2aelembytes(Matcher::vector_element_basic_type(this))));
 4280     BasicType bt = Matcher::vector_element_basic_type(this);
 4281     int vlen = Matcher::vector_length_in_bytes(this);
 4282     __ load_constant_vector(bt, $dst$$XMMRegister, addr, vlen);
 4283   %}
 4284   ins_pipe( pipe_slow );
 4285 %}
 4286 
// Replicate scalar zero across a vector
 4288 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4289   match(Set dst (ReplicateB zero));
 4290   match(Set dst (ReplicateS zero));
 4291   match(Set dst (ReplicateI zero));
 4292   format %{ "replicateI $dst,$zero" %}
 4293   ins_encode %{
 4294     int vlen_enc = vector_length_encoding(this);
 4295     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4296       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4297     } else {
 4298       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4299     }
 4300   %}
 4301   ins_pipe( fpu_reg_reg );
 4302 %}
 4303 
 4304 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4305   predicate(UseSSE >= 2);
 4306   match(Set dst (ReplicateB con));
 4307   match(Set dst (ReplicateS con));
 4308   match(Set dst (ReplicateI con));
 4309   format %{ "vallones $dst" %}
 4310   ins_encode %{
 4311     int vector_len = vector_length_encoding(this);
 4312     __ vallones($dst$$XMMRegister, vector_len);
 4313   %}
 4314   ins_pipe( pipe_slow );
 4315 %}
 4316 
 4317 // ====================ReplicateL=======================================
 4318 
 4319 #ifdef _LP64
// Replicate a long (8-byte) scalar across a vector
 4321 instruct ReplL_reg(vec dst, rRegL src) %{
 4322   match(Set dst (ReplicateL src));
 4323   format %{ "replicateL $dst,$src" %}
 4324   ins_encode %{
 4325     int vlen = Matcher::vector_length(this);
 4326     int vlen_enc = vector_length_encoding(this);
 4327     if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4328       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4329     } else if (VM_Version::supports_avx2()) {
 4330       __ movdq($dst$$XMMRegister, $src$$Register);
 4331       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4332     } else {
 4333       __ movdq($dst$$XMMRegister, $src$$Register);
 4334       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4335     }
 4336   %}
 4337   ins_pipe( pipe_slow );
 4338 %}
 4339 #else // _LP64
// Replicate a long (8-byte) scalar across a vector
 4341 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4342   predicate(Matcher::vector_length(n) <= 4);
 4343   match(Set dst (ReplicateL src));
 4344   effect(TEMP dst, USE src, TEMP tmp);
 4345   format %{ "replicateL $dst,$src" %}
 4346   ins_encode %{
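    // On 32-bit builds the long arrives as a register pair, so the low and high halves
    // are reassembled (movdl/punpckldq) before broadcasting.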
 4347     uint vlen = Matcher::vector_length(this);
 4348     if (vlen == 2) {
 4349       __ movdl($dst$$XMMRegister, $src$$Register);
 4350       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4351       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4352       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4353     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4354       int vlen_enc = Assembler::AVX_256bit;
 4355       __ movdl($dst$$XMMRegister, $src$$Register);
 4356       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4357       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4358       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4359     } else {
 4360       __ movdl($dst$$XMMRegister, $src$$Register);
 4361       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4362       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4363       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4364       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4365     }
 4366   %}
 4367   ins_pipe( pipe_slow );
 4368 %}
 4369 
 4370 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4371   predicate(Matcher::vector_length(n) == 8);
 4372   match(Set dst (ReplicateL src));
 4373   effect(TEMP dst, USE src, TEMP tmp);
 4374   format %{ "replicateL $dst,$src" %}
 4375   ins_encode %{
 4376     if (VM_Version::supports_avx512vl()) {
 4377       __ movdl($dst$$XMMRegister, $src$$Register);
 4378       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4379       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4380       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4381       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4382       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4383     } else {
 4384       int vlen_enc = Assembler::AVX_512bit;
 4385       __ movdl($dst$$XMMRegister, $src$$Register);
 4386       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4387       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4388       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4389     }
 4390   %}
 4391   ins_pipe( pipe_slow );
 4392 %}
 4393 #endif // _LP64
 4394 
 4395 instruct ReplL_mem(vec dst, memory mem) %{
 4396   match(Set dst (ReplicateL (LoadL mem)));
 4397   format %{ "replicateL $dst,$mem" %}
 4398   ins_encode %{
 4399     int vlen_enc = vector_length_encoding(this);
 4400     if (VM_Version::supports_avx2()) {
 4401       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4402     } else if (VM_Version::supports_sse3()) {
 4403       __ movddup($dst$$XMMRegister, $mem$$Address);
 4404     } else {
 4405       __ movq($dst$$XMMRegister, $mem$$Address);
 4406       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4407     }
 4408   %}
 4409   ins_pipe( pipe_slow );
 4410 %}
 4411 
// Replicate a long (8-byte) scalar immediate across a vector by loading it from the constant table.
 4413 instruct ReplL_imm(vec dst, immL con) %{
 4414   match(Set dst (ReplicateL con));
 4415   format %{ "replicateL $dst,$con" %}
 4416   ins_encode %{
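    // A single copy suffices: one long already forms an 8-byte constant table entry.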
 4417     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, 1));
 4418     int vlen = Matcher::vector_length_in_bytes(this);
 4419     __ load_constant_vector(T_LONG, $dst$$XMMRegister, addr, vlen);
 4420   %}
 4421   ins_pipe( pipe_slow );
 4422 %}
 4423 
 4424 instruct ReplL_zero(vec dst, immL0 zero) %{
 4425   match(Set dst (ReplicateL zero));
 4426   format %{ "replicateL $dst,$zero" %}
 4427   ins_encode %{
 4428     int vlen_enc = vector_length_encoding(this);
 4429     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
 4430       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4431     } else {
 4432       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4433     }
 4434   %}
 4435   ins_pipe( fpu_reg_reg );
 4436 %}
 4437 
 4438 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4439   predicate(UseSSE >= 2);
 4440   match(Set dst (ReplicateL con));
 4441   format %{ "vallones $dst" %}
 4442   ins_encode %{
 4443     int vector_len = vector_length_encoding(this);
 4444     __ vallones($dst$$XMMRegister, vector_len);
 4445   %}
 4446   ins_pipe( pipe_slow );
 4447 %}
 4448 
 4449 // ====================ReplicateF=======================================
 4450 
 4451 instruct vReplF_reg(vec dst, vlRegF src) %{
 4452   predicate(UseAVX > 0);
 4453   match(Set dst (ReplicateF src));
 4454   format %{ "replicateF $dst,$src" %}
 4455   ins_encode %{
 4456     uint vlen = Matcher::vector_length(this);
 4457     int vlen_enc = vector_length_encoding(this);
 4458     if (vlen <= 4) {
 4459       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4460     } else if (VM_Version::supports_avx2()) {
 4461       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4462     } else {
 4463       assert(vlen == 8, "sanity");
 4464       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 0x00, Assembler::AVX_128bit);
 4465       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4466     }
 4467   %}
 4468   ins_pipe( pipe_slow );
 4469 %}
 4470 
 4471 instruct ReplF_reg(vec dst, vlRegF src) %{
 4472   predicate(UseAVX == 0);
 4473   match(Set dst (ReplicateF src));
 4474   format %{ "replicateF $dst,$src" %}
 4475   ins_encode %{
 4476     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4477   %}
 4478   ins_pipe( pipe_slow );
 4479 %}
 4480 
 4481 instruct ReplF_mem(vec dst, memory mem) %{
 4482   predicate(UseAVX > 0);
 4483   match(Set dst (ReplicateF (LoadF mem)));
 4484   format %{ "replicateF $dst,$mem" %}
 4485   ins_encode %{
 4486     int vlen_enc = vector_length_encoding(this);
 4487     __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4488   %}
 4489   ins_pipe( pipe_slow );
 4490 %}
 4491 
// Replicate a float scalar immediate across a vector by loading it from the constant table.
 4493 instruct ReplF_imm(vec dst, immF con) %{
 4494   match(Set dst (ReplicateF con));
 4495   format %{ "replicateF $dst,$con" %}
 4496   ins_encode %{
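    // One 4-byte copy of the constant is enough when AVX can broadcast from memory;
    // otherwise two copies form an 8-byte entry for the non-AVX load path.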
 4497     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant,
 4498         VM_Version::supports_sse3() ? (VM_Version::supports_avx() ? 1 : 2) : 2));
 4499     int vlen = Matcher::vector_length_in_bytes(this);
 4500     __ load_constant_vector(T_FLOAT, $dst$$XMMRegister, addr, vlen);
 4501   %}
 4502   ins_pipe( pipe_slow );
 4503 %}
 4504 
 4505 instruct ReplF_zero(vec dst, immF0 zero) %{
 4506   match(Set dst (ReplicateF zero));
 4507   format %{ "replicateF $dst,$zero" %}
 4508   ins_encode %{
 4509     int vlen_enc = vector_length_encoding(this);
 4510     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4511       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4512     } else {
 4513       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4514     }
 4515   %}
 4516   ins_pipe( fpu_reg_reg );
 4517 %}
 4518 
 4519 // ====================ReplicateD=======================================
 4520 
// Replicate a double (8-byte) scalar across a vector
 4522 instruct vReplD_reg(vec dst, vlRegD src) %{
 4523   predicate(UseSSE >= 3);
 4524   match(Set dst (ReplicateD src));
 4525   format %{ "replicateD $dst,$src" %}
 4526   ins_encode %{
 4527     uint vlen = Matcher::vector_length(this);
 4528     int vlen_enc = vector_length_encoding(this);
 4529     if (vlen <= 2) {
 4530       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4531     } else if (VM_Version::supports_avx2()) {
 4532       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4533     } else {
 4534       assert(vlen == 4, "sanity");
 4535       __ movddup($dst$$XMMRegister, $src$$XMMRegister);
 4536       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4537     }
 4538   %}
 4539   ins_pipe( pipe_slow );
 4540 %}
 4541 
 4542 instruct ReplD_reg(vec dst, vlRegD src) %{
 4543   predicate(UseSSE < 3);
 4544   match(Set dst (ReplicateD src));
 4545   format %{ "replicateD $dst,$src" %}
 4546   ins_encode %{
 4547     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4548   %}
 4549   ins_pipe( pipe_slow );
 4550 %}
 4551 
 4552 instruct ReplD_mem(vec dst, memory mem) %{
 4553   predicate(UseSSE >= 3);
 4554   match(Set dst (ReplicateD (LoadD mem)));
 4555   format %{ "replicateD $dst,$mem" %}
 4556   ins_encode %{
 4557     if (Matcher::vector_length(this) >= 4) {
 4558       int vlen_enc = vector_length_encoding(this);
 4559       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4560     } else {
 4561       __ movddup($dst$$XMMRegister, $mem$$Address);
 4562     }
 4563   %}
 4564   ins_pipe( pipe_slow );
 4565 %}
 4566 
// Replicate a double (8-byte) scalar immediate across a vector by loading it from the constant table.
 4568 instruct ReplD_imm(vec dst, immD con) %{
 4569   match(Set dst (ReplicateD con));
 4570   format %{ "replicateD $dst,$con" %}
 4571   ins_encode %{
 4572     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, 1));
 4573     int vlen = Matcher::vector_length_in_bytes(this);
 4574     __ load_constant_vector(T_DOUBLE, $dst$$XMMRegister, addr, vlen);
 4575   %}
 4576   ins_pipe( pipe_slow );
 4577 %}
 4578 
 4579 instruct ReplD_zero(vec dst, immD0 zero) %{
 4580   match(Set dst (ReplicateD zero));
 4581   format %{ "replicateD $dst,$zero" %}
 4582   ins_encode %{
 4583     int vlen_enc = vector_length_encoding(this);
 4584     if (VM_Version::supports_evex() && !VM_Version::supports_avx512vldq()) {
 4585       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4586     } else {
 4587       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4588     }
 4589   %}
 4590   ins_pipe( fpu_reg_reg );
 4591 %}
 4592 
 4593 // ====================VECTOR INSERT=======================================
 4594 
 4595 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4596   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4597   match(Set dst (VectorInsert (Binary dst val) idx));
 4598   format %{ "vector_insert $dst,$val,$idx" %}
 4599   ins_encode %{
 4600     assert(UseSSE >= 4, "required");
 4601     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4602 
 4603     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4604 
 4605     assert(is_integral_type(elem_bt), "");
 4606     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4607 
 4608     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4609   %}
 4610   ins_pipe( pipe_slow );
 4611 %}
 4612 
 4613 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4614   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4615   match(Set dst (VectorInsert (Binary src val) idx));
 4616   effect(TEMP vtmp);
 4617   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4618   ins_encode %{
 4619     int vlen_enc = Assembler::AVX_256bit;
 4620     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4621     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4622     int log2epr = log2(elem_per_lane);
 4623 
 4624     assert(is_integral_type(elem_bt), "sanity");
 4625     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
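    // Split the element index into its position within a 128-bit lane (x_idx) and the
    // lane to patch (y_idx); extract that lane, insert the value, and write the lane back.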
 4626 
 4627     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4628     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4629     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4630     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4631     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4632   %}
 4633   ins_pipe( pipe_slow );
 4634 %}
 4635 
 4636 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4637   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4638   match(Set dst (VectorInsert (Binary src val) idx));
 4639   effect(TEMP vtmp);
 4640   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4641   ins_encode %{
 4642     assert(UseAVX > 2, "sanity");
 4643 
 4644     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4645     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4646     int log2epr = log2(elem_per_lane);
 4647 
 4648     assert(is_integral_type(elem_bt), "");
 4649     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
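    // Same lane-patching scheme for 512-bit vectors; y_idx selects one of four 128-bit lanes.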
 4650 
 4651     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4652     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4653     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4654     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4655     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4656   %}
 4657   ins_pipe( pipe_slow );
 4658 %}
 4659 
 4660 #ifdef _LP64
 4661 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4662   predicate(Matcher::vector_length(n) == 2);
 4663   match(Set dst (VectorInsert (Binary dst val) idx));
 4664   format %{ "vector_insert $dst,$val,$idx" %}
 4665   ins_encode %{
 4666     assert(UseSSE >= 4, "required");
 4667     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4668     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4669 
 4670     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4671   %}
 4672   ins_pipe( pipe_slow );
 4673 %}
 4674 
 4675 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4676   predicate(Matcher::vector_length(n) == 4);
 4677   match(Set dst (VectorInsert (Binary src val) idx));
 4678   effect(TEMP vtmp);
 4679   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4680   ins_encode %{
 4681     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4682     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4683 
 4684     uint x_idx = $idx$$constant & right_n_bits(1);
 4685     uint y_idx = ($idx$$constant >> 1) & 1;
 4686     int vlen_enc = Assembler::AVX_256bit;
 4687     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4688     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4689     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4690   %}
 4691   ins_pipe( pipe_slow );
 4692 %}
 4693 
 4694 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4695   predicate(Matcher::vector_length(n) == 8);
 4696   match(Set dst (VectorInsert (Binary src val) idx));
 4697   effect(TEMP vtmp);
 4698   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4699   ins_encode %{
 4700     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4701     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4702 
 4703     uint x_idx = $idx$$constant & right_n_bits(1);
 4704     uint y_idx = ($idx$$constant >> 1) & 3;
 4705     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4706     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4707     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4708   %}
 4709   ins_pipe( pipe_slow );
 4710 %}
 4711 #endif
 4712 
 4713 instruct insertF(vec dst, regF val, immU8 idx) %{
 4714   predicate(Matcher::vector_length(n) < 8);
 4715   match(Set dst (VectorInsert (Binary dst val) idx));
 4716   format %{ "vector_insert $dst,$val,$idx" %}
 4717   ins_encode %{
 4718     assert(UseSSE >= 4, "sanity");
 4719 
 4720     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4721     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4722 
 4723     uint x_idx = $idx$$constant & right_n_bits(2);
 4724     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4725   %}
 4726   ins_pipe( pipe_slow );
 4727 %}
 4728 
 4729 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4730   predicate(Matcher::vector_length(n) >= 8);
 4731   match(Set dst (VectorInsert (Binary src val) idx));
 4732   effect(TEMP vtmp);
 4733   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4734   ins_encode %{
 4735     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4736     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4737 
 4738     int vlen = Matcher::vector_length(this);
 4739     uint x_idx = $idx$$constant & right_n_bits(2);
 4740     if (vlen == 8) {
 4741       uint y_idx = ($idx$$constant >> 2) & 1;
 4742       int vlen_enc = Assembler::AVX_256bit;
 4743       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4744       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4745       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4746     } else {
 4747       assert(vlen == 16, "sanity");
 4748       uint y_idx = ($idx$$constant >> 2) & 3;
 4749       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4750       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4751       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4752     }
 4753   %}
 4754   ins_pipe( pipe_slow );
 4755 %}
 4756 
 4757 #ifdef _LP64
 4758 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4759   predicate(Matcher::vector_length(n) == 2);
 4760   match(Set dst (VectorInsert (Binary dst val) idx));
 4761   effect(TEMP tmp);
 4762   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4763   ins_encode %{
 4764     assert(UseSSE >= 4, "sanity");
 4765     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4766     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4767 
 4768     __ movq($tmp$$Register, $val$$XMMRegister);
 4769     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4770   %}
 4771   ins_pipe( pipe_slow );
 4772 %}
 4773 
 4774 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4775   predicate(Matcher::vector_length(n) == 4);
 4776   match(Set dst (VectorInsert (Binary src val) idx));
 4777   effect(TEMP vtmp, TEMP tmp);
 4778   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4779   ins_encode %{
 4780     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4781     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4782 
 4783     uint x_idx = $idx$$constant & right_n_bits(1);
 4784     uint y_idx = ($idx$$constant >> 1) & 1;
 4785     int vlen_enc = Assembler::AVX_256bit;
 4786     __ movq($tmp$$Register, $val$$XMMRegister);
 4787     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4788     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4789     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4790   %}
 4791   ins_pipe( pipe_slow );
 4792 %}
 4793 
 4794 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
 4795   predicate(Matcher::vector_length(n) == 8);
 4796   match(Set dst (VectorInsert (Binary src val) idx));
 4797   effect(TEMP tmp, TEMP vtmp);
 4798   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4799   ins_encode %{
 4800     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4801     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4802 
 4803     uint x_idx = $idx$$constant & right_n_bits(1);
 4804     uint y_idx = ($idx$$constant >> 1) & 3;
 4805     __ movq($tmp$$Register, $val$$XMMRegister);
 4806     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4807     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4808     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4809   %}
 4810   ins_pipe( pipe_slow );
 4811 %}
 4812 #endif
 4813 
 4814 // ====================REDUCTION ARITHMETIC=======================================
 4815 
 4816 // =======================Int Reduction==========================================
 4817 
 4818 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4819   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4820   match(Set dst (AddReductionVI src1 src2));
 4821   match(Set dst (MulReductionVI src1 src2));
 4822   match(Set dst (AndReductionV  src1 src2));
 4823   match(Set dst ( OrReductionV  src1 src2));
 4824   match(Set dst (XorReductionV  src1 src2));
 4825   match(Set dst (MinReductionV  src1 src2));
 4826   match(Set dst (MaxReductionV  src1 src2));
 4827   effect(TEMP vtmp1, TEMP vtmp2);
 4828   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4829   ins_encode %{
 4830     int opcode = this->ideal_Opcode();
 4831     int vlen = Matcher::vector_length(this, $src2);
 4832     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4833   %}
 4834   ins_pipe( pipe_slow );
 4835 %}
 4836 
 4837 // =======================Long Reduction==========================================
 4838 
 4839 #ifdef _LP64
 4840 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4841   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4842   match(Set dst (AddReductionVL src1 src2));
 4843   match(Set dst (MulReductionVL src1 src2));
 4844   match(Set dst (AndReductionV  src1 src2));
 4845   match(Set dst ( OrReductionV  src1 src2));
 4846   match(Set dst (XorReductionV  src1 src2));
 4847   match(Set dst (MinReductionV  src1 src2));
 4848   match(Set dst (MaxReductionV  src1 src2));
 4849   effect(TEMP vtmp1, TEMP vtmp2);
 4850   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4851   ins_encode %{
 4852     int opcode = this->ideal_Opcode();
 4853     int vlen = Matcher::vector_length(this, $src2);
 4854     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4855   %}
 4856   ins_pipe( pipe_slow );
 4857 %}
 4858 
 4859 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4860   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4861   match(Set dst (AddReductionVL src1 src2));
 4862   match(Set dst (MulReductionVL src1 src2));
 4863   match(Set dst (AndReductionV  src1 src2));
 4864   match(Set dst ( OrReductionV  src1 src2));
 4865   match(Set dst (XorReductionV  src1 src2));
 4866   match(Set dst (MinReductionV  src1 src2));
 4867   match(Set dst (MaxReductionV  src1 src2));
 4868   effect(TEMP vtmp1, TEMP vtmp2);
 4869   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4870   ins_encode %{
 4871     int opcode = this->ideal_Opcode();
 4872     int vlen = Matcher::vector_length(this, $src2);
 4873     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4874   %}
 4875   ins_pipe( pipe_slow );
 4876 %}
 4877 #endif // _LP64
 4878 
 4879 // =======================Float Reduction==========================================
 4880 
 4881 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4882   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4883   match(Set dst (AddReductionVF dst src));
 4884   match(Set dst (MulReductionVF dst src));
 4885   effect(TEMP dst, TEMP vtmp);
 4886   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4887   ins_encode %{
 4888     int opcode = this->ideal_Opcode();
 4889     int vlen = Matcher::vector_length(this, $src);
 4890     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4891   %}
 4892   ins_pipe( pipe_slow );
 4893 %}
 4894 
 4895 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4896   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4897   match(Set dst (AddReductionVF dst src));
 4898   match(Set dst (MulReductionVF dst src));
 4899   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4900   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4901   ins_encode %{
 4902     int opcode = this->ideal_Opcode();
 4903     int vlen = Matcher::vector_length(this, $src);
 4904     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4905   %}
 4906   ins_pipe( pipe_slow );
 4907 %}
 4908 
 4909 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4910   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4911   match(Set dst (AddReductionVF dst src));
 4912   match(Set dst (MulReductionVF dst src));
 4913   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4914   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4915   ins_encode %{
 4916     int opcode = this->ideal_Opcode();
 4917     int vlen = Matcher::vector_length(this, $src);
 4918     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4919   %}
 4920   ins_pipe( pipe_slow );
 4921 %}
 4922 
 4923 // =======================Double Reduction==========================================
 4924 
 4925 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4926   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4927   match(Set dst (AddReductionVD dst src));
 4928   match(Set dst (MulReductionVD dst src));
 4929   effect(TEMP dst, TEMP vtmp);
 4930   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4931   ins_encode %{
 4932     int opcode = this->ideal_Opcode();
 4933     int vlen = Matcher::vector_length(this, $src);
 4934     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4935 %}
 4936   ins_pipe( pipe_slow );
 4937 %}
 4938 
 4939 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4940   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4941   match(Set dst (AddReductionVD dst src));
 4942   match(Set dst (MulReductionVD dst src));
 4943   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4944   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4945   ins_encode %{
 4946     int opcode = this->ideal_Opcode();
 4947     int vlen = Matcher::vector_length(this, $src);
 4948     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4949   %}
 4950   ins_pipe( pipe_slow );
 4951 %}
 4952 
 4953 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4954   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4955   match(Set dst (AddReductionVD dst src));
 4956   match(Set dst (MulReductionVD dst src));
 4957   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4958   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4959   ins_encode %{
 4960     int opcode = this->ideal_Opcode();
 4961     int vlen = Matcher::vector_length(this, $src);
 4962     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4963   %}
 4964   ins_pipe( pipe_slow );
 4965 %}
 4966 
 4967 // =======================Byte Reduction==========================================
 4968 
 4969 #ifdef _LP64
 4970 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4971   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 4972   match(Set dst (AddReductionVI src1 src2));
 4973   match(Set dst (AndReductionV  src1 src2));
 4974   match(Set dst ( OrReductionV  src1 src2));
 4975   match(Set dst (XorReductionV  src1 src2));
 4976   match(Set dst (MinReductionV  src1 src2));
 4977   match(Set dst (MaxReductionV  src1 src2));
 4978   effect(TEMP vtmp1, TEMP vtmp2);
 4979   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4980   ins_encode %{
 4981     int opcode = this->ideal_Opcode();
 4982     int vlen = Matcher::vector_length(this, $src2);
 4983     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4984   %}
 4985   ins_pipe( pipe_slow );
 4986 %}
 4987 
 4988 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 4989   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 4990   match(Set dst (AddReductionVI src1 src2));
 4991   match(Set dst (AndReductionV  src1 src2));
 4992   match(Set dst ( OrReductionV  src1 src2));
 4993   match(Set dst (XorReductionV  src1 src2));
 4994   match(Set dst (MinReductionV  src1 src2));
 4995   match(Set dst (MaxReductionV  src1 src2));
 4996   effect(TEMP vtmp1, TEMP vtmp2);
 4997   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4998   ins_encode %{
 4999     int opcode = this->ideal_Opcode();
 5000     int vlen = Matcher::vector_length(this, $src2);
 5001     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5002   %}
 5003   ins_pipe( pipe_slow );
 5004 %}
 5005 #endif
 5006 
 5007 // =======================Short Reduction==========================================
 5008 
 5009 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5010   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 5011   match(Set dst (AddReductionVI src1 src2));
 5012   match(Set dst (MulReductionVI src1 src2));
 5013   match(Set dst (AndReductionV  src1 src2));
 5014   match(Set dst ( OrReductionV  src1 src2));
 5015   match(Set dst (XorReductionV  src1 src2));
 5016   match(Set dst (MinReductionV  src1 src2));
 5017   match(Set dst (MaxReductionV  src1 src2));
 5018   effect(TEMP vtmp1, TEMP vtmp2);
 5019   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 5020   ins_encode %{
 5021     int opcode = this->ideal_Opcode();
 5022     int vlen = Matcher::vector_length(this, $src2);
 5023     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5024   %}
 5025   ins_pipe( pipe_slow );
 5026 %}
 5027 
 5028 // =======================Mul Reduction==========================================
 5029 
 5030 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 5031   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5032             Matcher::vector_length(n->in(2)) <= 32); // src2
 5033   match(Set dst (MulReductionVI src1 src2));
 5034   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5035   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5036   ins_encode %{
 5037     int opcode = this->ideal_Opcode();
 5038     int vlen = Matcher::vector_length(this, $src2);
 5039     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5040   %}
 5041   ins_pipe( pipe_slow );
 5042 %}
 5043 
 5044 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 5045   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 5046             Matcher::vector_length(n->in(2)) == 64); // src2
 5047   match(Set dst (MulReductionVI src1 src2));
 5048   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 5049   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 5050   ins_encode %{
 5051     int opcode = this->ideal_Opcode();
 5052     int vlen = Matcher::vector_length(this, $src2);
 5053     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 5054   %}
 5055   ins_pipe( pipe_slow );
 5056 %}
 5057 
 5058 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
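// The immF/immD src1 variants only match when the scalar input is the identity of the
// operation (+Inf for min, -Inf for max), so the reduction can ignore it; the *_av
// variants below accumulate into $dst instead.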
 5060 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 5061                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5062   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5063             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5064              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5065             Matcher::vector_length(n->in(2)) == 2);
 5066   match(Set dst (MinReductionV src1 src2));
 5067   match(Set dst (MaxReductionV src1 src2));
 5068   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5069   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5070   ins_encode %{
 5071     assert(UseAVX > 0, "sanity");
 5072 
 5073     int opcode = this->ideal_Opcode();
 5074     int vlen = Matcher::vector_length(this, $src2);
 5075     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5076                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5077   %}
 5078   ins_pipe( pipe_slow );
 5079 %}
 5080 
 5081 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 5082                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5083   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5084             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 5085              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5086             Matcher::vector_length(n->in(2)) >= 4);
 5087   match(Set dst (MinReductionV src1 src2));
 5088   match(Set dst (MaxReductionV src1 src2));
 5089   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5090   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5091   ins_encode %{
 5092     assert(UseAVX > 0, "sanity");
 5093 
 5094     int opcode = this->ideal_Opcode();
 5095     int vlen = Matcher::vector_length(this, $src2);
 5096     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5097                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5098   %}
 5099   ins_pipe( pipe_slow );
 5100 %}
 5101 
 5102 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5103                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5104   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5105             Matcher::vector_length(n->in(2)) == 2);
 5106   match(Set dst (MinReductionV dst src));
 5107   match(Set dst (MaxReductionV dst src));
 5108   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5109   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5110   ins_encode %{
 5111     assert(UseAVX > 0, "sanity");
 5112 
 5113     int opcode = this->ideal_Opcode();
 5114     int vlen = Matcher::vector_length(this, $src);
 5115     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5116                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5117   %}
 5118   ins_pipe( pipe_slow );
 5119 %}
 5120 
 5121 
 5122 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5123                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5124   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5125             Matcher::vector_length(n->in(2)) >= 4);
 5126   match(Set dst (MinReductionV dst src));
 5127   match(Set dst (MaxReductionV dst src));
 5128   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5129   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5130   ins_encode %{
 5131     assert(UseAVX > 0, "sanity");
 5132 
 5133     int opcode = this->ideal_Opcode();
 5134     int vlen = Matcher::vector_length(this, $src);
 5135     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5136                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5137   %}
 5138   ins_pipe( pipe_slow );
 5139 %}
 5140 
 5141 
//--------------------Min/Max Double Reduction --------------------
 5143 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5144                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5145                             rFlagsReg cr) %{
 5146   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5147             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5148              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5149             Matcher::vector_length(n->in(2)) == 2);
 5150   match(Set dst (MinReductionV src1 src2));
 5151   match(Set dst (MaxReductionV src1 src2));
 5152   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5153   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5154   ins_encode %{
 5155     assert(UseAVX > 0, "sanity");
 5156 
 5157     int opcode = this->ideal_Opcode();
 5158     int vlen = Matcher::vector_length(this, $src2);
 5159     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5160                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5161   %}
 5162   ins_pipe( pipe_slow );
 5163 %}
 5164 
 5165 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5166                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5167                            rFlagsReg cr) %{
 5168   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5169             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5170              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5171             Matcher::vector_length(n->in(2)) >= 4);
 5172   match(Set dst (MinReductionV src1 src2));
 5173   match(Set dst (MaxReductionV src1 src2));
 5174   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5175   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5176   ins_encode %{
 5177     assert(UseAVX > 0, "sanity");
 5178 
 5179     int opcode = this->ideal_Opcode();
 5180     int vlen = Matcher::vector_length(this, $src2);
 5181     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5182                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5183   %}
 5184   ins_pipe( pipe_slow );
 5185 %}
 5186 
 5187 
 5188 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5189                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5190                                rFlagsReg cr) %{
 5191   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5192             Matcher::vector_length(n->in(2)) == 2);
 5193   match(Set dst (MinReductionV dst src));
 5194   match(Set dst (MaxReductionV dst src));
 5195   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5196   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5197   ins_encode %{
 5198     assert(UseAVX > 0, "sanity");
 5199 
 5200     int opcode = this->ideal_Opcode();
 5201     int vlen = Matcher::vector_length(this, $src);
 5202     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5203                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5204   %}
 5205   ins_pipe( pipe_slow );
 5206 %}
 5207 
 5208 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5209                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5210                               rFlagsReg cr) %{
 5211   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5212             Matcher::vector_length(n->in(2)) >= 4);
 5213   match(Set dst (MinReductionV dst src));
 5214   match(Set dst (MaxReductionV dst src));
 5215   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5216   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5217   ins_encode %{
 5218     assert(UseAVX > 0, "sanity");
 5219 
 5220     int opcode = this->ideal_Opcode();
 5221     int vlen = Matcher::vector_length(this, $src);
 5222     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5223                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5224   %}
 5225   ins_pipe( pipe_slow );
 5226 %}
 5227 
 5228 // ====================VECTOR ARITHMETIC=======================================
 5229 
 5230 // --------------------------------- ADD --------------------------------------
 5231 
 5232 // Bytes vector add
 5233 instruct vaddB(vec dst, vec src) %{
 5234   predicate(UseAVX == 0);
 5235   match(Set dst (AddVB dst src));
 5236   format %{ "paddb   $dst,$src\t! add packedB" %}
 5237   ins_encode %{
 5238     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5239   %}
 5240   ins_pipe( pipe_slow );
 5241 %}
 5242 
 5243 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5244   predicate(UseAVX > 0);
 5245   match(Set dst (AddVB src1 src2));
 5246   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5247   ins_encode %{
 5248     int vlen_enc = vector_length_encoding(this);
 5249     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5250   %}
 5251   ins_pipe( pipe_slow );
 5252 %}
 5253 
 5254 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5255   predicate((UseAVX > 0) &&
 5256             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5257   match(Set dst (AddVB src (LoadVector mem)));
 5258   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5259   ins_encode %{
 5260     int vlen_enc = vector_length_encoding(this);
 5261     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5262   %}
 5263   ins_pipe( pipe_slow );
 5264 %}
 5265 
 5266 // Shorts/Chars vector add
 5267 instruct vaddS(vec dst, vec src) %{
 5268   predicate(UseAVX == 0);
 5269   match(Set dst (AddVS dst src));
 5270   format %{ "paddw   $dst,$src\t! add packedS" %}
 5271   ins_encode %{
 5272     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5273   %}
 5274   ins_pipe( pipe_slow );
 5275 %}
 5276 
 5277 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5278   predicate(UseAVX > 0);
 5279   match(Set dst (AddVS src1 src2));
 5280   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5281   ins_encode %{
 5282     int vlen_enc = vector_length_encoding(this);
 5283     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5284   %}
 5285   ins_pipe( pipe_slow );
 5286 %}
 5287 
 5288 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5289   predicate((UseAVX > 0) &&
 5290             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5291   match(Set dst (AddVS src (LoadVector mem)));
 5292   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5293   ins_encode %{
 5294     int vlen_enc = vector_length_encoding(this);
 5295     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5296   %}
 5297   ins_pipe( pipe_slow );
 5298 %}
 5299 
 5300 // Integers vector add
 5301 instruct vaddI(vec dst, vec src) %{
 5302   predicate(UseAVX == 0);
 5303   match(Set dst (AddVI dst src));
 5304   format %{ "paddd   $dst,$src\t! add packedI" %}
 5305   ins_encode %{
 5306     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5307   %}
 5308   ins_pipe( pipe_slow );
 5309 %}
 5310 
 5311 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5312   predicate(UseAVX > 0);
 5313   match(Set dst (AddVI src1 src2));
 5314   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5315   ins_encode %{
 5316     int vlen_enc = vector_length_encoding(this);
 5317     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5318   %}
 5319   ins_pipe( pipe_slow );
 5320 %}
 5321 
 5322 
 5323 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5324   predicate((UseAVX > 0) &&
 5325             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5326   match(Set dst (AddVI src (LoadVector mem)));
 5327   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5328   ins_encode %{
 5329     int vlen_enc = vector_length_encoding(this);
 5330     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5331   %}
 5332   ins_pipe( pipe_slow );
 5333 %}
 5334 
 5335 // Longs vector add
 5336 instruct vaddL(vec dst, vec src) %{
 5337   predicate(UseAVX == 0);
 5338   match(Set dst (AddVL dst src));
 5339   format %{ "paddq   $dst,$src\t! add packedL" %}
 5340   ins_encode %{
 5341     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5342   %}
 5343   ins_pipe( pipe_slow );
 5344 %}
 5345 
 5346 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5347   predicate(UseAVX > 0);
 5348   match(Set dst (AddVL src1 src2));
 5349   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5350   ins_encode %{
 5351     int vlen_enc = vector_length_encoding(this);
 5352     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5353   %}
 5354   ins_pipe( pipe_slow );
 5355 %}
 5356 
 5357 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5358   predicate((UseAVX > 0) &&
 5359             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5360   match(Set dst (AddVL src (LoadVector mem)));
 5361   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5362   ins_encode %{
 5363     int vlen_enc = vector_length_encoding(this);
 5364     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5365   %}
 5366   ins_pipe( pipe_slow );
 5367 %}
 5368 
 5369 // Floats vector add
 5370 instruct vaddF(vec dst, vec src) %{
 5371   predicate(UseAVX == 0);
 5372   match(Set dst (AddVF dst src));
 5373   format %{ "addps   $dst,$src\t! add packedF" %}
 5374   ins_encode %{
 5375     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5376   %}
 5377   ins_pipe( pipe_slow );
 5378 %}
 5379 
 5380 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5381   predicate(UseAVX > 0);
 5382   match(Set dst (AddVF src1 src2));
 5383   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5384   ins_encode %{
 5385     int vlen_enc = vector_length_encoding(this);
 5386     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5387   %}
 5388   ins_pipe( pipe_slow );
 5389 %}
 5390 
 5391 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5392   predicate((UseAVX > 0) &&
 5393             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5394   match(Set dst (AddVF src (LoadVector mem)));
 5395   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5396   ins_encode %{
 5397     int vlen_enc = vector_length_encoding(this);
 5398     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5399   %}
 5400   ins_pipe( pipe_slow );
 5401 %}
 5402 
 5403 // Doubles vector add
 5404 instruct vaddD(vec dst, vec src) %{
 5405   predicate(UseAVX == 0);
 5406   match(Set dst (AddVD dst src));
 5407   format %{ "addpd   $dst,$src\t! add packedD" %}
 5408   ins_encode %{
 5409     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5410   %}
 5411   ins_pipe( pipe_slow );
 5412 %}
 5413 
 5414 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5415   predicate(UseAVX > 0);
 5416   match(Set dst (AddVD src1 src2));
 5417   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5418   ins_encode %{
 5419     int vlen_enc = vector_length_encoding(this);
 5420     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5421   %}
 5422   ins_pipe( pipe_slow );
 5423 %}
 5424 
 5425 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5426   predicate((UseAVX > 0) &&
 5427             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5428   match(Set dst (AddVD src (LoadVector mem)));
 5429   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5430   ins_encode %{
 5431     int vlen_enc = vector_length_encoding(this);
 5432     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5433   %}
 5434   ins_pipe( pipe_slow );
 5435 %}
 5436 
 5437 // --------------------------------- SUB --------------------------------------
 5438 
 5439 // Bytes vector sub
 5440 instruct vsubB(vec dst, vec src) %{
 5441   predicate(UseAVX == 0);
 5442   match(Set dst (SubVB dst src));
 5443   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5444   ins_encode %{
 5445     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5446   %}
 5447   ins_pipe( pipe_slow );
 5448 %}
 5449 
 5450 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5451   predicate(UseAVX > 0);
 5452   match(Set dst (SubVB src1 src2));
 5453   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5454   ins_encode %{
 5455     int vlen_enc = vector_length_encoding(this);
 5456     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5457   %}
 5458   ins_pipe( pipe_slow );
 5459 %}
 5460 
 5461 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5462   predicate((UseAVX > 0) &&
 5463             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5464   match(Set dst (SubVB src (LoadVector mem)));
 5465   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5466   ins_encode %{
 5467     int vlen_enc = vector_length_encoding(this);
 5468     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5469   %}
 5470   ins_pipe( pipe_slow );
 5471 %}
 5472 
 5473 // Shorts/Chars vector sub
 5474 instruct vsubS(vec dst, vec src) %{
 5475   predicate(UseAVX == 0);
 5476   match(Set dst (SubVS dst src));
 5477   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5478   ins_encode %{
 5479     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5480   %}
 5481   ins_pipe( pipe_slow );
 5482 %}
 5483 
 5484 
 5485 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5486   predicate(UseAVX > 0);
 5487   match(Set dst (SubVS src1 src2));
 5488   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5489   ins_encode %{
 5490     int vlen_enc = vector_length_encoding(this);
 5491     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5492   %}
 5493   ins_pipe( pipe_slow );
 5494 %}
 5495 
 5496 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5497   predicate((UseAVX > 0) &&
 5498             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5499   match(Set dst (SubVS src (LoadVector mem)));
 5500   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5501   ins_encode %{
 5502     int vlen_enc = vector_length_encoding(this);
 5503     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5504   %}
 5505   ins_pipe( pipe_slow );
 5506 %}
 5507 
 5508 // Integers vector sub
 5509 instruct vsubI(vec dst, vec src) %{
 5510   predicate(UseAVX == 0);
 5511   match(Set dst (SubVI dst src));
 5512   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5513   ins_encode %{
 5514     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5515   %}
 5516   ins_pipe( pipe_slow );
 5517 %}
 5518 
 5519 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5520   predicate(UseAVX > 0);
 5521   match(Set dst (SubVI src1 src2));
 5522   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5523   ins_encode %{
 5524     int vlen_enc = vector_length_encoding(this);
 5525     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5526   %}
 5527   ins_pipe( pipe_slow );
 5528 %}
 5529 
 5530 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5531   predicate((UseAVX > 0) &&
 5532             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5533   match(Set dst (SubVI src (LoadVector mem)));
 5534   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5535   ins_encode %{
 5536     int vlen_enc = vector_length_encoding(this);
 5537     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5538   %}
 5539   ins_pipe( pipe_slow );
 5540 %}
 5541 
 5542 // Longs vector sub
 5543 instruct vsubL(vec dst, vec src) %{
 5544   predicate(UseAVX == 0);
 5545   match(Set dst (SubVL dst src));
 5546   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5547   ins_encode %{
 5548     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5549   %}
 5550   ins_pipe( pipe_slow );
 5551 %}
 5552 
 5553 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5554   predicate(UseAVX > 0);
 5555   match(Set dst (SubVL src1 src2));
 5556   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5557   ins_encode %{
 5558     int vlen_enc = vector_length_encoding(this);
 5559     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5560   %}
 5561   ins_pipe( pipe_slow );
 5562 %}
 5563 
 5564 
 5565 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5566   predicate((UseAVX > 0) &&
 5567             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5568   match(Set dst (SubVL src (LoadVector mem)));
 5569   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5570   ins_encode %{
 5571     int vlen_enc = vector_length_encoding(this);
 5572     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5573   %}
 5574   ins_pipe( pipe_slow );
 5575 %}
 5576 
 5577 // Floats vector sub
 5578 instruct vsubF(vec dst, vec src) %{
 5579   predicate(UseAVX == 0);
 5580   match(Set dst (SubVF dst src));
 5581   format %{ "subps   $dst,$src\t! sub packedF" %}
 5582   ins_encode %{
 5583     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5584   %}
 5585   ins_pipe( pipe_slow );
 5586 %}
 5587 
 5588 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5589   predicate(UseAVX > 0);
 5590   match(Set dst (SubVF src1 src2));
 5591   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5592   ins_encode %{
 5593     int vlen_enc = vector_length_encoding(this);
 5594     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5595   %}
 5596   ins_pipe( pipe_slow );
 5597 %}
 5598 
 5599 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5600   predicate((UseAVX > 0) &&
 5601             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5602   match(Set dst (SubVF src (LoadVector mem)));
 5603   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5604   ins_encode %{
 5605     int vlen_enc = vector_length_encoding(this);
 5606     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5607   %}
 5608   ins_pipe( pipe_slow );
 5609 %}
 5610 
 5611 // Doubles vector sub
 5612 instruct vsubD(vec dst, vec src) %{
 5613   predicate(UseAVX == 0);
 5614   match(Set dst (SubVD dst src));
 5615   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5616   ins_encode %{
 5617     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5618   %}
 5619   ins_pipe( pipe_slow );
 5620 %}
 5621 
 5622 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5623   predicate(UseAVX > 0);
 5624   match(Set dst (SubVD src1 src2));
 5625   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5626   ins_encode %{
 5627     int vlen_enc = vector_length_encoding(this);
 5628     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5629   %}
 5630   ins_pipe( pipe_slow );
 5631 %}
 5632 
 5633 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5634   predicate((UseAVX > 0) &&
 5635             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5636   match(Set dst (SubVD src (LoadVector mem)));
 5637   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5638   ins_encode %{
 5639     int vlen_enc = vector_length_encoding(this);
 5640     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5641   %}
 5642   ins_pipe( pipe_slow );
 5643 %}
 5644 
 5645 // --------------------------------- MUL --------------------------------------
 5646 
 5647 // Byte vector mul
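// Neither SSE nor AVX provides a packed byte multiply, so these rules widen the
// bytes to 16-bit words (pmovsxbw/vpmovsxbw), multiply with pmullw/vpmullw, mask
// off the high byte of every word (vector_short_to_byte_mask) and pack the low
// bytes back together (packuswb/vpackuswb).  Wider vectors process two halves.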
 5648 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp) %{
 5649   predicate(Matcher::vector_length(n) == 4 ||
 5650             Matcher::vector_length(n) == 8);
 5651   match(Set dst (MulVB src1 src2));
 5652   effect(TEMP dst, TEMP tmp);
 5653   format %{"vector_mulB $dst,$src1,$src2" %}
 5654   ins_encode %{
 5655     assert(UseSSE > 3, "required");
 5656     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
 5657     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
 5658     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
 5659     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 5660     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 5661     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5662   %}
 5663   ins_pipe( pipe_slow );
 5664 %}
 5665 
 5666 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
 5667   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
 5668   match(Set dst (MulVB src1 src2));
 5669   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 5670   format %{"vector_mulB $dst,$src1,$src2" %}
 5671   ins_encode %{
 5672     assert(UseSSE > 3, "required");
 5673     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
 5674     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
 5675     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
 5676     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
 5677     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
 5678     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
 5679     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
 5680     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
 5681     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 5682     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 5683     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 5684     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 5685   %}
 5686   ins_pipe( pipe_slow );
 5687 %}
 5688 
 5689 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp) %{
 5690   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
 5691   match(Set dst (MulVB src1 src2));
 5692   effect(TEMP dst, TEMP tmp);
 5693   format %{"vector_mulB $dst,$src1,$src2" %}
 5694   ins_encode %{
    int vlen_enc = Assembler::AVX_256bit;
 5696     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5697     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5698     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5699     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 5700     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5701     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
 5702     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
 5703   %}
 5704   ins_pipe( pipe_slow );
 5705 %}
 5706 
 5707 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
 5708   predicate(Matcher::vector_length(n) == 32);
 5709   match(Set dst (MulVB src1 src2));
 5710   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 5711   format %{"vector_mulB $dst,$src1,$src2" %}
 5712   ins_encode %{
 5713     assert(UseAVX > 1, "required");
 5714     int vlen_enc = Assembler::AVX_256bit;
 5715     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
 5716     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
 5717     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5718     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5719     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5720     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5721     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5722     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5723     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 5724     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5725     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5726     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 5727     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5728     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 5729   %}
 5730   ins_pipe( pipe_slow );
 5731 %}
 5732 
 5733 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2) %{
 5734   predicate(Matcher::vector_length(n) == 64);
 5735   match(Set dst (MulVB src1 src2));
 5736   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5738   ins_encode %{
 5739     assert(UseAVX > 2, "required");
 5740     int vlen_enc = Assembler::AVX_512bit;
 5741     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
 5742     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
 5743     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5744     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5745     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5746     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5747     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5748     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5749     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 5750     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5751     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5752     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5753     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 5754     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 5755     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5756   %}
 5757   ins_pipe( pipe_slow );
 5758 %}
 5759 
 5760 // Shorts/Chars vector mul
 5761 instruct vmulS(vec dst, vec src) %{
 5762   predicate(UseAVX == 0);
 5763   match(Set dst (MulVS dst src));
 5764   format %{ "pmullw $dst,$src\t! mul packedS" %}
 5765   ins_encode %{
 5766     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5767   %}
 5768   ins_pipe( pipe_slow );
 5769 %}
 5770 
 5771 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5772   predicate(UseAVX > 0);
 5773   match(Set dst (MulVS src1 src2));
 5774   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5775   ins_encode %{
 5776     int vlen_enc = vector_length_encoding(this);
 5777     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5778   %}
 5779   ins_pipe( pipe_slow );
 5780 %}
 5781 
 5782 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5783   predicate((UseAVX > 0) &&
 5784             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5785   match(Set dst (MulVS src (LoadVector mem)));
 5786   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5787   ins_encode %{
 5788     int vlen_enc = vector_length_encoding(this);
 5789     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5790   %}
 5791   ins_pipe( pipe_slow );
 5792 %}
 5793 
 5794 // Integers vector mul
 5795 instruct vmulI(vec dst, vec src) %{
 5796   predicate(UseAVX == 0);
 5797   match(Set dst (MulVI dst src));
 5798   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5799   ins_encode %{
 5800     assert(UseSSE > 3, "required");
 5801     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5802   %}
 5803   ins_pipe( pipe_slow );
 5804 %}
 5805 
 5806 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5807   predicate(UseAVX > 0);
 5808   match(Set dst (MulVI src1 src2));
 5809   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5810   ins_encode %{
 5811     int vlen_enc = vector_length_encoding(this);
 5812     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5813   %}
 5814   ins_pipe( pipe_slow );
 5815 %}
 5816 
 5817 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5818   predicate((UseAVX > 0) &&
 5819             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5820   match(Set dst (MulVI src (LoadVector mem)));
 5821   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5822   ins_encode %{
 5823     int vlen_enc = vector_length_encoding(this);
 5824     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5825   %}
 5826   ins_pipe( pipe_slow );
 5827 %}
 5828 
 5829 // Longs vector mul
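// A packed 64x64->64 multiply (vpmullq) requires AVX-512DQ.  Without it the
// rules below assemble each product from 32-bit partial products, using the
// identity (shown for illustration):
//   a*b mod 2^64 = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)
// pshufd/vpshufd swaps the dword halves of every lane, pmulld forms the two
// cross products, phaddd sums them, pmovzxdq/psllq moves that sum into the high
// half, and pmuludq supplies the full-width unsigned lo(a)*lo(b) product.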
 5830 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
 5831   predicate(VM_Version::supports_avx512dq());
 5832   match(Set dst (MulVL src1 src2));
 5833   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
 5834   ins_encode %{
 5835     assert(UseAVX > 2, "required");
 5836     int vlen_enc = vector_length_encoding(this);
 5837     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct vmulL_mem(vec dst, vec src, memory mem) %{
 5843   predicate(VM_Version::supports_avx512dq() &&
 5844               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5845   match(Set dst (MulVL src (LoadVector mem)));
 5846   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
 5847   ins_encode %{
 5848     assert(UseAVX > 2, "required");
 5849     int vlen_enc = vector_length_encoding(this);
 5850     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5851   %}
 5852   ins_pipe( pipe_slow );
 5853 %}
 5854 
 5855 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
 5856   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
 5857   match(Set dst (MulVL dst src2));
 5858   effect(TEMP dst, TEMP tmp);
 5859   format %{ "pshufd $tmp,$src2, 177\n\t"
 5860             "pmulld $tmp,$dst\n\t"
 5861             "phaddd $tmp,$tmp\n\t"
 5862             "pmovzxdq $tmp,$tmp\n\t"
 5863             "psllq $tmp, 32\n\t"
 5864             "pmuludq $dst,$src2\n\t"
 5865             "paddq $dst,$tmp\n\t! mul packed2L" %}
 5866 
 5867   ins_encode %{
 5868     assert(VM_Version::supports_sse4_1(), "required");
 5869     int vlen_enc = Assembler::AVX_128bit;
 5870     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
 5871     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
 5872     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
 5873     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
 5874     __ psllq($tmp$$XMMRegister, 32);
 5875     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
 5876     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
 5877   %}
 5878   ins_pipe( pipe_slow );
 5879 %}
 5880 
 5881 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
 5882   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
 5883   match(Set dst (MulVL src1 src2));
 5884   effect(TEMP tmp1, TEMP tmp);
 5885   format %{ "vpshufd $tmp,$src2\n\t"
 5886             "vpmulld $tmp,$src1,$tmp\n\t"
 5887             "vphaddd $tmp,$tmp,$tmp\n\t"
 5888             "vpmovzxdq $tmp,$tmp\n\t"
 5889             "vpsllq $tmp,$tmp\n\t"
 5890             "vpmuludq $tmp1,$src1,$src2\n\t"
 5891             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
 5892   ins_encode %{
 5893     int vlen_enc = Assembler::AVX_256bit;
 5894     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
 5895     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5896     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
 5897     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5898     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5899     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
 5900     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5901     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5902   %}
 5903   ins_pipe( pipe_slow );
 5904 %}
 5905 
 5906 // Floats vector mul
 5907 instruct vmulF(vec dst, vec src) %{
 5908   predicate(UseAVX == 0);
 5909   match(Set dst (MulVF dst src));
 5910   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5911   ins_encode %{
 5912     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5913   %}
 5914   ins_pipe( pipe_slow );
 5915 %}
 5916 
 5917 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5918   predicate(UseAVX > 0);
 5919   match(Set dst (MulVF src1 src2));
 5920   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5921   ins_encode %{
 5922     int vlen_enc = vector_length_encoding(this);
 5923     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5924   %}
 5925   ins_pipe( pipe_slow );
 5926 %}
 5927 
 5928 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5929   predicate((UseAVX > 0) &&
 5930             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5931   match(Set dst (MulVF src (LoadVector mem)));
 5932   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5933   ins_encode %{
 5934     int vlen_enc = vector_length_encoding(this);
 5935     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5936   %}
 5937   ins_pipe( pipe_slow );
 5938 %}
 5939 
 5940 // Doubles vector mul
 5941 instruct vmulD(vec dst, vec src) %{
 5942   predicate(UseAVX == 0);
 5943   match(Set dst (MulVD dst src));
 5944   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5945   ins_encode %{
 5946     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5947   %}
 5948   ins_pipe( pipe_slow );
 5949 %}
 5950 
 5951 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5952   predicate(UseAVX > 0);
 5953   match(Set dst (MulVD src1 src2));
 5954   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5955   ins_encode %{
 5956     int vlen_enc = vector_length_encoding(this);
 5957     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5958   %}
 5959   ins_pipe( pipe_slow );
 5960 %}
 5961 
 5962 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5963   predicate((UseAVX > 0) &&
 5964             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5965   match(Set dst (MulVD src (LoadVector mem)));
 5966   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5967   ins_encode %{
 5968     int vlen_enc = vector_length_encoding(this);
 5969     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5970   %}
 5971   ins_pipe( pipe_slow );
 5972 %}
 5973 
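// Vector conditional move: the packed compare writes an all-ones/all-zeros mask
// per lane into $dst, and the variable blend then selects $src2 where the mask
// is set and $src1 otherwise.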
 5974 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 5975   predicate(Matcher::vector_length(n) == 8);
 5976   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
 5977   effect(TEMP dst, USE src1, USE src2);
  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
            "vblendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
         %}
 5981   ins_encode %{
 5982     assert(UseAVX > 0, "required");
 5983 
 5984     int vlen_enc = Assembler::AVX_256bit;
 5985     int cond = (Assembler::Condition)($copnd$$cmpcode);
 5986     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 5987     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5988   %}
 5989   ins_pipe( pipe_slow );
 5990 %}
 5991 
 5992 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 5993   predicate(Matcher::vector_length(n) == 4);
 5994   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
 5995   effect(TEMP dst, USE src1, USE src2);
 5996   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
 5997             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
 5998          %}
 5999   ins_encode %{
 6000     assert(UseAVX > 0, "required");
 6001 
 6002     int vlen_enc = Assembler::AVX_256bit;
 6003     int cond = (Assembler::Condition)($copnd$$cmpcode);
 6004     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 6005     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6006   %}
 6007   ins_pipe( pipe_slow );
 6008 %}
 6009 
 6010 // --------------------------------- DIV --------------------------------------
 6011 
 6012 // Floats vector div
 6013 instruct vdivF(vec dst, vec src) %{
 6014   predicate(UseAVX == 0);
 6015   match(Set dst (DivVF dst src));
 6016   format %{ "divps   $dst,$src\t! div packedF" %}
 6017   ins_encode %{
 6018     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 6019   %}
 6020   ins_pipe( pipe_slow );
 6021 %}
 6022 
 6023 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 6024   predicate(UseAVX > 0);
 6025   match(Set dst (DivVF src1 src2));
 6026   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 6027   ins_encode %{
 6028     int vlen_enc = vector_length_encoding(this);
 6029     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6030   %}
 6031   ins_pipe( pipe_slow );
 6032 %}
 6033 
 6034 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 6035   predicate((UseAVX > 0) &&
 6036             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6037   match(Set dst (DivVF src (LoadVector mem)));
 6038   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 6039   ins_encode %{
 6040     int vlen_enc = vector_length_encoding(this);
 6041     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6042   %}
 6043   ins_pipe( pipe_slow );
 6044 %}
 6045 
 6046 // Doubles vector div
 6047 instruct vdivD(vec dst, vec src) %{
 6048   predicate(UseAVX == 0);
 6049   match(Set dst (DivVD dst src));
 6050   format %{ "divpd   $dst,$src\t! div packedD" %}
 6051   ins_encode %{
 6052     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 6053   %}
 6054   ins_pipe( pipe_slow );
 6055 %}
 6056 
 6057 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 6058   predicate(UseAVX > 0);
 6059   match(Set dst (DivVD src1 src2));
 6060   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 6061   ins_encode %{
 6062     int vlen_enc = vector_length_encoding(this);
 6063     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6064   %}
 6065   ins_pipe( pipe_slow );
 6066 %}
 6067 
 6068 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 6069   predicate((UseAVX > 0) &&
 6070             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6071   match(Set dst (DivVD src (LoadVector mem)));
 6072   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 6073   ins_encode %{
 6074     int vlen_enc = vector_length_encoding(this);
 6075     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6076   %}
 6077   ins_pipe( pipe_slow );
 6078 %}
 6079 
 6080 // ------------------------------ MinMax ---------------------------------------
 6081 
 6082 // Byte, Short, Int vector Min/Max
 6083 instruct minmax_reg_sse(vec dst, vec src) %{
 6084   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6085             UseAVX == 0);
 6086   match(Set dst (MinV dst src));
 6087   match(Set dst (MaxV dst src));
 6088   format %{ "vector_minmax  $dst,$src\t!  " %}
 6089   ins_encode %{
 6090     assert(UseSSE >= 4, "required");
 6091 
 6092     int opcode = this->ideal_Opcode();
 6093     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6094     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6095   %}
 6096   ins_pipe( pipe_slow );
 6097 %}
 6098 
 6099 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6100   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6101             UseAVX > 0);
 6102   match(Set dst (MinV src1 src2));
 6103   match(Set dst (MaxV src1 src2));
 6104   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6105   ins_encode %{
 6106     int opcode = this->ideal_Opcode();
 6107     int vlen_enc = vector_length_encoding(this);
 6108     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6109 
 6110     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6111   %}
 6112   ins_pipe( pipe_slow );
 6113 %}
 6114 
 6115 // Long vector Min/Max
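// There is no packed 64-bit min/max instruction before AVX-512 (vpminsq/vpmaxsq),
// so the pre-EVEX rules emulate it with a signed compare plus a blend.  The SSE
// variant's blendvpd takes its mask implicitly from xmm0, which is why the temp
// is constrained to rxmm0.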
 6116 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6117   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6118             UseAVX == 0);
 6119   match(Set dst (MinV dst src));
 6120   match(Set dst (MaxV src dst));
 6121   effect(TEMP dst, TEMP tmp);
 6122   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6123   ins_encode %{
 6124     assert(UseSSE >= 4, "required");
 6125 
 6126     int opcode = this->ideal_Opcode();
 6127     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6128     assert(elem_bt == T_LONG, "sanity");
 6129 
 6130     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6131   %}
 6132   ins_pipe( pipe_slow );
 6133 %}
 6134 
 6135 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6136   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6137             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6138   match(Set dst (MinV src1 src2));
 6139   match(Set dst (MaxV src1 src2));
 6140   effect(TEMP dst);
 6141   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6142   ins_encode %{
 6143     int vlen_enc = vector_length_encoding(this);
 6144     int opcode = this->ideal_Opcode();
 6145     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6146     assert(elem_bt == T_LONG, "sanity");
 6147 
 6148     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6149   %}
 6150   ins_pipe( pipe_slow );
 6151 %}
 6152 
 6153 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6154   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6155             Matcher::vector_element_basic_type(n) == T_LONG);
 6156   match(Set dst (MinV src1 src2));
 6157   match(Set dst (MaxV src1 src2));
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6159   ins_encode %{
 6160     assert(UseAVX > 2, "required");
 6161 
 6162     int vlen_enc = vector_length_encoding(this);
 6163     int opcode = this->ideal_Opcode();
 6164     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6165     assert(elem_bt == T_LONG, "sanity");
 6166 
 6167     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6168   %}
 6169   ins_pipe( pipe_slow );
 6170 %}
 6171 
 6172 // Float/Double vector Min/Max
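// Java's Math.min/max semantics differ from raw minps/maxps: NaN must propagate
// and -0.0 is ordered below +0.0, whereas the x86 instructions return the second
// source when a NaN is present and treat the two zeros as equal.  The extra
// temporaries (or the opmask register on EVEX) are used to patch up those cases.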
 6173 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6174   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6175             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6176             UseAVX > 0);
 6177   match(Set dst (MinV a b));
 6178   match(Set dst (MaxV a b));
 6179   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6180   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6181   ins_encode %{
 6182     assert(UseAVX > 0, "required");
 6183 
 6184     int opcode = this->ideal_Opcode();
 6185     int vlen_enc = vector_length_encoding(this);
 6186     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6187 
 6188     __ vminmax_fp(opcode, elem_bt,
 6189                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6190                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6191   %}
 6192   ins_pipe( pipe_slow );
 6193 %}
 6194 
 6195 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6196   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6197             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6198   match(Set dst (MinV a b));
 6199   match(Set dst (MaxV a b));
 6200   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
 6201   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
 6202   ins_encode %{
 6203     assert(UseAVX > 2, "required");
 6204 
 6205     int opcode = this->ideal_Opcode();
 6206     int vlen_enc = vector_length_encoding(this);
 6207     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6208 
 6209     __ evminmax_fp(opcode, elem_bt,
 6210                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
 6211                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
 6212   %}
 6213   ins_pipe( pipe_slow );
 6214 %}
 6215 
 6216 // --------------------------------- Signum/CopySign ---------------------------
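// Math.signum returns +/-1.0 for non-zero finite inputs and returns the input
// itself for +/-0.0 and NaN; the $zero and $one operands supply the constants
// needed by the signum_fp and vector_signum_* helpers.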
 6217 
 6218 instruct signumF_reg(regF dst, regF zero, regF one, rFlagsReg cr) %{
 6219   match(Set dst (SignumF dst (Binary zero one)));
 6220   effect(KILL cr);
 6221   format %{ "signumF $dst, $dst" %}
 6222   ins_encode %{
 6223     int opcode = this->ideal_Opcode();
 6224     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6225   %}
 6226   ins_pipe( pipe_slow );
 6227 %}
 6228 
 6229 instruct signumD_reg(regD dst, regD zero, regD one, rFlagsReg cr) %{
 6230   match(Set dst (SignumD dst (Binary zero one)));
 6231   effect(KILL cr);
 6232   format %{ "signumD $dst, $dst" %}
 6233   ins_encode %{
 6234     int opcode = this->ideal_Opcode();
 6235     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister);
 6236   %}
 6237   ins_pipe( pipe_slow );
 6238 %}
 6239 
 6240 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
 6241   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 6242   match(Set dst (SignumVF src (Binary zero one)));
 6243   match(Set dst (SignumVD src (Binary zero one)));
 6244   effect(TEMP dst, TEMP xtmp1);
 6245   format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
 6246   ins_encode %{
 6247     int opcode = this->ideal_Opcode();
 6248     int vec_enc = vector_length_encoding(this);
 6249     __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6250                          $xtmp1$$XMMRegister, vec_enc);
 6251   %}
 6252   ins_pipe( pipe_slow );
 6253 %}
 6254 
 6255 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
 6256   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 6257   match(Set dst (SignumVF src (Binary zero one)));
 6258   match(Set dst (SignumVD src (Binary zero one)));
 6259   effect(TEMP dst, TEMP ktmp1);
 6260   format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
 6261   ins_encode %{
 6262     int opcode = this->ideal_Opcode();
 6263     int vec_enc = vector_length_encoding(this);
 6264     __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
 6265                           $ktmp1$$KRegister, vec_enc);
 6266   %}
 6267   ins_pipe( pipe_slow );
 6268 %}
 6269 
 6270 // ---------------------------------------
// For copySign use 0xE4 as the truth-table immediate (imm8) for vpternlog
 6272 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6273 // C (xmm2) is set to 0x7FFFFFFF
// Wherever xmm2 is 0, we want to pick from B (the sign operand)
// Wherever xmm2 is 1, we want to pick from A (the magnitude operand)
 6276 //
 6277 // A B C Result
 6278 // 0 0 0 0
 6279 // 0 0 1 0
 6280 // 0 1 0 1
 6281 // 0 1 1 0
 6282 // 1 0 0 0
 6283 // 1 0 1 1
 6284 // 1 1 0 1
 6285 // 1 1 1 1
 6286 //
// Result going from the high bit (A=1,B=1,C=1) to the low bit (A=0,B=0,C=0) is binary 11100100 = 0xE4
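//
// Illustrative derivation of the immediate (a sanity check only, not emitted
// code; the variable names are made up):
//   int imm = 0;
//   for (int abc = 7; abc >= 0; abc--) {   // abc packs the A, B and C bits
//     int a = (abc >> 2) & 1;
//     int b = (abc >> 1) & 1;
//     int c =  abc       & 1;
//     imm = (imm << 1) | (c ? a : b);      // where C is 1 take A, else take B
//   }
//   // imm == 0xE4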
 6288 // ---------------------------------------
 6289 
 6290 #ifdef _LP64
 6291 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6292   match(Set dst (CopySignF dst src));
 6293   effect(TEMP tmp1, TEMP tmp2);
 6294   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6295   ins_encode %{
 6296     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6297     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6298     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6299   %}
 6300   ins_pipe( pipe_slow );
 6301 %}
 6302 
 6303 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6304   match(Set dst (CopySignD dst (Binary src zero)));
 6305   ins_cost(100);
 6306   effect(TEMP tmp1, TEMP tmp2);
 6307   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6308   ins_encode %{
 6309     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6310     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6311     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6312   %}
 6313   ins_pipe( pipe_slow );
 6314 %}
 6315 
 6316 #endif // _LP64
 6317 
 6318 //----------------------------- CompressBits/ExpandBits ------------------------
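// pextl/pdepl emit the BMI2 PEXT/PDEP instructions.  As an illustration with
// made-up values: pext(src = 0b10110010, mask = 0b11001100) walks the set mask
// bits from low to high, gathers the corresponding src bits (0, 0, 0, 1) and
// packs them into the low bits of the result, giving 0b1000; pdep performs the
// inverse scatter.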
 6319 
 6320 instruct compressBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6321   predicate(n->bottom_type()->isa_int());
 6322   match(Set dst (CompressBits src mask));
 6323   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6324   ins_encode %{
 6325     __ pextl($dst$$Register, $src$$Register, $mask$$Register);
 6326   %}
 6327   ins_pipe( pipe_slow );
 6328 %}
 6329 
 6330 instruct expandBitsI_reg(rRegI dst, rRegI src, rRegI mask) %{
 6331   predicate(n->bottom_type()->isa_int());
 6332   match(Set dst (ExpandBits src mask));
 6333   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6334   ins_encode %{
 6335     __ pdepl($dst$$Register, $src$$Register, $mask$$Register);
 6336   %}
 6337   ins_pipe( pipe_slow );
 6338 %}
 6339 
 6340 instruct compressBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6341   predicate(n->bottom_type()->isa_int());
 6342   match(Set dst (CompressBits src (LoadI mask)));
 6343   format %{ "pextl  $dst, $src, $mask\t! parallel bit extract" %}
 6344   ins_encode %{
 6345     __ pextl($dst$$Register, $src$$Register, $mask$$Address);
 6346   %}
 6347   ins_pipe( pipe_slow );
 6348 %}
 6349 
 6350 instruct expandBitsI_mem(rRegI dst, rRegI src, memory mask) %{
 6351   predicate(n->bottom_type()->isa_int());
 6352   match(Set dst (ExpandBits src (LoadI mask)));
 6353   format %{ "pdepl  $dst, $src, $mask\t! parallel bit deposit" %}
 6354   ins_encode %{
 6355     __ pdepl($dst$$Register, $src$$Register, $mask$$Address);
 6356   %}
 6357   ins_pipe( pipe_slow );
 6358 %}
 6359 
 6360 // --------------------------------- Sqrt --------------------------------------
 6361 
 6362 instruct vsqrtF_reg(vec dst, vec src) %{
 6363   match(Set dst (SqrtVF src));
 6364   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6365   ins_encode %{
 6366     assert(UseAVX > 0, "required");
 6367     int vlen_enc = vector_length_encoding(this);
 6368     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6369   %}
 6370   ins_pipe( pipe_slow );
 6371 %}
 6372 
 6373 instruct vsqrtF_mem(vec dst, memory mem) %{
 6374   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6375   match(Set dst (SqrtVF (LoadVector mem)));
 6376   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6377   ins_encode %{
 6378     assert(UseAVX > 0, "required");
 6379     int vlen_enc = vector_length_encoding(this);
 6380     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6381   %}
 6382   ins_pipe( pipe_slow );
 6383 %}
 6384 
// Doubles vector sqrt
 6386 instruct vsqrtD_reg(vec dst, vec src) %{
 6387   match(Set dst (SqrtVD src));
 6388   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6389   ins_encode %{
 6390     assert(UseAVX > 0, "required");
 6391     int vlen_enc = vector_length_encoding(this);
 6392     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6393   %}
 6394   ins_pipe( pipe_slow );
 6395 %}
 6396 
 6397 instruct vsqrtD_mem(vec dst, memory mem) %{
 6398   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6399   match(Set dst (SqrtVD (LoadVector mem)));
 6400   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6401   ins_encode %{
 6402     assert(UseAVX > 0, "required");
 6403     int vlen_enc = vector_length_encoding(this);
 6404     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6405   %}
 6406   ins_pipe( pipe_slow );
 6407 %}
 6408 
 6409 // ------------------------------ Shift ---------------------------------------
 6410 
 6411 // Left and right shift count vectors are the same on x86
 6412 // (only lowest bits of xmm reg are used for count).
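// The count is moved into the low dword of an XMM register with movdl; the
// packed shift instructions read their count from the low 64 bits of that
// register, so the same loaded value serves every element size and direction.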
 6413 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6414   match(Set dst (LShiftCntV cnt));
 6415   match(Set dst (RShiftCntV cnt));
 6416   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6417   ins_encode %{
 6418     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6419   %}
 6420   ins_pipe( pipe_slow );
 6421 %}
 6422 
 6423 // Byte vector shift
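// There is no packed byte shift instruction either, so these rules widen the
// bytes to words (sign-extending except for unsigned right shifts), shift the
// words, mask off the high byte of every word and pack the low bytes back
// together, mirroring the byte-multiply rules above.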
 6424 instruct vshiftB(vec dst, vec src, vec shift, vec tmp) %{
 6425   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6426   match(Set dst ( LShiftVB src shift));
 6427   match(Set dst ( RShiftVB src shift));
 6428   match(Set dst (URShiftVB src shift));
 6429   effect(TEMP dst, USE src, USE shift, TEMP tmp);
 6430   format %{"vector_byte_shift $dst,$src,$shift" %}
 6431   ins_encode %{
 6432     assert(UseSSE > 3, "required");
 6433     int opcode = this->ideal_Opcode();
 6434     bool sign = (opcode != Op_URShiftVB);
 6435     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6436     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6437     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6438     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6439     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6440   %}
 6441   ins_pipe( pipe_slow );
 6442 %}
 6443 
 6444 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6445   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6446             UseAVX <= 1);
 6447   match(Set dst ( LShiftVB src shift));
 6448   match(Set dst ( RShiftVB src shift));
 6449   match(Set dst (URShiftVB src shift));
 6450   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2);
 6451   format %{"vector_byte_shift $dst,$src,$shift" %}
 6452   ins_encode %{
 6453     assert(UseSSE > 3, "required");
 6454     int opcode = this->ideal_Opcode();
 6455     bool sign = (opcode != Op_URShiftVB);
 6456     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6457     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6458     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6459     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6460     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6461     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6462     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6463     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6464     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6465   %}
 6466   ins_pipe( pipe_slow );
 6467 %}
 6468 
 6469 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6470   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6471             UseAVX > 1);
 6472   match(Set dst ( LShiftVB src shift));
 6473   match(Set dst ( RShiftVB src shift));
 6474   match(Set dst (URShiftVB src shift));
 6475   effect(TEMP dst, TEMP tmp);
 6476   format %{"vector_byte_shift $dst,$src,$shift" %}
 6477   ins_encode %{
 6478     int opcode = this->ideal_Opcode();
 6479     bool sign = (opcode != Op_URShiftVB);
 6480     int vlen_enc = Assembler::AVX_256bit;
 6481     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6482     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6483     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6484     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6485     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6486   %}
 6487   ins_pipe( pipe_slow );
 6488 %}
 6489 
 6490 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp) %{
 6491   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6492   match(Set dst ( LShiftVB src shift));
 6493   match(Set dst ( RShiftVB src shift));
 6494   match(Set dst (URShiftVB src shift));
 6495   effect(TEMP dst, TEMP tmp);
 6496   format %{"vector_byte_shift $dst,$src,$shift" %}
 6497   ins_encode %{
 6498     assert(UseAVX > 1, "required");
 6499     int opcode = this->ideal_Opcode();
 6500     bool sign = (opcode != Op_URShiftVB);
 6501     int vlen_enc = Assembler::AVX_256bit;
 6502     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6503     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6504     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6505     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6506     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6507     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6508     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 6509     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6510     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6511   %}
 6512   ins_pipe( pipe_slow );
 6513 %}
 6514 
 6515 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2) %{
 6516   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6517   match(Set dst ( LShiftVB src shift));
 6518   match(Set dst  (RShiftVB src shift));
 6519   match(Set dst (URShiftVB src shift));
 6520   effect(TEMP dst, TEMP tmp1, TEMP tmp2);
 6521   format %{"vector_byte_shift $dst,$src,$shift" %}
 6522   ins_encode %{
 6523     assert(UseAVX > 2, "required");
 6524     int opcode = this->ideal_Opcode();
 6525     bool sign = (opcode != Op_URShiftVB);
 6526     int vlen_enc = Assembler::AVX_512bit;
 6527     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6528     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6529     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6530     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6531     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6532     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), noreg);
 6533     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6534     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6535     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6536     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6537     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, noreg);
 6538     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6539   %}
 6540   ins_pipe( pipe_slow );
 6541 %}
 6542 
 6543 // A shorts vector logical right shift produces an incorrect Java result
 6544 // for negative data because Java code converts the short value into an int
 6545 // with sign extension before the shift. Char vectors are fine since chars
 6546 // are unsigned values.
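// For example, for a Java short s = (short)0xF000 (-4096), s >>> 2 first promotes
// s to the int 0xFFFFF000 and yields 0x3FFFFC00, whereas a 16-bit logical shift of
// 0xF000 by 2 yields 0x3C00, so a packed word shift cannot implement >>> on shorts.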
 6547 // Shorts/Chars vector shift
 6548 instruct vshiftS(vec dst, vec src, vec shift) %{
 6549   predicate(!n->as_ShiftV()->is_var_shift());
 6550   match(Set dst ( LShiftVS src shift));
 6551   match(Set dst ( RShiftVS src shift));
 6552   match(Set dst (URShiftVS src shift));
 6553   effect(TEMP dst, USE src, USE shift);
 6554   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6555   ins_encode %{
 6556     int opcode = this->ideal_Opcode();
 6557     if (UseAVX > 0) {
 6558       int vlen_enc = vector_length_encoding(this);
 6559       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6560     } else {
 6561       int vlen = Matcher::vector_length(this);
 6562       if (vlen == 2) {
 6563         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6564         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6565       } else if (vlen == 4) {
 6566         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6567         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6568       } else {
 6569         assert(vlen == 8, "sanity");
 6570         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6571         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6572       }
 6573     }
 6574   %}
 6575   ins_pipe( pipe_slow );
 6576 %}
 6577 
 6578 // Integers vector shift
 6579 instruct vshiftI(vec dst, vec src, vec shift) %{
 6580   predicate(!n->as_ShiftV()->is_var_shift());
 6581   match(Set dst ( LShiftVI src shift));
 6582   match(Set dst ( RShiftVI src shift));
 6583   match(Set dst (URShiftVI src shift));
 6584   effect(TEMP dst, USE src, USE shift);
 6585   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6586   ins_encode %{
 6587     int opcode = this->ideal_Opcode();
 6588     if (UseAVX > 0) {
 6589       int vlen_enc = vector_length_encoding(this);
 6590       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6591     } else {
 6592       int vlen = Matcher::vector_length(this);
 6593       if (vlen == 2) {
 6594         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6595         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6596       } else {
 6597         assert(vlen == 4, "sanity");
 6598         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6599         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6600       }
 6601     }
 6602   %}
 6603   ins_pipe( pipe_slow );
 6604 %}
 6605 
 6606 // Integers vector constant shift
 6607 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6608   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6609   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6610   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6611   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6612   ins_encode %{
 6613     int opcode = this->ideal_Opcode();
 6614     if (UseAVX > 0) {
 6615       int vector_len = vector_length_encoding(this);
 6616       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6617     } else {
 6618       int vlen = Matcher::vector_length(this);
 6619       if (vlen == 2) {
 6620         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6621         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6622       } else {
 6623         assert(vlen == 4, "sanity");
 6624         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6625         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6626       }
 6627     }
 6628   %}
 6629   ins_pipe( pipe_slow );
 6630 %}
 6631 
 6632 // Longs vector shift
 6633 instruct vshiftL(vec dst, vec src, vec shift) %{
 6634   predicate(!n->as_ShiftV()->is_var_shift());
 6635   match(Set dst ( LShiftVL src shift));
 6636   match(Set dst (URShiftVL src shift));
 6637   effect(TEMP dst, USE src, USE shift);
 6638   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6639   ins_encode %{
 6640     int opcode = this->ideal_Opcode();
 6641     if (UseAVX > 0) {
 6642       int vlen_enc = vector_length_encoding(this);
 6643       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6644     } else {
 6645       assert(Matcher::vector_length(this) == 2, "");
 6646       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6647       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6648     }
 6649   %}
 6650   ins_pipe( pipe_slow );
 6651 %}
 6652 
 6653 // Longs vector constant shift
 6654 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6655   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6656   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6657   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6658   ins_encode %{
 6659     int opcode = this->ideal_Opcode();
 6660     if (UseAVX > 0) {
 6661       int vector_len = vector_length_encoding(this);
 6662       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 6663     } else {
 6664       assert(Matcher::vector_length(this) == 2, "");
 6665       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6666       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6667     }
 6668   %}
 6669   ins_pipe( pipe_slow );
 6670 %}
 6671 
 6672 // ------------------- ArithmeticRightShift -----------------------------
 6673 // Long vector arithmetic right shift
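// Pre-AVX-512 x86 has no packed 64-bit arithmetic right shift, so it is emulated
// from the logical shift: with m = sign_mask >>> shift, the result is
// ((src >>> shift) ^ m) - m, which re-extends the sign bit into the vacated bits.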
 6674 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp) %{
 6675   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6676   match(Set dst (RShiftVL src shift));
 6677   effect(TEMP dst, TEMP tmp);
 6678   format %{ "vshiftq $dst,$src,$shift" %}
 6679   ins_encode %{
 6680     uint vlen = Matcher::vector_length(this);
 6681     if (vlen == 2) {
 6682       assert(UseSSE >= 2, "required");
 6683       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6684       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6685       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6686       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6687       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6688       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6689     } else {
 6690       assert(vlen == 4, "sanity");
 6691       assert(UseAVX > 1, "required");
 6692       int vlen_enc = Assembler::AVX_256bit;
 6693       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6694       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), noreg);
 6695       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6696       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6697       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6698     }
 6699   %}
 6700   ins_pipe( pipe_slow );
 6701 %}
 6702 
 6703 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6704   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6705   match(Set dst (RShiftVL src shift));
 6706   format %{ "vshiftq $dst,$src,$shift" %}
 6707   ins_encode %{
 6708     int vlen_enc = vector_length_encoding(this);
 6709     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6710   %}
 6711   ins_pipe( pipe_slow );
 6712 %}
 6713 
 6714 // ------------------- Variable Shift -----------------------------
 6715 // Byte variable shift
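// Without AVX-512BW there is no per-element byte shift: varshiftbw widens the
// byte lanes and applies the per-element shift at a wider granularity, and the
// word-sized results are packed back down to bytes with vpackuswb.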
 6716 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6717   predicate(Matcher::vector_length(n) <= 8 &&
 6718             n->as_ShiftV()->is_var_shift() &&
 6719             !VM_Version::supports_avx512bw());
 6720   match(Set dst ( LShiftVB src shift));
 6721   match(Set dst ( RShiftVB src shift));
 6722   match(Set dst (URShiftVB src shift));
 6723   effect(TEMP dst, TEMP vtmp);
 6724   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6725   ins_encode %{
 6726     assert(UseAVX >= 2, "required");
 6727 
 6728     int opcode = this->ideal_Opcode();
 6729     int vlen_enc = Assembler::AVX_128bit;
 6730     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6731     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6732   %}
 6733   ins_pipe( pipe_slow );
 6734 %}
 6735 
 6736 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6737   predicate(Matcher::vector_length(n) == 16 &&
 6738             n->as_ShiftV()->is_var_shift() &&
 6739             !VM_Version::supports_avx512bw());
 6740   match(Set dst ( LShiftVB src shift));
 6741   match(Set dst ( RShiftVB src shift));
 6742   match(Set dst (URShiftVB src shift));
 6743   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6744   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6745   ins_encode %{
 6746     assert(UseAVX >= 2, "required");
 6747 
 6748     int opcode = this->ideal_Opcode();
 6749     int vlen_enc = Assembler::AVX_128bit;
 6750     // Shift lower half and get word result in dst
 6751     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6752 
 6753     // Shift upper half and get word result in vtmp1
 6754     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6755     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6756     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6757 
 6758     // Merge and down convert the two word results to byte in dst
 6759     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6760   %}
 6761   ins_pipe( pipe_slow );
 6762 %}
 6763 
 6764 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4) %{
 6765   predicate(Matcher::vector_length(n) == 32 &&
 6766             n->as_ShiftV()->is_var_shift() &&
 6767             !VM_Version::supports_avx512bw());
 6768   match(Set dst ( LShiftVB src shift));
 6769   match(Set dst ( RShiftVB src shift));
 6770   match(Set dst (URShiftVB src shift));
 6771   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4);
 6772   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 as TEMP" %}
 6773   ins_encode %{
 6774     assert(UseAVX >= 2, "required");
 6775 
 6776     int opcode = this->ideal_Opcode();
 6777     int vlen_enc = Assembler::AVX_128bit;
 6778     // Process lower 128 bits and get result in dst
 6779     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6780     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6781     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6782     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6783     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6784 
 6785     // Process higher 128 bits and get result in vtmp1
 6786     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6787     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6788     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister);
 6789     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6790     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6791     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6792     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6793 
 6794     // Merge the two results in dst
 6795     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6796   %}
 6797   ins_pipe( pipe_slow );
 6798 %}
 6799 
 6800 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp) %{
 6801   predicate(Matcher::vector_length(n) <= 32 &&
 6802             n->as_ShiftV()->is_var_shift() &&
 6803             VM_Version::supports_avx512bw());
 6804   match(Set dst ( LShiftVB src shift));
 6805   match(Set dst ( RShiftVB src shift));
 6806   match(Set dst (URShiftVB src shift));
 6807   effect(TEMP dst, TEMP vtmp);
 6808   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6809   ins_encode %{
 6810     assert(UseAVX > 2, "required");
 6811 
 6812     int opcode = this->ideal_Opcode();
 6813     int vlen_enc = vector_length_encoding(this);
 6814     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister);
 6815   %}
 6816   ins_pipe( pipe_slow );
 6817 %}
 6818 
 6819 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6820   predicate(Matcher::vector_length(n) == 64 &&
 6821             n->as_ShiftV()->is_var_shift() &&
 6822             VM_Version::supports_avx512bw());
 6823   match(Set dst ( LShiftVB src shift));
 6824   match(Set dst ( RShiftVB src shift));
 6825   match(Set dst (URShiftVB src shift));
 6826   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6827   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6828   ins_encode %{
 6829     assert(UseAVX > 2, "required");
 6830 
 6831     int opcode = this->ideal_Opcode();
 6832     int vlen_enc = Assembler::AVX_256bit;
 6833     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister);
 6834     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6835     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6836     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister);
 6837     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6838   %}
 6839   ins_pipe( pipe_slow );
 6840 %}
 6841 
 6842 // Short variable shift
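// AVX2 only provides variable-count shifts at dword and qword granularity
// (vpsllvd/vpsrlvd/vpsravd and their qword forms), so without AVX-512BW the short
// lanes are widened to ints, shifted, masked back to 16 bits and re-packed.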
 6843 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp) %{
 6844   predicate(Matcher::vector_length(n) <= 8 &&
 6845             n->as_ShiftV()->is_var_shift() &&
 6846             !VM_Version::supports_avx512bw());
 6847   match(Set dst ( LShiftVS src shift));
 6848   match(Set dst ( RShiftVS src shift));
 6849   match(Set dst (URShiftVS src shift));
 6850   effect(TEMP dst, TEMP vtmp);
 6851   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp as TEMP" %}
 6852   ins_encode %{
 6853     assert(UseAVX >= 2, "required");
 6854 
 6855     int opcode = this->ideal_Opcode();
 6856     bool sign = (opcode != Op_URShiftVS);
 6857     int vlen_enc = Assembler::AVX_256bit;
 6858     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6859     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6860     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6861     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6862     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6863     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6864   %}
 6865   ins_pipe( pipe_slow );
 6866 %}
 6867 
 6868 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2) %{
 6869   predicate(Matcher::vector_length(n) == 16 &&
 6870             n->as_ShiftV()->is_var_shift() &&
 6871             !VM_Version::supports_avx512bw());
 6872   match(Set dst ( LShiftVS src shift));
 6873   match(Set dst ( RShiftVS src shift));
 6874   match(Set dst (URShiftVS src shift));
 6875   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 6876   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 as TEMP" %}
 6877   ins_encode %{
 6878     assert(UseAVX >= 2, "required");
 6879 
 6880     int opcode = this->ideal_Opcode();
 6881     bool sign = (opcode != Op_URShiftVS);
 6882     int vlen_enc = Assembler::AVX_256bit;
 6883     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6884     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6885     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6886     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6887     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6888 
 6889     // Shift upper half, with result in dst using vtmp1 as TEMP
 6890     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6891     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6892     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6893     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6894     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6895     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 6896 
 6897     // Merge lower and upper half result into dst
 6898     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6899     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6900   %}
 6901   ins_pipe( pipe_slow );
 6902 %}
 6903 
 6904 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6905   predicate(n->as_ShiftV()->is_var_shift() &&
 6906             VM_Version::supports_avx512bw());
 6907   match(Set dst ( LShiftVS src shift));
 6908   match(Set dst ( RShiftVS src shift));
 6909   match(Set dst (URShiftVS src shift));
 6910   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6911   ins_encode %{
 6912     assert(UseAVX > 2, "required");
 6913 
 6914     int opcode = this->ideal_Opcode();
 6915     int vlen_enc = vector_length_encoding(this);
 6916     if (!VM_Version::supports_avx512vl()) {
 6917       vlen_enc = Assembler::AVX_512bit;
 6918     }
 6919     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6920   %}
 6921   ins_pipe( pipe_slow );
 6922 %}
 6923 
 6924 // Integer variable shift
 6925 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6926   predicate(n->as_ShiftV()->is_var_shift());
 6927   match(Set dst ( LShiftVI src shift));
 6928   match(Set dst ( RShiftVI src shift));
 6929   match(Set dst (URShiftVI src shift));
 6930   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6931   ins_encode %{
 6932     assert(UseAVX >= 2, "required");
 6933 
 6934     int opcode = this->ideal_Opcode();
 6935     int vlen_enc = vector_length_encoding(this);
 6936     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6937   %}
 6938   ins_pipe( pipe_slow );
 6939 %}
 6940 
 6941 // Long variable shift
 6942 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6943   predicate(n->as_ShiftV()->is_var_shift());
 6944   match(Set dst ( LShiftVL src shift));
 6945   match(Set dst (URShiftVL src shift));
 6946   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6947   ins_encode %{
 6948     assert(UseAVX >= 2, "required");
 6949 
 6950     int opcode = this->ideal_Opcode();
 6951     int vlen_enc = vector_length_encoding(this);
 6952     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6953   %}
 6954   ins_pipe( pipe_slow );
 6955 %}
 6956 
 6957 // Long variable arithmetic right shift
 6958 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6959   predicate(Matcher::vector_length(n) <= 4 &&
 6960             n->as_ShiftV()->is_var_shift() &&
 6961             UseAVX == 2);
 6962   match(Set dst (RShiftVL src shift));
 6963   effect(TEMP dst, TEMP vtmp);
 6964   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6965   ins_encode %{
 6966     int opcode = this->ideal_Opcode();
 6967     int vlen_enc = vector_length_encoding(this);
 6968     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6969                  $vtmp$$XMMRegister);
 6970   %}
 6971   ins_pipe( pipe_slow );
 6972 %}
 6973 
 6974 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6975   predicate(n->as_ShiftV()->is_var_shift() &&
 6976             UseAVX > 2);
 6977   match(Set dst (RShiftVL src shift));
 6978   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6979   ins_encode %{
 6980     int opcode = this->ideal_Opcode();
 6981     int vlen_enc = vector_length_encoding(this);
 6982     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6983   %}
 6984   ins_pipe( pipe_slow );
 6985 %}
 6986 
 6987 // --------------------------------- AND --------------------------------------
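// The AND/OR/XOR rules below share one pattern: the UseAVX == 0 forms use the
// destructive two-operand SSE encodings (dst = dst op src), the *_reg forms use
// the non-destructive three-operand AVX encodings, and the *_mem forms fold the
// load of the second operand into the instruction.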
 6988 
 6989 instruct vand(vec dst, vec src) %{
 6990   predicate(UseAVX == 0);
 6991   match(Set dst (AndV dst src));
 6992   format %{ "pand    $dst,$src\t! and vectors" %}
 6993   ins_encode %{
 6994     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6995   %}
 6996   ins_pipe( pipe_slow );
 6997 %}
 6998 
 6999 instruct vand_reg(vec dst, vec src1, vec src2) %{
 7000   predicate(UseAVX > 0);
 7001   match(Set dst (AndV src1 src2));
 7002   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 7003   ins_encode %{
 7004     int vlen_enc = vector_length_encoding(this);
 7005     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7006   %}
 7007   ins_pipe( pipe_slow );
 7008 %}
 7009 
 7010 instruct vand_mem(vec dst, vec src, memory mem) %{
 7011   predicate((UseAVX > 0) &&
 7012             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7013   match(Set dst (AndV src (LoadVector mem)));
 7014   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 7015   ins_encode %{
 7016     int vlen_enc = vector_length_encoding(this);
 7017     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7018   %}
 7019   ins_pipe( pipe_slow );
 7020 %}
 7021 
 7022 // --------------------------------- OR ---------------------------------------
 7023 
 7024 instruct vor(vec dst, vec src) %{
 7025   predicate(UseAVX == 0);
 7026   match(Set dst (OrV dst src));
 7027   format %{ "por     $dst,$src\t! or vectors" %}
 7028   ins_encode %{
 7029     __ por($dst$$XMMRegister, $src$$XMMRegister);
 7030   %}
 7031   ins_pipe( pipe_slow );
 7032 %}
 7033 
 7034 instruct vor_reg(vec dst, vec src1, vec src2) %{
 7035   predicate(UseAVX > 0);
 7036   match(Set dst (OrV src1 src2));
 7037   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 7038   ins_encode %{
 7039     int vlen_enc = vector_length_encoding(this);
 7040     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7041   %}
 7042   ins_pipe( pipe_slow );
 7043 %}
 7044 
 7045 instruct vor_mem(vec dst, vec src, memory mem) %{
 7046   predicate((UseAVX > 0) &&
 7047             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7048   match(Set dst (OrV src (LoadVector mem)));
 7049   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 7050   ins_encode %{
 7051     int vlen_enc = vector_length_encoding(this);
 7052     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7053   %}
 7054   ins_pipe( pipe_slow );
 7055 %}
 7056 
 7057 // --------------------------------- XOR --------------------------------------
 7058 
 7059 instruct vxor(vec dst, vec src) %{
 7060   predicate(UseAVX == 0);
 7061   match(Set dst (XorV dst src));
 7062   format %{ "pxor    $dst,$src\t! xor vectors" %}
 7063   ins_encode %{
 7064     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 7065   %}
 7066   ins_pipe( pipe_slow );
 7067 %}
 7068 
 7069 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 7070   predicate(UseAVX > 0);
 7071   match(Set dst (XorV src1 src2));
 7072   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 7073   ins_encode %{
 7074     int vlen_enc = vector_length_encoding(this);
 7075     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7076   %}
 7077   ins_pipe( pipe_slow );
 7078 %}
 7079 
 7080 instruct vxor_mem(vec dst, vec src, memory mem) %{
 7081   predicate((UseAVX > 0) &&
 7082             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 7083   match(Set dst (XorV src (LoadVector mem)));
 7084   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 7085   ins_encode %{
 7086     int vlen_enc = vector_length_encoding(this);
 7087     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 7088   %}
 7089   ins_pipe( pipe_slow );
 7090 %}
 7091 
 7092 // --------------------------------- VectorCast --------------------------------------
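// Narrowing casts on pre-AVX-512 hardware mask each element down to the target
// width and then pack (vpand + vpackusdw/vpackuswb); with AVX-512 the evpmov*
// truncating moves are used instead. Widening casts use the vpmovsx* sign
// extensions, and casts to floating point go through vcvtdq2ps/vcvtdq2pd.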
 7093 
 7094 instruct vcastBtoX(vec dst, vec src) %{
 7095   match(Set dst (VectorCastB2X src));
 7096   format %{ "vector_cast_b2x $dst,$src\t!" %}
 7097   ins_encode %{
 7098     assert(UseAVX > 0, "required");
 7099 
 7100     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7101     int vlen_enc = vector_length_encoding(this);
 7102     __ vconvert_b2x(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7103   %}
 7104   ins_pipe( pipe_slow );
 7105 %}
 7106 
 7107 instruct castStoX(vec dst, vec src) %{
 7108   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7109             Matcher::vector_length(n->in(1)) <= 8 && // src
 7110             Matcher::vector_element_basic_type(n) == T_BYTE);
 7111   match(Set dst (VectorCastS2X src));
 7112   format %{ "vector_cast_s2x $dst,$src" %}
 7113   ins_encode %{
 7114     assert(UseAVX > 0, "required");
 7115 
 7116     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, noreg);
 7117     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 7118   %}
 7119   ins_pipe( pipe_slow );
 7120 %}
 7121 
 7122 instruct vcastStoX(vec dst, vec src, vec vtmp) %{
 7123   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 7124             Matcher::vector_length(n->in(1)) == 16 && // src
 7125             Matcher::vector_element_basic_type(n) == T_BYTE);
 7126   effect(TEMP dst, TEMP vtmp);
 7127   match(Set dst (VectorCastS2X src));
 7128   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp as TEMP" %}
 7129   ins_encode %{
 7130     assert(UseAVX > 0, "required");
 7131 
 7132     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7133     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, noreg);
 7134     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7135     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7136   %}
 7137   ins_pipe( pipe_slow );
 7138 %}
 7139 
 7140 instruct vcastStoX_evex(vec dst, vec src) %{
 7141   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7142             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7143   match(Set dst (VectorCastS2X src));
 7144   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7145   ins_encode %{
 7146     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7147     int src_vlen_enc = vector_length_encoding(this, $src);
 7148     int vlen_enc = vector_length_encoding(this);
 7149     switch (to_elem_bt) {
 7150       case T_BYTE:
 7151         if (!VM_Version::supports_avx512vl()) {
 7152           vlen_enc = Assembler::AVX_512bit;
 7153         }
 7154         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7155         break;
 7156       case T_INT:
 7157         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7158         break;
 7159       case T_FLOAT:
 7160         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7161         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7162         break;
 7163       case T_LONG:
 7164         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7165         break;
 7166       case T_DOUBLE: {
 7167         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7168         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7169         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7170         break;
 7171       }
 7172       default:
 7173         ShouldNotReachHere();
 7174     }
 7175   %}
 7176   ins_pipe( pipe_slow );
 7177 %}
 7178 
 7179 instruct castItoX(vec dst, vec src) %{
 7180   predicate(UseAVX <= 2 &&
 7181             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7182             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7183   match(Set dst (VectorCastI2X src));
 7184   format %{ "vector_cast_i2x $dst,$src" %}
 7185   ins_encode %{
 7186     assert(UseAVX > 0, "required");
 7187 
 7188     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7189     int vlen_enc = vector_length_encoding(this, $src);
 7190 
 7191     if (to_elem_bt == T_BYTE) {
 7192       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7193       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7194       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7195     } else {
 7196       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7197       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7198       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7199     }
 7200   %}
 7201   ins_pipe( pipe_slow );
 7202 %}
 7203 
 7204 instruct vcastItoX(vec dst, vec src, vec vtmp) %{
 7205   predicate(UseAVX <= 2 &&
 7206             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7207             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7208   match(Set dst (VectorCastI2X src));
 7209   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp as TEMP" %}
 7210   effect(TEMP dst, TEMP vtmp);
 7211   ins_encode %{
 7212     assert(UseAVX > 0, "required");
 7213 
 7214     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7215     int vlen_enc = vector_length_encoding(this, $src);
 7216 
 7217     if (to_elem_bt == T_BYTE) {
 7218       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, noreg);
 7219       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7220       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7221       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7222     } else {
 7223       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7224       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, noreg);
 7225       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7226       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7227     }
 7228   %}
 7229   ins_pipe( pipe_slow );
 7230 %}
 7231 
 7232 instruct vcastItoX_evex(vec dst, vec src) %{
 7233   predicate(UseAVX > 2 ||
 7234             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7235   match(Set dst (VectorCastI2X src));
 7236   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7237   ins_encode %{
 7238     assert(UseAVX > 0, "required");
 7239 
 7240     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7241     int src_vlen_enc = vector_length_encoding(this, $src);
 7242     int dst_vlen_enc = vector_length_encoding(this);
 7243     switch (dst_elem_bt) {
 7244       case T_BYTE:
 7245         if (!VM_Version::supports_avx512vl()) {
 7246           src_vlen_enc = Assembler::AVX_512bit;
 7247         }
 7248         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7249         break;
 7250       case T_SHORT:
 7251         if (!VM_Version::supports_avx512vl()) {
 7252           src_vlen_enc = Assembler::AVX_512bit;
 7253         }
 7254         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7255         break;
 7256       case T_FLOAT:
 7257         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7258         break;
 7259       case T_LONG:
 7260         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7261         break;
 7262       case T_DOUBLE:
 7263         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7264         break;
 7265       default:
 7266         ShouldNotReachHere();
 7267     }
 7268   %}
 7269   ins_pipe( pipe_slow );
 7270 %}
 7271 
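// Without the AVX-512 truncating moves, long-to-byte/short first gathers the low
// dword of every long into the bottom of the register (vpshufd for 128-bit
// vectors, vpermilps + vpermpd for 256-bit ones) and then reuses the
// int-to-byte/short mask-and-pack sequence.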
 7272 instruct vcastLtoBS(vec dst, vec src) %{
 7273   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7274             UseAVX <= 2);
 7275   match(Set dst (VectorCastL2X src));
 7276   format %{ "vector_cast_l2x  $dst,$src" %}
 7277   ins_encode %{
 7278     assert(UseAVX > 0, "required");
 7279 
 7280     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7281     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7282     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7283                                                       : ExternalAddress(vector_int_to_short_mask());
 7284     if (vlen <= 16) {
 7285       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7286       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7287       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7288     } else {
 7289       assert(vlen <= 32, "required");
 7290       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7291       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7292       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, noreg);
 7293       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7294     }
 7295     if (to_elem_bt == T_BYTE) {
 7296       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7297     }
 7298   %}
 7299   ins_pipe( pipe_slow );
 7300 %}
 7301 
 7302 instruct vcastLtoX_evex(vec dst, vec src) %{
 7303   predicate(UseAVX > 2 ||
 7304             (Matcher::vector_element_basic_type(n) == T_INT ||
 7305              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7306              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7307   match(Set dst (VectorCastL2X src));
 7308   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7309   ins_encode %{
 7310     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7311     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7312     int vlen_enc = vector_length_encoding(this, $src);
 7313     switch (to_elem_bt) {
 7314       case T_BYTE:
 7315         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7316           vlen_enc = Assembler::AVX_512bit;
 7317         }
 7318         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7319         break;
 7320       case T_SHORT:
 7321         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7322           vlen_enc = Assembler::AVX_512bit;
 7323         }
 7324         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7325         break;
 7326       case T_INT:
 7327         if (vlen == 8) {
 7328           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7329             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7330           }
 7331         } else if (vlen == 16) {
 7332           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7333         } else if (vlen == 32) {
 7334           if (UseAVX > 2) {
 7335             if (!VM_Version::supports_avx512vl()) {
 7336               vlen_enc = Assembler::AVX_512bit;
 7337             }
 7338             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7339           } else {
 7340             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7341             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7342           }
 7343         } else { // vlen == 64
 7344           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7345         }
 7346         break;
 7347       case T_FLOAT:
 7348         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7349         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7350         break;
 7351       case T_DOUBLE:
 7352         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7353         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7354         break;
 7355 
 7356       default: assert(false, "%s", type2name(to_elem_bt));
 7357     }
 7358   %}
 7359   ins_pipe( pipe_slow );
 7360 %}
 7361 
 7362 instruct vcastFtoD_reg(vec dst, vec src) %{
 7363   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7364   match(Set dst (VectorCastF2X src));
 7365   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7366   ins_encode %{
 7367     int vlen_enc = vector_length_encoding(this);
 7368     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7369   %}
 7370   ins_pipe( pipe_slow );
 7371 %}
 7372 
 7373 
 7374 instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7375   // F2I conversion for vectors smaller than 64 bytes using AVX instructions.
 7376   // AVX-512 platforms that don't support avx512vl also use AVX instructions for F2I.
 7377   predicate(!VM_Version::supports_avx512vl() &&
 7378             Matcher::vector_length_in_bytes(n) < 64 &&
 7379             Matcher::vector_element_basic_type(n) == T_INT);
 7380   match(Set dst (VectorCastF2X src));
 7381   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7382   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7383   ins_encode %{
 7384     int vlen_enc = vector_length_encoding(this);
 7385     __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister,
 7386                           ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc,
 7387                           $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7388   %}
 7389   ins_pipe( pipe_slow );
 7390 %}
 7391 
 7392 instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7393   predicate((VM_Version::supports_avx512vl() ||
 7394              Matcher::vector_length_in_bytes(n) == 64) &&
 7395              Matcher::vector_element_basic_type(n) == T_INT);
 7396   match(Set dst (VectorCastF2X src));
 7397   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7398   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7399   ins_encode %{
 7400     int vlen_enc = vector_length_encoding(this);
 7401     __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister,
 7402                            ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc,
 7403                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7404   %}
 7405   ins_pipe( pipe_slow );
 7406 %}
 7407 
 7408 instruct castFtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7409   // F2X conversion to an integral type other than T_INT using AVX-512 instructions.
 7410   // Platforms that don't support avx512vl can only support 64-byte vectors.
 7411   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) &&
 7412             Matcher::vector_element_basic_type(n) != T_INT);
 7413   match(Set dst (VectorCastF2X src));
 7414   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7415   format %{ "vector_cast_f2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7416   ins_encode %{
 7417     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7418     if (to_elem_bt == T_LONG) {
 7419       int vlen_enc = vector_length_encoding(this);
 7420       __ vector_castF2L_evex($dst$$XMMRegister, $src$$XMMRegister,
 7421                              ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vlen_enc,
 7422                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7423     } else {
 7424       int vlen_enc = vector_length_encoding(this, $src);
 7425       __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister,
 7426                              ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vlen_enc,
 7427                              $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7428       if (to_elem_bt == T_SHORT) {
 7429         __ evpmovdw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7430       } else {
 7431         assert(to_elem_bt == T_BYTE, "required");
 7432         __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7433       }
 7434     }
 7435   %}
 7436   ins_pipe( pipe_slow );
 7437 %}
 7438 
 7439 instruct vcastDtoF_reg(vec dst, vec src) %{
 7440   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7441   match(Set dst (VectorCastD2X src));
 7442   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7443   ins_encode %{
 7444     int vlen_enc = vector_length_encoding(this, $src);
 7445     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7446   %}
 7447   ins_pipe( pipe_slow );
 7448 %}
 7449 
 7450 instruct castDtoX_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7451   predicate(is_integral_type(Matcher::vector_element_basic_type(n)));
 7452   match(Set dst (VectorCastD2X src));
 7453   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7454   format %{ "vector_cast_d2x $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7455   ins_encode %{
 7456     int vlen_enc = vector_length_encoding(this, $src);
 7457     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7458     __ vector_castD2X_evex(to_elem_bt, $dst$$XMMRegister, $src$$XMMRegister,
 7459                            ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vlen_enc,
 7460                            $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7461   %}
 7462   ins_pipe( pipe_slow );
 7463 %}
 7464 
 7465 instruct vucast(vec dst, vec src) %{
 7466   match(Set dst (VectorUCastB2X src));
 7467   match(Set dst (VectorUCastS2X src));
 7468   match(Set dst (VectorUCastI2X src));
 7469   format %{ "vector_ucast $dst,$src\t!" %}
 7470   ins_encode %{
 7471     assert(UseAVX > 0, "required");
 7472 
 7473     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7474     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7475     int vlen_enc = vector_length_encoding(this);
 7476     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7477   %}
 7478   ins_pipe( pipe_slow );
 7479 %}
 7480 
 7481 #ifdef _LP64
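// The RoundVF/RoundVD rules below implement vectorized Math.round. The constant
// 0x3F80 encodes an MXCSR value with all SIMD exceptions masked and the rounding
// mode set to round toward negative infinity, matching the floor(x + 0.5)
// semantics of Math.round used by the vector_round_* stubs.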
 7482 instruct vround_float_avx(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rFlagsReg cr) %{
 7483   predicate(!VM_Version::supports_avx512vl() &&
 7484             Matcher::vector_length_in_bytes(n) < 64 &&
 7485             Matcher::vector_element_basic_type(n) == T_INT);
 7486   match(Set dst (RoundVF src));
 7487   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, KILL cr);
 7488   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $xtmp3, $xtmp4 as TEMP" %}
 7489   ins_encode %{
 7490     int vlen_enc = vector_length_encoding(this);
 7491     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7492     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister,
 7493                               ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7494                               $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister);
 7495   %}
 7496   ins_pipe( pipe_slow );
 7497 %}
 7498 
 7499 instruct vround_float_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7500   predicate((VM_Version::supports_avx512vl() ||
 7501              Matcher::vector_length_in_bytes(n) == 64) &&
 7502              Matcher::vector_element_basic_type(n) == T_INT);
 7503   match(Set dst (RoundVF src));
 7504   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, KILL cr);
 7505   format %{ "vector_round_float $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7506   ins_encode %{
 7507     int vlen_enc = vector_length_encoding(this);
 7508     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7509     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister,
 7510                                ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), new_mxcsr, vlen_enc,
 7511                                $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7512   %}
 7513   ins_pipe( pipe_slow );
 7514 %}
 7515 
 7516 instruct vround_reg_evex(vec dst, vec src, rRegP tmp, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 7517   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7518   match(Set dst (RoundVD src));
 7519   effect(TEMP dst, TEMP tmp, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2,  KILL cr);
 7520   format %{ "vector_round_long $dst,$src\t! using $tmp, $xtmp1, $xtmp2, $ktmp1, $ktmp2 as TEMP" %}
 7521   ins_encode %{
 7522     int vlen_enc = vector_length_encoding(this);
 7523     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7524     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister,
 7525                                 ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), new_mxcsr, vlen_enc,
 7526                                 $tmp$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister);
 7527   %}
 7528   ins_pipe( pipe_slow );
 7529 %}
 7530 
 7531 #endif // _LP64
 7532 
 7533 // --------------------------------- VectorMaskCmp --------------------------------------
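// Pre-AVX-512 comparisons leave their result as a vector of all-ones/all-zeroes
// lanes in an XMM/YMM destination. The EVEX forms produce an opmask register
// (kReg) instead; when a vector result is still needed (evcmpFD64) the opmask is
// expanded back into a vector via a masked load of vector_all_bits_set.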
 7534 
 7535 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7536   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7537             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7538             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7539             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7540   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7541   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7542   ins_encode %{
 7543     int vlen_enc = vector_length_encoding(this, $src1);
 7544     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7545     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7546       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7547     } else {
 7548       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7549     }
 7550   %}
 7551   ins_pipe( pipe_slow );
 7552 %}
 7553 
 7554 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7555   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7556             n->bottom_type()->isa_vectmask() == NULL &&
 7557             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7558   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7559   effect(TEMP ktmp);
 7560   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7561   ins_encode %{
 7562     int vlen_enc = Assembler::AVX_512bit;
 7563     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7564     KRegister mask = k0; // The comparison itself is not being masked.
 7565     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7566       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7567       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7568     } else {
 7569       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7570       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, noreg);
 7571     }
 7572   %}
 7573   ins_pipe( pipe_slow );
 7574 %}
 7575 
 7576 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7577   predicate(n->bottom_type()->isa_vectmask() &&
 7578             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7579   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7580   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7581   ins_encode %{
 7582     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7583     int vlen_enc = vector_length_encoding(this, $src1);
 7584     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7585     KRegister mask = k0; // The comparison itself is not being masked.
 7586     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7587       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7588     } else {
 7589       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7590     }
 7591   %}
 7592   ins_pipe( pipe_slow );
 7593 %}
 7594 
 7595 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7596   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7597             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7598             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7599             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7600             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7601             (n->in(2)->get_int() == BoolTest::eq ||
 7602              n->in(2)->get_int() == BoolTest::lt ||
 7603              n->in(2)->get_int() == BoolTest::gt)); // cond
 7604   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7605   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7606   ins_encode %{
 7607     int vlen_enc = vector_length_encoding(this, $src1);
 7608     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7609     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7610     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7611   %}
 7612   ins_pipe( pipe_slow );
 7613 %}
 7614 
 7615 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7616   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7617             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7618             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7619             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7620             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7621             (n->in(2)->get_int() == BoolTest::ne ||
 7622              n->in(2)->get_int() == BoolTest::le ||
 7623              n->in(2)->get_int() == BoolTest::ge)); // cond
 7624   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7625   effect(TEMP dst, TEMP xtmp);
 7626   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7627   ins_encode %{
 7628     int vlen_enc = vector_length_encoding(this, $src1);
 7629     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7630     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
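    // ne/le/ge are not directly encodable as SSE/AVX integer compares; the macro
    // emits the complementary compare (eq/gt/lt) and inverts the result, which is
    // why $xtmp is needed as scratch here but not in vcmp_direct.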
 7631     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7632   %}
 7633   ins_pipe( pipe_slow );
 7634 %}
 7635 
 7636 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7637   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7638             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7639             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7640             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7641             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7642   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7643   effect(TEMP dst, TEMP xtmp);
 7644   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7645   ins_encode %{
 7646     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7647     int vlen_enc = vector_length_encoding(this, $src1);
 7648     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7649     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7650 
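    // There is no unsigned integer compare before AVX-512: flip the sign bit of
    // both operands (xor with the broadcast high-bit constant) so that a signed
    // compare of the biased values yields the unsigned result.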
 7651     if (vlen_enc == Assembler::AVX_128bit) {
 7652       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7653     } else {
 7654       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7655     }
 7656     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7657     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7658     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7659   %}
 7660   ins_pipe( pipe_slow );
 7661 %}
 7662 
 7663 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, kReg ktmp) %{
 7664   predicate((n->bottom_type()->isa_vectmask() == NULL &&
 7665              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7666              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7667   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7668   effect(TEMP ktmp);
 7669   format %{ "vector_compare $dst,$src1,$src2,$cond" %}
 7670   ins_encode %{
 7671     assert(UseAVX > 2, "required");
 7672 
 7673     int vlen_enc = vector_length_encoding(this, $src1);
 7674     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7675     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7676     KRegister mask = k0; // The comparison itself is not being masked.
 7677     bool merge = false;
 7678     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7679 
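    // The trailing bool of evpcmpd/evpcmpq selects a signed compare, hence
    // !is_unsigned; the resulting kmask is then expanded into an all-ones/all-zeros
    // vector from the all-bits-set constant.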
 7680     switch (src1_elem_bt) {
 7681       case T_INT: {
 7682         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7683         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7684         break;
 7685       }
 7686       case T_LONG: {
 7687         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7688         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, noreg);
 7689         break;
 7690       }
 7691       default: assert(false, "%s", type2name(src1_elem_bt));
 7692     }
 7693   %}
 7694   ins_pipe( pipe_slow );
 7695 %}
 7696 
 7697 
 7698 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7699   predicate(n->bottom_type()->isa_vectmask() &&
 7700             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7701   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7703   ins_encode %{
 7704     assert(UseAVX > 2, "required");
 7705     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7706 
 7707     int vlen_enc = vector_length_encoding(this, $src1);
 7708     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7709     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7710     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7711 
    // Emit the EVEX compare for src1's element type; the result is written
    // directly into the destination mask register.
 7713     switch (src1_elem_bt) {
 7714       case T_BYTE: {
 7715         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7716         break;
 7717       }
 7718       case T_SHORT: {
 7719         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7720         break;
 7721       }
 7722       case T_INT: {
 7723         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7724         break;
 7725       }
 7726       case T_LONG: {
 7727         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7728         break;
 7729       }
 7730       default: assert(false, "%s", type2name(src1_elem_bt));
 7731     }
 7732   %}
 7733   ins_pipe( pipe_slow );
 7734 %}
 7735 
 7736 // Extract
 7737 
 7738 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7739   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7740   match(Set dst (ExtractI src idx));
 7741   match(Set dst (ExtractS src idx));
 7742 #ifdef _LP64
 7743   match(Set dst (ExtractB src idx));
 7744 #endif
 7745   format %{ "extractI $dst,$src,$idx\t!" %}
 7746   ins_encode %{
 7747     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7748 
 7749     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7750     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7751   %}
 7752   ins_pipe( pipe_slow );
 7753 %}
 7754 
 7755 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7756   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7757             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7758   match(Set dst (ExtractI src idx));
 7759   match(Set dst (ExtractS src idx));
 7760 #ifdef _LP64
 7761   match(Set dst (ExtractB src idx));
 7762 #endif
 7763   effect(TEMP vtmp);
 7764   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7765   ins_encode %{
 7766     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7767 
 7768     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7769     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7770     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7771   %}
 7772   ins_pipe( pipe_slow );
 7773 %}
 7774 
 7775 #ifdef _LP64
 7776 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7777   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7778   match(Set dst (ExtractL src idx));
 7779   format %{ "extractL $dst,$src,$idx\t!" %}
 7780   ins_encode %{
 7781     assert(UseSSE >= 4, "required");
 7782     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7783 
 7784     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7785   %}
 7786   ins_pipe( pipe_slow );
 7787 %}
 7788 
 7789 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7790   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7791             Matcher::vector_length(n->in(1)) == 8);  // src
 7792   match(Set dst (ExtractL src idx));
 7793   effect(TEMP vtmp);
 7794   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7795   ins_encode %{
 7796     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7797 
 7798     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7799     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7800   %}
 7801   ins_pipe( pipe_slow );
 7802 %}
 7803 #endif
 7804 
 7805 instruct extractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7806   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7807   match(Set dst (ExtractF src idx));
 7808   effect(TEMP dst, TEMP vtmp);
 7809   format %{ "extractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7810   ins_encode %{
 7811     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7812 
 7813     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $vtmp$$XMMRegister);
 7814   %}
 7815   ins_pipe( pipe_slow );
 7816 %}
 7817 
 7818 instruct vextractF(legRegF dst, legVec src, immU8 idx, legVec vtmp) %{
 7819   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7820             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7821   match(Set dst (ExtractF src idx));
 7822   effect(TEMP vtmp);
 7823   format %{ "vextractF $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7824   ins_encode %{
 7825     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7826 
 7827     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7828     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7829   %}
 7830   ins_pipe( pipe_slow );
 7831 %}
 7832 
 7833 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7834   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7835   match(Set dst (ExtractD src idx));
 7836   format %{ "extractD $dst,$src,$idx\t!" %}
 7837   ins_encode %{
 7838     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7839 
 7840     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7841   %}
 7842   ins_pipe( pipe_slow );
 7843 %}
 7844 
 7845 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7846   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7847             Matcher::vector_length(n->in(1)) == 8);  // src
 7848   match(Set dst (ExtractD src idx));
 7849   effect(TEMP vtmp);
 7850   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7851   ins_encode %{
 7852     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7853 
 7854     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7855     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7856   %}
 7857   ins_pipe( pipe_slow );
 7858 %}
 7859 
 7860 // --------------------------------- Vector Blend --------------------------------------
 7861 
 7862 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7863   predicate(UseAVX == 0);
 7864   match(Set dst (VectorBlend (Binary dst src) mask));
 7865   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7866   effect(TEMP tmp);
 7867   ins_encode %{
 7868     assert(UseSSE >= 4, "required");
 7869 
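    // SSE4.1 pblendvb takes its mask implicitly in xmm0; the rxmm0 operand class
    // pins $tmp to xmm0, so the mask only needs to be copied when it is not
    // already there.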
 7870     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7871       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7872     }
 7873     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7874   %}
 7875   ins_pipe( pipe_slow );
 7876 %}
 7877 
 7878 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7879   predicate(UseAVX > 0 &&
 7880             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7881             Matcher::vector_length_in_bytes(n) <= 32 &&
 7882             is_integral_type(Matcher::vector_element_basic_type(n)));
 7883   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7884   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7885   ins_encode %{
 7886     int vlen_enc = vector_length_encoding(this);
 7887     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7888   %}
 7889   ins_pipe( pipe_slow );
 7890 %}
 7891 
 7892 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7893   predicate(UseAVX > 0 &&
 7894             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7895             Matcher::vector_length_in_bytes(n) <= 32 &&
 7896             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7897   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7898   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7899   ins_encode %{
 7900     int vlen_enc = vector_length_encoding(this);
 7901     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7902   %}
 7903   ins_pipe( pipe_slow );
 7904 %}
 7905 
 7906 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, kReg ktmp) %{
 7907   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7908             n->in(2)->bottom_type()->isa_vectmask() == NULL);
 7909   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $ktmp as TEMP" %}
 7911   effect(TEMP ktmp);
 7912   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
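    // Convert the vector mask into a kmask by comparing it lane-wise against the
    // all-bits-set constant, then blend $src1/$src2 under that kmask.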
 7915     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, noreg);
 7916     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7917   %}
 7918   ins_pipe( pipe_slow );
 7919 %}
 7920 
 7921 
 7922 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask) %{
 7923   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7924             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7925              VM_Version::supports_avx512bw()));
 7926   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7928   ins_encode %{
 7929     int vlen_enc = vector_length_encoding(this);
 7930     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7931     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7932   %}
 7933   ins_pipe( pipe_slow );
 7934 %}
 7935 
 7936 // --------------------------------- ABS --------------------------------------
 7937 // a = |a|
 7938 instruct vabsB_reg(vec dst, vec src) %{
 7939   match(Set dst (AbsVB  src));
 7940   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7941   ins_encode %{
 7942     uint vlen = Matcher::vector_length(this);
 7943     if (vlen <= 16) {
 7944       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7945     } else {
 7946       int vlen_enc = vector_length_encoding(this);
 7947       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7948     }
 7949   %}
 7950   ins_pipe( pipe_slow );
 7951 %}
 7952 
 7953 instruct vabsS_reg(vec dst, vec src) %{
 7954   match(Set dst (AbsVS  src));
 7955   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7956   ins_encode %{
 7957     uint vlen = Matcher::vector_length(this);
 7958     if (vlen <= 8) {
 7959       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7960     } else {
 7961       int vlen_enc = vector_length_encoding(this);
 7962       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7963     }
 7964   %}
 7965   ins_pipe( pipe_slow );
 7966 %}
 7967 
 7968 instruct vabsI_reg(vec dst, vec src) %{
 7969   match(Set dst (AbsVI  src));
 7970   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7971   ins_encode %{
 7972     uint vlen = Matcher::vector_length(this);
 7973     if (vlen <= 4) {
 7974       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7975     } else {
 7976       int vlen_enc = vector_length_encoding(this);
 7977       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7978     }
 7979   %}
 7980   ins_pipe( pipe_slow );
 7981 %}
 7982 
 7983 instruct vabsL_reg(vec dst, vec src) %{
 7984   match(Set dst (AbsVL  src));
 7985   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7986   ins_encode %{
 7987     assert(UseAVX > 2, "required");
 7988     int vlen_enc = vector_length_encoding(this);
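    // Without AVX512VL the EVEX encoding is only available at 512-bit width,
    // so widen the encoding in that case.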
 7989     if (!VM_Version::supports_avx512vl()) {
 7990       vlen_enc = Assembler::AVX_512bit;
 7991     }
 7992     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7993   %}
 7994   ins_pipe( pipe_slow );
 7995 %}
 7996 
 7997 // --------------------------------- ABSNEG --------------------------------------
 7998 
 7999 instruct vabsnegF(vec dst, vec src) %{
 8000   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 8001   match(Set dst (AbsVF src));
 8002   match(Set dst (NegVF src));
 8003   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 8004   ins_cost(150);
 8005   ins_encode %{
 8006     int opcode = this->ideal_Opcode();
 8007     int vlen = Matcher::vector_length(this);
 8008     if (vlen == 2) {
 8009       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8010     } else {
 8011       assert(vlen == 8 || vlen == 16, "required");
 8012       int vlen_enc = vector_length_encoding(this);
 8013       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8014     }
 8015   %}
 8016   ins_pipe( pipe_slow );
 8017 %}
 8018 
 8019 instruct vabsneg4F(vec dst) %{
 8020   predicate(Matcher::vector_length(n) == 4);
 8021   match(Set dst (AbsVF dst));
 8022   match(Set dst (NegVF dst));
 8023   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 8024   ins_cost(150);
 8025   ins_encode %{
 8026     int opcode = this->ideal_Opcode();
 8027     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister);
 8028   %}
 8029   ins_pipe( pipe_slow );
 8030 %}
 8031 
 8032 instruct vabsnegD(vec dst, vec src) %{
 8033   match(Set dst (AbsVD  src));
 8034   match(Set dst (NegVD  src));
 8035   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 8036   ins_encode %{
 8037     int opcode = this->ideal_Opcode();
 8038     uint vlen = Matcher::vector_length(this);
 8039     if (vlen == 2) {
 8040       assert(UseSSE >= 2, "required");
 8041       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister);
 8042     } else {
 8043       int vlen_enc = vector_length_encoding(this);
 8044       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8045     }
 8046   %}
 8047   ins_pipe( pipe_slow );
 8048 %}
 8049 
 8050 //------------------------------------- VectorTest --------------------------------------------
 8051 
 8052 #ifdef _LP64
 8053 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
 8054   predicate(!VM_Version::supports_avx512bwdq() &&
 8055             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
 8056             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
 8057             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8058   match(Set dst (VectorTest src1 src2 ));
 8059   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
 8060   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
 8061   ins_encode %{
 8062     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8063     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
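    // For the all-true test the macro leaves its answer in the carry flag;
    // setb/movzbl turn that flag into a 0/1 integer result in $dst.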
 8064     __ setb(Assembler::carrySet, $dst$$Register);
 8065     __ movzbl($dst$$Register, $dst$$Register);
 8066   %}
 8067   ins_pipe( pipe_slow );
 8068 %}
 8069 
 8070 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
 8071   predicate(!VM_Version::supports_avx512bwdq() &&
 8072             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
 8073             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
 8074             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
 8075   match(Set dst (VectorTest src1 src2 ));
 8076   effect(KILL cr);
 8077   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
 8078   ins_encode %{
 8079     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8080     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 8081     __ setb(Assembler::carrySet, $dst$$Register);
 8082     __ movzbl($dst$$Register, $dst$$Register);
 8083   %}
 8084   ins_pipe( pipe_slow );
 8085 %}
 8086 
 8087 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
 8088   predicate(VM_Version::supports_avx512bwdq() &&
 8089             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
 8090             n->in(1)->bottom_type()->isa_vectmask() &&
 8091             Matcher::vector_length(n->in(1)) < 8);
 8092   match(Set dst (VectorTest src1 src2));
 8093   effect(KILL cr, TEMP kscratch);
  format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr and $kscratch as TEMP" %}
 8095   ins_encode %{
 8096     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8097     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8098     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
 8099     uint masklen = Matcher::vector_length(this, $src1);
 8100     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
 8101   %}
 8102   ins_pipe( pipe_slow );
 8103 %}
 8104 
 8105 
 8106 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
 8107   predicate(VM_Version::supports_avx512bwdq() &&
 8108             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
 8109             n->in(1)->bottom_type()->isa_vectmask() &&
 8110             Matcher::vector_length(n->in(1)) >= 8);
 8111   match(Set dst (VectorTest src1 src2));
 8112   effect(KILL cr);
 8113   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
 8114   ins_encode %{
 8115     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8116     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8117     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
 8118     uint masklen = Matcher::vector_length(this, $src1);
 8119     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
 8120   %}
 8121   ins_pipe( pipe_slow );
 8122 %}
 8123 
 8124 
 8125 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
 8126   predicate(!VM_Version::supports_avx512bwdq() &&
 8127             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
 8128             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
 8129             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8130   match(Set dst (VectorTest src1 src2 ));
 8131   effect(TEMP vtmp, KILL cr);
 8132   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
 8133   ins_encode %{
 8134     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8135     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 8136     __ setb(Assembler::notZero, $dst$$Register);
 8137     __ movzbl($dst$$Register, $dst$$Register);
 8138   %}
 8139   ins_pipe( pipe_slow );
 8140 %}
 8141 
 8142 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
 8143   predicate(!VM_Version::supports_avx512bwdq() &&
 8144             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
 8145             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
 8146             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8147   match(Set dst (VectorTest src1 src2 ));
 8148   effect(KILL cr);
 8149   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
 8150   ins_encode %{
 8151     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8152     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 8153     __ setb(Assembler::notZero, $dst$$Register);
 8154     __ movzbl($dst$$Register, $dst$$Register);
 8155   %}
 8156   ins_pipe( pipe_slow );
 8157 %}
 8158 
 8159 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
 8160   predicate(VM_Version::supports_avx512bwdq() &&
 8161             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8162   match(Set dst (VectorTest src1 src2));
 8163   effect(KILL cr);
  format %{ "vptest_anytrue_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
 8165   ins_encode %{
 8166     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8167     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8168     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
 8169     uint  masklen = Matcher::vector_length(this, $src1);
 8170     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
 8171   %}
 8172   ins_pipe( pipe_slow );
 8173 %}
 8174 
 8175 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
 8176   predicate(!VM_Version::supports_avx512bwdq() &&
 8177             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
 8178             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
 8179             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8180   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8181   effect(TEMP vtmp);
 8182   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
 8183   ins_encode %{
 8184     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8185     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 8186   %}
 8187   ins_pipe( pipe_slow );
 8188 %}
 8189 
 8190 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
 8191   predicate(!VM_Version::supports_avx512bwdq() &&
 8192             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
 8193             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
 8194             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8195   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8196   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
 8197   ins_encode %{
 8198     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8199     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 8200   %}
 8201   ins_pipe( pipe_slow );
 8202 %}
 8203 
 8204 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
 8205   predicate(VM_Version::supports_avx512bwdq() &&
 8206             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8207   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8208   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
 8209   ins_encode %{
 8210     uint masklen = Matcher::vector_length(this, $src1);
 8211     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8212     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8213     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
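    // The mask test works on whole mask registers of at least 8 bits,
    // so shorter masks are rounded up before the ktest.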
 8214     masklen = masklen < 8 ? 8 : masklen;
 8215     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
 8216   %}
 8217   ins_pipe( pipe_slow );
 8218 %}
 8219 #endif
 8220 
 8221 //------------------------------------- LoadMask --------------------------------------------
 8222 
 8223 instruct loadMask(legVec dst, legVec src) %{
 8224   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
 8225   match(Set dst (VectorLoadMask src));
 8226   effect(TEMP dst);
 8227   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8228   ins_encode %{
 8229     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8230     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8231     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8232   %}
 8233   ins_pipe( pipe_slow );
 8234 %}
 8235 
 8236 instruct loadMask64(kReg dst, vec src, vec xtmp) %{
 8237   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8238   match(Set dst (VectorLoadMask src));
 8239   effect(TEMP xtmp);
 8240   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp as TEMP" %}
 8241   ins_encode %{
 8242     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8243                         true, Assembler::AVX_512bit);
 8244   %}
 8245   ins_pipe( pipe_slow );
 8246 %}
 8247 
 8248 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8249   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8250   match(Set dst (VectorLoadMask src));
 8251   effect(TEMP xtmp);
 8252   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8253   ins_encode %{
 8254     int vlen_enc = vector_length_encoding(in(1));
 8255     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8256                         false, vlen_enc);
 8257   %}
 8258   ins_pipe( pipe_slow );
 8259 %}
 8260 
 8261 //------------------------------------- StoreMask --------------------------------------------
 8262 
 8263 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8264   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8265   match(Set dst (VectorStoreMask src size));
 8266   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8267   ins_encode %{
 8268     int vlen = Matcher::vector_length(this);
 8269     if (vlen <= 16 && UseAVX <= 2) {
 8270       assert(UseSSE >= 3, "required");
 8271       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8272     } else {
 8273       assert(UseAVX > 0, "required");
 8274       int src_vlen_enc = vector_length_encoding(this, $src);
 8275       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8276     }
 8277   %}
 8278   ins_pipe( pipe_slow );
 8279 %}
 8280 
 8281 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8282   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8283   match(Set dst (VectorStoreMask src size));
 8284   effect(TEMP_DEF dst, TEMP xtmp);
 8285   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8286   ins_encode %{
 8287     int vlen_enc = Assembler::AVX_128bit;
 8288     int vlen = Matcher::vector_length(this);
 8289     if (vlen <= 8) {
 8290       assert(UseSSE >= 3, "required");
 8291       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8292       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8293       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8294     } else {
 8295       assert(UseAVX > 0, "required");
 8296       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8297       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8298       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8299     }
 8300   %}
 8301   ins_pipe( pipe_slow );
 8302 %}
 8303 
 8304 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8305   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8306   match(Set dst (VectorStoreMask src size));
 8307   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8308   effect(TEMP_DEF dst, TEMP xtmp);
 8309   ins_encode %{
 8310     int vlen_enc = Assembler::AVX_128bit;
 8311     int vlen = Matcher::vector_length(this);
 8312     if (vlen <= 4) {
 8313       assert(UseSSE >= 3, "required");
 8314       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8315       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8316       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8317       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8318     } else {
 8319       assert(UseAVX > 0, "required");
 8320       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8321       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8322       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8323       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8324       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8325     }
 8326   %}
 8327   ins_pipe( pipe_slow );
 8328 %}
 8329 
 8330 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8331   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8332   match(Set dst (VectorStoreMask src size));
 8333   effect(TEMP_DEF dst, TEMP xtmp);
 8334   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8335   ins_encode %{
 8336     assert(UseSSE >= 3, "required");
 8337     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8338     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8339     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8340     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8341     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8342   %}
 8343   ins_pipe( pipe_slow );
 8344 %}
 8345 
 8346 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8347   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8348   match(Set dst (VectorStoreMask src size));
 8349   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8350   effect(TEMP_DEF dst, TEMP vtmp);
 8351   ins_encode %{
 8352     int vlen_enc = Assembler::AVX_128bit;
 8353     __ vshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8354     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8355     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8356     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8357     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8358     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8359     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8360   %}
 8361   ins_pipe( pipe_slow );
 8362 %}
 8363 
 8364 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8365   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8366   match(Set dst (VectorStoreMask src size));
 8367   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8368   ins_encode %{
 8369     int src_vlen_enc = vector_length_encoding(this, $src);
 8370     int dst_vlen_enc = vector_length_encoding(this);
 8371     if (!VM_Version::supports_avx512vl()) {
 8372       src_vlen_enc = Assembler::AVX_512bit;
 8373     }
 8374     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8375     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8376   %}
 8377   ins_pipe( pipe_slow );
 8378 %}
 8379 
 8380 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8381   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8382   match(Set dst (VectorStoreMask src size));
 8383   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8384   ins_encode %{
 8385     int src_vlen_enc = vector_length_encoding(this, $src);
 8386     int dst_vlen_enc = vector_length_encoding(this);
 8387     if (!VM_Version::supports_avx512vl()) {
 8388       src_vlen_enc = Assembler::AVX_512bit;
 8389     }
 8390     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8391     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8392   %}
 8393   ins_pipe( pipe_slow );
 8394 %}
 8395 
 8396 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size) %{
 8397   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8398   match(Set dst (VectorStoreMask mask size));
 8399   effect(TEMP_DEF dst);
 8400   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8401   ins_encode %{
 8402     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8403     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8404                  false, Assembler::AVX_512bit, noreg);
 8405     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8406   %}
 8407   ins_pipe( pipe_slow );
 8408 %}
 8409 
 8410 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8411   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8412   match(Set dst (VectorStoreMask mask size));
 8413   effect(TEMP_DEF dst);
 8414   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8415   ins_encode %{
 8416     int dst_vlen_enc = vector_length_encoding(this);
 8417     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8418     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8419   %}
 8420   ins_pipe( pipe_slow );
 8421 %}
 8422 
 8423 instruct vmaskcast_evex(kReg dst) %{
 8424   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
 8425   match(Set dst (VectorMaskCast dst));
 8426   ins_cost(0);
 8427   format %{ "vector_mask_cast $dst" %}
 8428   ins_encode %{
 8429     // empty
 8430   %}
 8431   ins_pipe(empty);
 8432 %}
 8433 
 8434 instruct vmaskcast(vec dst) %{
 8435   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
 8436             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
 8437   match(Set dst (VectorMaskCast dst));
 8438   ins_cost(0);
 8439   format %{ "vector_mask_cast $dst" %}
 8440   ins_encode %{
 8441     // empty
 8442   %}
 8443   ins_pipe(empty);
 8444 %}
 8445 
 8446 //-------------------------------- Load Iota Indices ----------------------------------
 8447 
 8448 instruct loadIotaIndices(vec dst, immI_0 src) %{
 8449   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8450   match(Set dst (VectorLoadConst src));
 8451   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8452   ins_encode %{
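     // Load the constant byte sequence {0, 1, 2, ...} of the requested length.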
 8453      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8454      __ load_iota_indices($dst$$XMMRegister, vlen_in_bytes);
 8455   %}
 8456   ins_pipe( pipe_slow );
 8457 %}
 8458 
 8459 #ifdef _LP64
 8460 instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
 8461   match(Set dst (PopulateIndex src1 src2));
 8462   effect(TEMP dst, TEMP vtmp);
 8463   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8464   ins_encode %{
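     // dst[i] = src1 + i: broadcast the scalar start value, load the byte iota
     // table, widen it to the element type if needed, then add.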
 8465      assert($src2$$constant == 1, "required");
 8466      int vlen = Matcher::vector_length(this);
 8467      int vlen_enc = vector_length_encoding(this);
 8468      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8469      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8470      __ load_iota_indices($dst$$XMMRegister, vlen);
 8471      if (elem_bt != T_BYTE) {
 8472        __ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8473      }
 8474      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8475   %}
 8476   ins_pipe( pipe_slow );
 8477 %}
 8478 
 8479 instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
 8480   match(Set dst (PopulateIndex src1 src2));
 8481   effect(TEMP dst, TEMP vtmp);
 8482   format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
 8483   ins_encode %{
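     // Same scheme as VectorPopulateIndex above, with the start value in a long register.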
 8484      assert($src2$$constant == 1, "required");
 8485      int vlen = Matcher::vector_length(this);
 8486      int vlen_enc = vector_length_encoding(this);
 8487      BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8488      __ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
 8489      __ load_iota_indices($dst$$XMMRegister, vlen);
 8490      if (elem_bt != T_BYTE) {
 8491        __ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8492      }
 8493      __ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8494   %}
 8495   ins_pipe( pipe_slow );
 8496 %}
 8497 #endif
 8498 //-------------------------------- Rearrange ----------------------------------
 8499 
 8500 // LoadShuffle/Rearrange for Byte
 8501 
 8502 instruct loadShuffleB(vec dst) %{
 8503   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8504   match(Set dst (VectorLoadShuffle dst));
 8505   format %{ "vector_load_shuffle $dst, $dst" %}
 8506   ins_encode %{
 8507     // empty
 8508   %}
 8509   ins_pipe( pipe_slow );
 8510 %}
 8511 
 8512 instruct rearrangeB(vec dst, vec shuffle) %{
 8513   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8514             Matcher::vector_length(n) < 32);
 8515   match(Set dst (VectorRearrange dst shuffle));
 8516   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8517   ins_encode %{
 8518     assert(UseSSE >= 4, "required");
 8519     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8520   %}
 8521   ins_pipe( pipe_slow );
 8522 %}
 8523 
 8524 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8525   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8526             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8527   match(Set dst (VectorRearrange src shuffle));
 8528   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8529   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8530   ins_encode %{
 8531     assert(UseAVX >= 2, "required");
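    // vpshufb only shuffles within each 128-bit lane, so a lane-crossing rearrange
    // is assembled from two in-lane shuffles (original and lane-swapped src) that
    // are blended according to which lane each index selects.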
 8532     // Swap src into vtmp1
 8533     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8534     // Shuffle swapped src to get entries from other 128 bit lane
 8535     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8536     // Shuffle original src to get entries from self 128 bit lane
 8537     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8538     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8539     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8540     // Perform the blend
 8541     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8542   %}
 8543   ins_pipe( pipe_slow );
 8544 %}
 8545 
 8546 
 8547 instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
 8548   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8549             Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
 8550   match(Set dst (VectorRearrange src shuffle));
 8551   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
 8553   ins_encode %{
 8554     int vlen_enc = vector_length_encoding(this);
 8555     __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
 8556                        $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
 8557                        $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
 8558   %}
 8559   ins_pipe( pipe_slow );
 8560 %}
 8561 
 8562 instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
 8563   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8564             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8565   match(Set dst (VectorRearrange src shuffle));
 8566   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8567   ins_encode %{
 8568     int vlen_enc = vector_length_encoding(this);
 8569     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8570   %}
 8571   ins_pipe( pipe_slow );
 8572 %}
 8573 
 8574 // LoadShuffle/Rearrange for Short
 8575 
 8576 instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
 8577   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8578             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8579   match(Set dst (VectorLoadShuffle src));
 8580   effect(TEMP dst, TEMP vtmp);
 8581   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8582   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
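    // e.g. a short-lane index of 3 becomes the byte-index pair (6, 7), selecting
    // both bytes of lane 3.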
 8585     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8586     if (UseAVX == 0) {
 8587       assert(vlen_in_bytes <= 16, "required");
 8588       // Multiply each shuffle by two to get byte index
 8589       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8590       __ psllw($vtmp$$XMMRegister, 1);
 8591 
 8592       // Duplicate to create 2 copies of byte index
 8593       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8594       __ psllw($dst$$XMMRegister, 8);
 8595       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8596 
 8597       // Add one to get alternate byte index
 8598       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), noreg);
 8599       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8600     } else {
 8601       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8602       int vlen_enc = vector_length_encoding(this);
 8603       // Multiply each shuffle by two to get byte index
 8604       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8605       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8606 
 8607       // Duplicate to create 2 copies of byte index
 8608       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8609       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8610 
 8611       // Add one to get alternate byte index
 8612       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, noreg);
 8613     }
 8614   %}
 8615   ins_pipe( pipe_slow );
 8616 %}
 8617 
 8618 instruct rearrangeS(vec dst, vec shuffle) %{
 8619   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8620             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8621   match(Set dst (VectorRearrange dst shuffle));
 8622   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8623   ins_encode %{
 8624     assert(UseSSE >= 4, "required");
 8625     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8626   %}
 8627   ins_pipe( pipe_slow );
 8628 %}
 8629 
 8630 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2) %{
 8631   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8632             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8633   match(Set dst (VectorRearrange src shuffle));
 8634   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 8635   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2 as TEMP" %}
 8636   ins_encode %{
 8637     assert(UseAVX >= 2, "required");
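    // Same lane-crossing technique as rearrangeB_avx above, applied to the
    // byte-pair indices produced by loadShuffleS.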
 8638     // Swap src into vtmp1
 8639     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8640     // Shuffle swapped src to get entries from other 128 bit lane
 8641     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8642     // Shuffle original src to get entries from self 128 bit lane
 8643     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8644     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8645     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, noreg);
 8646     // Perform the blend
 8647     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8648   %}
 8649   ins_pipe( pipe_slow );
 8650 %}
 8651 
 8652 instruct loadShuffleS_evex(vec dst, vec src) %{
 8653   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8654             VM_Version::supports_avx512bw());
 8655   match(Set dst (VectorLoadShuffle src));
 8656   format %{ "vector_load_shuffle $dst, $src" %}
 8657   ins_encode %{
 8658     int vlen_enc = vector_length_encoding(this);
 8659     if (!VM_Version::supports_avx512vl()) {
 8660       vlen_enc = Assembler::AVX_512bit;
 8661     }
 8662     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8663   %}
 8664   ins_pipe( pipe_slow );
 8665 %}
 8666 
 8667 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8668   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8669             VM_Version::supports_avx512bw());
 8670   match(Set dst (VectorRearrange src shuffle));
 8671   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8672   ins_encode %{
 8673     int vlen_enc = vector_length_encoding(this);
 8674     if (!VM_Version::supports_avx512vl()) {
 8675       vlen_enc = Assembler::AVX_512bit;
 8676     }
 8677     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8678   %}
 8679   ins_pipe( pipe_slow );
 8680 %}
 8681 
 8682 // LoadShuffle/Rearrange for Integer and Float
 8683 
 8684 instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
 8685   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8686             Matcher::vector_length(n) == 4 && UseAVX < 2);
 8687   match(Set dst (VectorLoadShuffle src));
 8688   effect(TEMP dst, TEMP vtmp);
 8689   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8690   ins_encode %{
 8691     assert(UseSSE >= 4, "required");
 8692 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
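    // e.g. an int-lane index of 2 becomes the byte indices (8, 9, 10, 11).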
 8695 
 8696     // Duplicate and multiply each shuffle by 4
 8697     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8698     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8699     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8700     __ psllw($vtmp$$XMMRegister, 2);
 8701 
 8702     // Duplicate again to create 4 copies of byte index
 8703     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8704     __ psllw($dst$$XMMRegister, 8);
 8705     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8706 
 8707     // Add 3,2,1,0 to get alternate byte index
 8708     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), noreg);
 8709     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8710   %}
 8711   ins_pipe( pipe_slow );
 8712 %}
 8713 
 8714 instruct rearrangeI(vec dst, vec shuffle) %{
 8715  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8716            Matcher::vector_length(n) == 4 && UseAVX < 2);
 8717   match(Set dst (VectorRearrange dst shuffle));
 8718   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8719   ins_encode %{
 8720     assert(UseSSE >= 4, "required");
 8721     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8722   %}
 8723   ins_pipe( pipe_slow );
 8724 %}
 8725 
 8726 instruct loadShuffleI_avx(vec dst, vec src) %{
 8727   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8728             UseAVX >= 2);
 8729   match(Set dst (VectorLoadShuffle src));
 8730   format %{ "vector_load_shuffle $dst, $src" %}
 8731   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
 8733     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8734   %}
 8735   ins_pipe( pipe_slow );
 8736 %}
 8737 
 8738 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8739   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8740             UseAVX >= 2);
 8741   match(Set dst (VectorRearrange src shuffle));
 8742   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8743   ins_encode %{
 8744     int vlen_enc = vector_length_encoding(this);
 8745     if (vlen_enc == Assembler::AVX_128bit) {
 8746       vlen_enc = Assembler::AVX_256bit;
 8747     }
 8748     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8749   %}
 8750   ins_pipe( pipe_slow );
 8751 %}
 8752 
 8753 // LoadShuffle/Rearrange for Long and Double
 8754 
 8755 instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
 8756   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8757             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8758   match(Set dst (VectorLoadShuffle src));
 8759   effect(TEMP dst, TEMP vtmp);
 8760   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
 8761   ins_encode %{
 8762     assert(UseAVX >= 2, "required");
 8763 
 8764     int vlen_enc = vector_length_encoding(this);
 8765     // Create a double word shuffle mask from the long shuffle mask;
 8766     // only a double word shuffle instruction (vpermd) is available on these platforms.
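          // For example, a long shuffle index of 1 becomes the double word indices {2, 3}.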
 8767 
 8768     // Multiply each shuffle by two to get double word index
 8769     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8770     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8771 
 8772     // Duplicate each double word shuffle
 8773     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8774     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8775 
 8776     // Add one to get alternate double word index
 8777     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, noreg);
 8778   %}
 8779   ins_pipe( pipe_slow );
 8780 %}
 8781 
 8782 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8783   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8784             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8785   match(Set dst (VectorRearrange src shuffle));
 8786   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8787   ins_encode %{
 8788     assert(UseAVX >= 2, "required");
 8789 
 8790     int vlen_enc = vector_length_encoding(this);
 8791     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8792   %}
 8793   ins_pipe( pipe_slow );
 8794 %}
 8795 
 8796 instruct loadShuffleL_evex(vec dst, vec src) %{
 8797   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8798             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8799   match(Set dst (VectorLoadShuffle src));
 8800   format %{ "vector_load_shuffle $dst, $src" %}
 8801   ins_encode %{
 8802     assert(UseAVX > 2, "required");
 8803 
 8804     int vlen_enc = vector_length_encoding(this);
 8805     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8806   %}
 8807   ins_pipe( pipe_slow );
 8808 %}
 8809 
 8810 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8811   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8812             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8813   match(Set dst (VectorRearrange src shuffle));
 8814   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8815   ins_encode %{
 8816     assert(UseAVX > 2, "required");
 8817 
 8818     int vlen_enc = vector_length_encoding(this);
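          // vpermq with a vector index has no 128-bit form, so 128-bit vectors are
          // encoded with the 256-bit length; only the low 128 bits of the result are used.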
 8819     if (vlen_enc == Assembler::AVX_128bit) {
 8820       vlen_enc = Assembler::AVX_256bit;
 8821     }
 8822     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8823   %}
 8824   ins_pipe( pipe_slow );
 8825 %}
 8826 
 8827 // --------------------------------- FMA --------------------------------------
 8828 // a * b + c
 8829 
 8830 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8831   match(Set c (FmaVF  c (Binary a b)));
 8832   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8833   ins_cost(150);
 8834   ins_encode %{
 8835     assert(UseFMA, "not enabled");
 8836     int vlen_enc = vector_length_encoding(this);
 8837     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8838   %}
 8839   ins_pipe( pipe_slow );
 8840 %}
 8841 
 8842 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8843   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8844   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8845   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8846   ins_cost(150);
 8847   ins_encode %{
 8848     assert(UseFMA, "not enabled");
 8849     int vlen_enc = vector_length_encoding(this);
 8850     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8851   %}
 8852   ins_pipe( pipe_slow );
 8853 %}
 8854 
 8855 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8856   match(Set c (FmaVD  c (Binary a b)));
 8857   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8858   ins_cost(150);
 8859   ins_encode %{
 8860     assert(UseFMA, "not enabled");
 8861     int vlen_enc = vector_length_encoding(this);
 8862     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8863   %}
 8864   ins_pipe( pipe_slow );
 8865 %}
 8866 
 8867 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8868   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8869   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8870   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8871   ins_cost(150);
 8872   ins_encode %{
 8873     assert(UseFMA, "not enabled");
 8874     int vlen_enc = vector_length_encoding(this);
 8875     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8876   %}
 8877   ins_pipe( pipe_slow );
 8878 %}
 8879 
 8880 // --------------------------------- Vector Multiply Add --------------------------------------
 8881 
 8882 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8883   predicate(UseAVX == 0);
 8884   match(Set dst (MulAddVS2VI dst src1));
 8885   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8886   ins_encode %{
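          // pmaddwd multiplies adjacent pairs of signed 16-bit elements and adds each
          // pair into a signed 32-bit result.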
 8887     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8888   %}
 8889   ins_pipe( pipe_slow );
 8890 %}
 8891 
 8892 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8893   predicate(UseAVX > 0);
 8894   match(Set dst (MulAddVS2VI src1 src2));
 8895   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8896   ins_encode %{
 8897     int vlen_enc = vector_length_encoding(this);
 8898     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8899   %}
 8900   ins_pipe( pipe_slow );
 8901 %}
 8902 
 8903 // --------------------------------- Vector Multiply Add Add ----------------------------------
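      // On AVX512_VNNI targets the multiply-add and the accumulating add fuse into a
      // single evpdpwssd; the low ins_cost biases the matcher toward this fused rule.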
 8904 
 8905 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8906   predicate(VM_Version::supports_avx512_vnni());
 8907   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8908   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8909   ins_encode %{
 8910     assert(UseAVX > 2, "required");
 8911     int vlen_enc = vector_length_encoding(this);
 8912     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8913   %}
 8914   ins_pipe( pipe_slow );
 8915   ins_cost(10);
 8916 %}
 8917 
 8918 // --------------------------------- PopCount --------------------------------------
 8919 
 8920 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8921   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8922   match(Set dst (PopCountVI src));
 8923   match(Set dst (PopCountVL src));
 8924   format %{ "vector_popcount_integral $dst, $src" %}
 8925   ins_encode %{
 8926     int opcode = this->ideal_Opcode();
 8927     int vlen_enc = vector_length_encoding(this, $src);
 8928     BasicType bt = Matcher::vector_element_basic_type(this, $src);
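          // Passing k0 as the mask register selects the unmasked form of the operation.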
 8929     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
 8930     // TODO: Once the auto-vectorizer supports the ConvL2I operation, PopCountVL
 8931     // should be succeeded by its corresponding vector IR and the following
 8932     // special handling should be removed.
 8933     if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
 8934       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8935     }
 8936   %}
 8937   ins_pipe( pipe_slow );
 8938 %}
 8939 
 8940 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8941   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8942   match(Set dst (PopCountVI src mask));
 8943   match(Set dst (PopCountVL src mask));
 8944   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8945   ins_encode %{
 8946     int vlen_enc = vector_length_encoding(this, $src);
 8947     BasicType bt = Matcher::vector_element_basic_type(this, $src);
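          // Copy src into dst first so that lanes with a cleared mask bit keep their
          // original value under merge-masking.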
 8948     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8949     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8950   %}
 8951   ins_pipe( pipe_slow );
 8952 %}
 8953 
 8954 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8955   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8956   match(Set dst (PopCountVI src));
 8957   match(Set dst (PopCountVL src));
 8958   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8959   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8960   ins_encode %{
 8961     int opcode = this->ideal_Opcode();
 8962     int vlen_enc = vector_length_encoding(this, $src);
 8963     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8964     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8965                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
 8966     // TODO: Once the auto-vectorizer supports the ConvL2I operation, PopCountVL
 8967     // should be succeeded by its corresponding vector IR and the following
 8968     // special handling should be removed.
 8969     if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
 8970       if (VM_Version::supports_avx512vl()) {
 8971         __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8972       } else {
 8973         assert(VM_Version::supports_avx2(), "");
 8974         __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8975         __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8976       }
 8977     }
 8978   %}
 8979   ins_pipe( pipe_slow );
 8980 %}
 8981 
 8982 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8983 
 8984 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8985   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8986                                               Matcher::vector_length_in_bytes(n->in(1))));
 8987   match(Set dst (CountTrailingZerosV src));
 8988   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8989   ins_cost(400);
 8990   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8991   ins_encode %{
 8992     int vlen_enc = vector_length_encoding(this, $src);
 8993     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8994     BasicType rbt = Matcher::vector_element_basic_type(this);
 8995     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8996                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8997     // TODO: Once the auto-vectorizer supports the ConvL2I operation, CountTrailingZerosV
 8998     // should be succeeded by its corresponding vector IR and the following
 8999     // special handling should be removed.
 9000     if (bt == T_LONG && rbt == T_INT) {
 9001       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 9002     }
 9003   %}
 9004   ins_pipe( pipe_slow );
 9005 %}
 9006 
 9007 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9008   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9009             VM_Version::supports_avx512cd() &&
 9010             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9011   match(Set dst (CountTrailingZerosV src));
 9012   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9013   ins_cost(400);
 9014   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 9015   ins_encode %{
 9016     int vlen_enc = vector_length_encoding(this, $src);
 9017     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9018     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9019                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 9020   %}
 9021   ins_pipe( pipe_slow );
 9022 %}
 9023 
 9024 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 9025   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9026   match(Set dst (CountTrailingZerosV src));
 9027   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 9028   ins_cost(400);
 9029   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 9030   ins_encode %{
 9031     int vlen_enc = vector_length_encoding(this, $src);
 9032     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9033     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9034                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 9035                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 9036   %}
 9037   ins_pipe( pipe_slow );
 9038 %}
 9039 
 9040 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9041   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9042   match(Set dst (CountTrailingZerosV src));
 9043   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9044   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9045   ins_encode %{
 9046     int vlen_enc = vector_length_encoding(this, $src);
 9047     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9048     BasicType rbt = Matcher::vector_element_basic_type(this);
 9049     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9050                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9051     // TODO: Once the auto-vectorizer supports the ConvL2I operation, CountTrailingZerosV
 9052     // should be succeeded by its corresponding vector IR and the following
 9053     // special handling should be removed.
 9054     if (bt == T_LONG && rbt == T_INT) {
 9055       assert(VM_Version::supports_avx2(), "");
 9056       __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 9057       __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 9058     }
 9059   %}
 9060   ins_pipe( pipe_slow );
 9061 %}
 9062 
 9063 
 9064 // --------------------------------- Bitwise Ternary Logic ----------------------------------
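      // The immU8 operand is the vpternlogd truth-table immediate: for each bit position,
      // the bits taken from dst, src2 and src3 form a 3-bit index (dst is the most
      // significant), and the indexed bit of $func is the result.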
 9065 
 9066 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 9067   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 9068   effect(TEMP dst);
 9069   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9070   ins_encode %{
 9071     int vector_len = vector_length_encoding(this);
 9072     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 9073   %}
 9074   ins_pipe( pipe_slow );
 9075 %}
 9076 
 9077 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 9078   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 9079   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 9080   effect(TEMP dst);
 9081   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 9082   ins_encode %{
 9083     int vector_len = vector_length_encoding(this);
 9084     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 9085   %}
 9086   ins_pipe( pipe_slow );
 9087 %}
 9088 
 9089 // --------------------------------- Rotation Operations ----------------------------------
 9090 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 9091   match(Set dst (RotateLeftV src shift));
 9092   match(Set dst (RotateRightV src shift));
 9093   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 9094   ins_encode %{
 9095     int opcode      = this->ideal_Opcode();
 9096     int vector_len  = vector_length_encoding(this);
 9097     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9098     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 9099   %}
 9100   ins_pipe( pipe_slow );
 9101 %}
 9102 
 9103 instruct vprorate(vec dst, vec src, vec shift) %{
 9104   match(Set dst (RotateLeftV src shift));
 9105   match(Set dst (RotateRightV src shift));
 9106   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 9107   ins_encode %{
 9108     int opcode      = this->ideal_Opcode();
 9109     int vector_len  = vector_length_encoding(this);
 9110     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 9111     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 9112   %}
 9113   ins_pipe( pipe_slow );
 9114 %}
 9115 
 9116 // ---------------------------------- Masked Operations ------------------------------------
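      // Two flavors of masked load/store are provided: on AVX targets the mask is held in a
      // vector register (and only non-subword element types are handled), while on AVX-512
      // targets it is held in an opmask (k) register; the isa_vectmask() predicate on the
      // mask input selects between the two.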
 9117 instruct vmasked_load_avx_non_subword(vec dst, memory mem, vec mask) %{
 9118   predicate(!n->in(3)->bottom_type()->isa_vectmask());
 9119   match(Set dst (LoadVectorMasked mem mask));
 9120   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 9121   ins_encode %{
 9122     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 9123     int vlen_enc = vector_length_encoding(this);
 9124     __ vmovmask(elmType, $dst$$XMMRegister, $mem$$Address, $mask$$XMMRegister, vlen_enc);
 9125   %}
 9126   ins_pipe( pipe_slow );
 9127 %}
 9128 
 9129 
 9130 instruct vmasked_load_evex(vec dst, memory mem, kReg mask) %{
 9131   predicate(n->in(3)->bottom_type()->isa_vectmask());
 9132   match(Set dst (LoadVectorMasked mem mask));
 9133   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 9134   ins_encode %{
 9135     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
 9136     int vector_len = vector_length_encoding(this);
 9137     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, false, vector_len);
 9138   %}
 9139   ins_pipe( pipe_slow );
 9140 %}
 9141 
 9142 instruct vmasked_store_avx_non_subword(memory mem, vec src, vec mask) %{
 9143   predicate(!n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9144   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9145   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9146   ins_encode %{
 9147     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9148     int vlen_enc = vector_length_encoding(src_node);
 9149     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9150     __ vmovmask(elmType, $mem$$Address, $src$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 9151   %}
 9152   ins_pipe( pipe_slow );
 9153 %}
 9154 
 9155 instruct vmasked_store_evex(memory mem, vec src, kReg mask) %{
 9156   predicate(n->in(3)->in(2)->bottom_type()->isa_vectmask());
 9157   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 9158   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 9159   ins_encode %{
 9160     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
 9161     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
 9162     int vlen_enc = vector_length_encoding(src_node);
 9163     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, true, vlen_enc);
 9164   %}
 9165   ins_pipe( pipe_slow );
 9166 %}
 9167 
 9168 #ifdef _LP64
 9169 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 9170   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 9171   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 9172   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 9173   ins_encode %{
 9174     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 9175     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 9176 
 9177     Label DONE;
 9178     int vlen_enc = vector_length_encoding(this, $src1);
 9179     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
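          // $ktmp1 receives the lanes that are both selected by $mask and equal. If every
          // lane is either unselected or equal (kortest sets CF), $dst keeps -1; otherwise
          // $dst becomes the index of the first lane that fails that test.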
 9180 
 9181     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 9182     __ mov64($dst$$Register, -1L);
 9183     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 9184     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 9185     __ jccb(Assembler::carrySet, DONE);
 9186     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 9187     __ notq($dst$$Register);
 9188     __ tzcntq($dst$$Register, $dst$$Register);
 9189     __ bind(DONE);
 9190   %}
 9191   ins_pipe( pipe_slow );
 9192 %}
 9193 
 9194 
 9195 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
 9196   match(Set dst (VectorMaskGen len));
 9197   effect(TEMP temp);
 9198   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 9199   ins_encode %{
 9200     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 9201   %}
 9202   ins_pipe( pipe_slow );
 9203 %}
 9204 
 9205 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 9206   match(Set dst (VectorMaskGen len));
 9207   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 9208   effect(TEMP temp);
 9209   ins_encode %{
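          // Materialize a constant with the low $len bits set, then move it into the opmask register.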
 9210     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 9211     __ kmovql($dst$$KRegister, $temp$$Register);
 9212   %}
 9213   ins_pipe( pipe_slow );
 9214 %}
 9215 
 9216 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 9217   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9218   match(Set dst (VectorMaskToLong mask));
 9219   effect(TEMP dst, KILL cr);
 9220   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 9221   ins_encode %{
 9222     int opcode = this->ideal_Opcode();
 9223     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9224     int mask_len = Matcher::vector_length(this, $mask);
 9225     int mask_size = mask_len * type2aelembytes(mbt);
 9226     int vlen_enc = vector_length_encoding(this, $mask);
 9227     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9228                              $dst$$Register, mask_len, mask_size, vlen_enc);
 9229   %}
 9230   ins_pipe( pipe_slow );
 9231 %}
 9232 
 9233 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 9234   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9235   match(Set dst (VectorMaskToLong mask));
 9236   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9237   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9238   ins_encode %{
 9239     int opcode = this->ideal_Opcode();
 9240     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9241     int mask_len = Matcher::vector_length(this, $mask);
 9242     int vlen_enc = vector_length_encoding(this, $mask);
 9243     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9244                              $dst$$Register, mask_len, mbt, vlen_enc);
 9245   %}
 9246   ins_pipe( pipe_slow );
 9247 %}
 9248 
 9249 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9250   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9251   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9252   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9253   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9254   ins_encode %{
 9255     int opcode = this->ideal_Opcode();
 9256     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9257     int mask_len = Matcher::vector_length(this, $mask);
 9258     int vlen_enc = vector_length_encoding(this, $mask);
 9259     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9260                              $dst$$Register, mask_len, mbt, vlen_enc);
 9261   %}
 9262   ins_pipe( pipe_slow );
 9263 %}
 9264 
 9265 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9266   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9267   match(Set dst (VectorMaskTrueCount mask));
 9268   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9269   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9270   ins_encode %{
 9271     int opcode = this->ideal_Opcode();
 9272     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9273     int mask_len = Matcher::vector_length(this, $mask);
 9274     int mask_size = mask_len * type2aelembytes(mbt);
 9275     int vlen_enc = vector_length_encoding(this, $mask);
 9276     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9277                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9278   %}
 9279   ins_pipe( pipe_slow );
 9280 %}
 9281 
 9282 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9283   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9284   match(Set dst (VectorMaskTrueCount mask));
 9285   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9286   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9287   ins_encode %{
 9288     int opcode = this->ideal_Opcode();
 9289     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9290     int mask_len = Matcher::vector_length(this, $mask);
 9291     int vlen_enc = vector_length_encoding(this, $mask);
 9292     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9293                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9294   %}
 9295   ins_pipe( pipe_slow );
 9296 %}
 9297 
 9298 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9299   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9300   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9301   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9302   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9303   ins_encode %{
 9304     int opcode = this->ideal_Opcode();
 9305     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9306     int mask_len = Matcher::vector_length(this, $mask);
 9307     int vlen_enc = vector_length_encoding(this, $mask);
 9308     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9309                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9310   %}
 9311   ins_pipe( pipe_slow );
 9312 %}
 9313 
 9314 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9315   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9316   match(Set dst (VectorMaskFirstTrue mask));
 9317   match(Set dst (VectorMaskLastTrue mask));
 9318   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9319   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9320   ins_encode %{
 9321     int opcode = this->ideal_Opcode();
 9322     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9323     int mask_len = Matcher::vector_length(this, $mask);
 9324     int mask_size = mask_len * type2aelembytes(mbt);
 9325     int vlen_enc = vector_length_encoding(this, $mask);
 9326     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9327                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9328   %}
 9329   ins_pipe( pipe_slow );
 9330 %}
 9331 
 9332 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9333   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9334   match(Set dst (VectorMaskFirstTrue mask));
 9335   match(Set dst (VectorMaskLastTrue mask));
 9336   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9337   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9338   ins_encode %{
 9339     int opcode = this->ideal_Opcode();
 9340     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9341     int mask_len = Matcher::vector_length(this, $mask);
 9342     int vlen_enc = vector_length_encoding(this, $mask);
 9343     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9344                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9345   %}
 9346   ins_pipe( pipe_slow );
 9347 %}
 9348 
 9349 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9350   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9351   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9352   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9353   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9354   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9355   ins_encode %{
 9356     int opcode = this->ideal_Opcode();
 9357     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9358     int mask_len = Matcher::vector_length(this, $mask);
 9359     int vlen_enc = vector_length_encoding(this, $mask);
 9360     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9361                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9362   %}
 9363   ins_pipe( pipe_slow );
 9364 %}
 9365 
 9366 // --------------------------------- Compress/Expand Operations ---------------------------
 9367 
 9368 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9369   match(Set dst (CompressV src mask));
 9370   match(Set dst (ExpandV src mask));
 9371   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9372   ins_encode %{
 9373     int opcode = this->ideal_Opcode();
 9374     int vector_len = vector_length_encoding(this);
 9375     BasicType bt  = Matcher::vector_element_basic_type(this);
 9376     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9377   %}
 9378   ins_pipe( pipe_slow );
 9379 %}
 9380 
 9381 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9382   match(Set dst (CompressM mask));
 9383   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9384   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9385   ins_encode %{
 9386     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9387     int mask_len = Matcher::vector_length(this);
 9388     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9389   %}
 9390   ins_pipe( pipe_slow );
 9391 %}
 9392 
 9393 #endif // _LP64
 9394 
 9395 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9396 
 9397 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9398   predicate(!VM_Version::supports_gfni());
 9399   match(Set dst (ReverseV src));
 9400   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9401   format %{ "vector_reverse_bit_evex $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9402   ins_encode %{
 9403     int vec_enc = vector_length_encoding(this);
 9404     BasicType bt = Matcher::vector_element_basic_type(this);
 9405     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9406                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9407   %}
 9408   ins_pipe( pipe_slow );
 9409 %}
 9410 
 9411 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp) %{
 9412   predicate(VM_Version::supports_gfni());
 9413   match(Set dst (ReverseV src));
 9414   effect(TEMP dst, TEMP xtmp);
 9415   format %{ "vector_reverse_bit_gfni $dst, $src\t! using $xtmp as TEMP" %}
 9416   ins_encode %{
 9417     int vec_enc = vector_length_encoding(this);
 9418     BasicType bt  = Matcher::vector_element_basic_type(this);
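          // 0x8040201008040201 is the GF(2) affine matrix that gf2p8affineqb uses here to
          // reverse the bit order within each byte.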
 9419     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9420     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, addr, vec_enc,
 9421                                $xtmp$$XMMRegister);
 9422   %}
 9423   ins_pipe( pipe_slow );
 9424 %}
 9425 
 9426 instruct vreverse_byte_reg(vec dst, vec src) %{
 9427   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9428   match(Set dst (ReverseBytesV src));
 9429   effect(TEMP dst);
 9430   format %{ "vector_reverse_byte $dst, $src" %}
 9431   ins_encode %{
 9432     int vec_enc = vector_length_encoding(this);
 9433     BasicType bt = Matcher::vector_element_basic_type(this);
 9434     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, vec_enc);
 9435   %}
 9436   ins_pipe( pipe_slow );
 9437 %}
 9438 
 9439 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9440   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9441   match(Set dst (ReverseBytesV src));
 9442   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 9443   format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9444   ins_encode %{
 9445     int vec_enc = vector_length_encoding(this);
 9446     BasicType bt = Matcher::vector_element_basic_type(this);
 9447     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9448                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9449   %}
 9450   ins_pipe( pipe_slow );
 9451 %}
 9452 
 9453 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9454 
 9455 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9456   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9457                                               Matcher::vector_length_in_bytes(n->in(1))));
 9458   match(Set dst (CountLeadingZerosV src));
 9459   format %{ "vector_count_leading_zeros $dst, $src" %}
 9460   ins_encode %{
 9461     int vlen_enc = vector_length_encoding(this, $src);
 9462     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9463     BasicType rbt = Matcher::vector_element_basic_type(this);
 9464     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 9465                                        xnoreg, xnoreg, k0, noreg, true, vlen_enc);
 9466     // TODO: Once the auto-vectorizer supports the ConvL2I operation, CountLeadingZerosV
 9467     // should be succeeded by its corresponding vector IR and the following
 9468     // special handling should be removed.
 9469     if (rbt == T_INT && bt == T_LONG) {
 9470       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 9471     }
 9472   %}
 9473   ins_pipe( pipe_slow );
 9474 %}
 9475 
 9476 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9477   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9478                                               Matcher::vector_length_in_bytes(n->in(1))));
 9479   match(Set dst (CountLeadingZerosV src mask));
 9480   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9481   ins_encode %{
 9482     int vlen_enc = vector_length_encoding(this, $src);
 9483     BasicType bt = Matcher::vector_element_basic_type(this, $src);
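          // Copy src into dst first so that lanes with a cleared mask bit keep their
          // original value under merge-masking.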
 9484     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9485     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9486                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9487   %}
 9488   ins_pipe( pipe_slow );
 9489 %}
 9490 
 9491 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9492   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9493             VM_Version::supports_avx512cd() &&
 9494             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9495   match(Set dst (CountLeadingZerosV src));
 9496   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
 9497   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9498   ins_encode %{
 9499     int vlen_enc = vector_length_encoding(this, $src);
 9500     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9501     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9502                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9503   %}
 9504   ins_pipe( pipe_slow );
 9505 %}
 9506 
 9507 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9508   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9509   match(Set dst (CountLeadingZerosV src));
 9510   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
 9511   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9512   ins_encode %{
 9513     int vlen_enc = vector_length_encoding(this, $src);
 9514     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9515     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9516                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9517                                        $rtmp$$Register, true, vlen_enc);
 9518   %}
 9519   ins_pipe( pipe_slow );
 9520 %}
 9521 
 9522 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9523   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9524             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9525   match(Set dst (CountLeadingZerosV src));
 9526   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9527   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9528   ins_encode %{
 9529     int vlen_enc = vector_length_encoding(this, $src);
 9530     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9531     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9532                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9533   %}
 9534   ins_pipe( pipe_slow );
 9535 %}
 9536 
 9537 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9538   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9539             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9540   match(Set dst (CountLeadingZerosV src));
 9541   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9542   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9543   ins_encode %{
 9544     int vlen_enc = vector_length_encoding(this, $src);
 9545     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9546     BasicType rbt = Matcher::vector_element_basic_type(this);
 9547     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9548                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
 9549     // TODO: Once the auto-vectorizer supports the ConvL2I operation, CountLeadingZerosV
 9550     // should be succeeded by its corresponding vector IR and the following
 9551     // special handling should be removed.
 9552     if (rbt == T_INT && bt == T_LONG) {
 9553       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 9554     }
 9555   %}
 9556   ins_pipe( pipe_slow );
 9557 %}
 9558 
 9559 // ---------------------------------- Vector Masked Operations ------------------------------------
 9560 
 9561 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9562   match(Set dst (AddVB (Binary dst src2) mask));
 9563   match(Set dst (AddVS (Binary dst src2) mask));
 9564   match(Set dst (AddVI (Binary dst src2) mask));
 9565   match(Set dst (AddVL (Binary dst src2) mask));
 9566   match(Set dst (AddVF (Binary dst src2) mask));
 9567   match(Set dst (AddVD (Binary dst src2) mask));
 9568   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9569   ins_encode %{
 9570     int vlen_enc = vector_length_encoding(this);
 9571     BasicType bt = Matcher::vector_element_basic_type(this);
 9572     int opc = this->ideal_Opcode();
 9573     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9574                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9575   %}
 9576   ins_pipe( pipe_slow );
 9577 %}
 9578 
 9579 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9580   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9581   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9582   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9583   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9584   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9585   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9586   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9587   ins_encode %{
 9588     int vlen_enc = vector_length_encoding(this);
 9589     BasicType bt = Matcher::vector_element_basic_type(this);
 9590     int opc = this->ideal_Opcode();
 9591     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9592                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9593   %}
 9594   ins_pipe( pipe_slow );
 9595 %}
 9596 
 9597 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9598   match(Set dst (XorV (Binary dst src2) mask));
 9599   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9600   ins_encode %{
 9601     int vlen_enc = vector_length_encoding(this);
 9602     BasicType bt = Matcher::vector_element_basic_type(this);
 9603     int opc = this->ideal_Opcode();
 9604     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9605                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9606   %}
 9607   ins_pipe( pipe_slow );
 9608 %}
 9609 
 9610 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9611   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9612   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9613   ins_encode %{
 9614     int vlen_enc = vector_length_encoding(this);
 9615     BasicType bt = Matcher::vector_element_basic_type(this);
 9616     int opc = this->ideal_Opcode();
 9617     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9618                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9619   %}
 9620   ins_pipe( pipe_slow );
 9621 %}
 9622 
 9623 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9624   match(Set dst (OrV (Binary dst src2) mask));
 9625   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9626   ins_encode %{
 9627     int vlen_enc = vector_length_encoding(this);
 9628     BasicType bt = Matcher::vector_element_basic_type(this);
 9629     int opc = this->ideal_Opcode();
 9630     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9631                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9632   %}
 9633   ins_pipe( pipe_slow );
 9634 %}
 9635 
 9636 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9637   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9638   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9639   ins_encode %{
 9640     int vlen_enc = vector_length_encoding(this);
 9641     BasicType bt = Matcher::vector_element_basic_type(this);
 9642     int opc = this->ideal_Opcode();
 9643     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9644                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9645   %}
 9646   ins_pipe( pipe_slow );
 9647 %}
 9648 
 9649 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9650   match(Set dst (AndV (Binary dst src2) mask));
 9651   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9652   ins_encode %{
 9653     int vlen_enc = vector_length_encoding(this);
 9654     BasicType bt = Matcher::vector_element_basic_type(this);
 9655     int opc = this->ideal_Opcode();
 9656     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9657                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9658   %}
 9659   ins_pipe( pipe_slow );
 9660 %}
 9661 
 9662 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9663   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9664   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9665   ins_encode %{
 9666     int vlen_enc = vector_length_encoding(this);
 9667     BasicType bt = Matcher::vector_element_basic_type(this);
 9668     int opc = this->ideal_Opcode();
 9669     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9670                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9671   %}
 9672   ins_pipe( pipe_slow );
 9673 %}
 9674 
 9675 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9676   match(Set dst (SubVB (Binary dst src2) mask));
 9677   match(Set dst (SubVS (Binary dst src2) mask));
 9678   match(Set dst (SubVI (Binary dst src2) mask));
 9679   match(Set dst (SubVL (Binary dst src2) mask));
 9680   match(Set dst (SubVF (Binary dst src2) mask));
 9681   match(Set dst (SubVD (Binary dst src2) mask));
 9682   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9683   ins_encode %{
 9684     int vlen_enc = vector_length_encoding(this);
 9685     BasicType bt = Matcher::vector_element_basic_type(this);
 9686     int opc = this->ideal_Opcode();
 9687     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9688                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9689   %}
 9690   ins_pipe( pipe_slow );
 9691 %}
 9692 
 9693 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9694   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9695   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9696   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9697   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9698   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9699   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9700   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9701   ins_encode %{
 9702     int vlen_enc = vector_length_encoding(this);
 9703     BasicType bt = Matcher::vector_element_basic_type(this);
 9704     int opc = this->ideal_Opcode();
 9705     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9706                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9707   %}
 9708   ins_pipe( pipe_slow );
 9709 %}
 9710 
 9711 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9712   match(Set dst (MulVS (Binary dst src2) mask));
 9713   match(Set dst (MulVI (Binary dst src2) mask));
 9714   match(Set dst (MulVL (Binary dst src2) mask));
 9715   match(Set dst (MulVF (Binary dst src2) mask));
 9716   match(Set dst (MulVD (Binary dst src2) mask));
 9717   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9718   ins_encode %{
 9719     int vlen_enc = vector_length_encoding(this);
 9720     BasicType bt = Matcher::vector_element_basic_type(this);
 9721     int opc = this->ideal_Opcode();
 9722     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9723                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9724   %}
 9725   ins_pipe( pipe_slow );
 9726 %}
 9727 
 9728 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9729   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9730   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9731   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9732   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9733   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9734   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9735   ins_encode %{
 9736     int vlen_enc = vector_length_encoding(this);
 9737     BasicType bt = Matcher::vector_element_basic_type(this);
 9738     int opc = this->ideal_Opcode();
 9739     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9740                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9741   %}
 9742   ins_pipe( pipe_slow );
 9743 %}
 9744 
 9745 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9746   match(Set dst (SqrtVF dst mask));
 9747   match(Set dst (SqrtVD dst mask));
 9748   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9749   ins_encode %{
 9750     int vlen_enc = vector_length_encoding(this);
 9751     BasicType bt = Matcher::vector_element_basic_type(this);
 9752     int opc = this->ideal_Opcode();
 9753     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9754                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9755   %}
 9756   ins_pipe( pipe_slow );
 9757 %}
 9758 
 9759 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9760   match(Set dst (DivVF (Binary dst src2) mask));
 9761   match(Set dst (DivVD (Binary dst src2) mask));
 9762   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9763   ins_encode %{
 9764     int vlen_enc = vector_length_encoding(this);
 9765     BasicType bt = Matcher::vector_element_basic_type(this);
 9766     int opc = this->ideal_Opcode();
 9767     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9768                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9769   %}
 9770   ins_pipe( pipe_slow );
 9771 %}
 9772 
 9773 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9774   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9775   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9776   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9777   ins_encode %{
 9778     int vlen_enc = vector_length_encoding(this);
 9779     BasicType bt = Matcher::vector_element_basic_type(this);
 9780     int opc = this->ideal_Opcode();
 9781     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9782                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9783   %}
 9784   ins_pipe( pipe_slow );
 9785 %}
 9786 
 9787 
 9788 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9789   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9790   match(Set dst (RotateRightV (Binary dst shift) mask));
 9791   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9792   ins_encode %{
 9793     int vlen_enc = vector_length_encoding(this);
 9794     BasicType bt = Matcher::vector_element_basic_type(this);
 9795     int opc = this->ideal_Opcode();
 9796     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9797                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9798   %}
 9799   ins_pipe( pipe_slow );
 9800 %}
 9801 
 9802 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9803   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9804   match(Set dst (RotateRightV (Binary dst src2) mask));
 9805   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9806   ins_encode %{
 9807     int vlen_enc = vector_length_encoding(this);
 9808     BasicType bt = Matcher::vector_element_basic_type(this);
 9809     int opc = this->ideal_Opcode();
 9810     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9811                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9812   %}
 9813   ins_pipe( pipe_slow );
 9814 %}
 9815 
 9816 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9817   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9818   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9819   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9820   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9821   ins_encode %{
 9822     int vlen_enc = vector_length_encoding(this);
 9823     BasicType bt = Matcher::vector_element_basic_type(this);
 9824     int opc = this->ideal_Opcode();
 9825     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9826                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9827   %}
 9828   ins_pipe( pipe_slow );
 9829 %}
 9830 
 9831 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9832   predicate(!n->as_ShiftV()->is_var_shift());
 9833   match(Set dst (LShiftVS (Binary dst src2) mask));
 9834   match(Set dst (LShiftVI (Binary dst src2) mask));
 9835   match(Set dst (LShiftVL (Binary dst src2) mask));
 9836   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9837   ins_encode %{
 9838     int vlen_enc = vector_length_encoding(this);
 9839     BasicType bt = Matcher::vector_element_basic_type(this);
 9840     int opc = this->ideal_Opcode();
 9841     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9842                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9843   %}
 9844   ins_pipe( pipe_slow );
 9845 %}
 9846 
 9847 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9848   predicate(n->as_ShiftV()->is_var_shift());
 9849   match(Set dst (LShiftVS (Binary dst src2) mask));
 9850   match(Set dst (LShiftVI (Binary dst src2) mask));
 9851   match(Set dst (LShiftVL (Binary dst src2) mask));
 9852   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9853   ins_encode %{
 9854     int vlen_enc = vector_length_encoding(this);
 9855     BasicType bt = Matcher::vector_element_basic_type(this);
 9856     int opc = this->ideal_Opcode();
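          // The trailing 'true' selects the variable-shift form, where each element is
          // shifted by the corresponding element of $src2.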
 9857     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9858                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9859   %}
 9860   ins_pipe( pipe_slow );
 9861 %}
 9862 
 9863 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9864   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9865   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9866   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9867   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9868   ins_encode %{
 9869     int vlen_enc = vector_length_encoding(this);
 9870     BasicType bt = Matcher::vector_element_basic_type(this);
 9871     int opc = this->ideal_Opcode();
 9872     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9873                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9874   %}
 9875   ins_pipe( pipe_slow );
 9876 %}
 9877 
 9878 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9879   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9880   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9881   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9882   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9883   ins_encode %{
 9884     int vlen_enc = vector_length_encoding(this);
 9885     BasicType bt = Matcher::vector_element_basic_type(this);
 9886     int opc = this->ideal_Opcode();
 9887     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9888                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9889   %}
 9890   ins_pipe( pipe_slow );
 9891 %}
 9892 
 9893 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9894   predicate(!n->as_ShiftV()->is_var_shift());
 9895   match(Set dst (RShiftVS (Binary dst src2) mask));
 9896   match(Set dst (RShiftVI (Binary dst src2) mask));
 9897   match(Set dst (RShiftVL (Binary dst src2) mask));
 9898   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9899   ins_encode %{
 9900     int vlen_enc = vector_length_encoding(this);
 9901     BasicType bt = Matcher::vector_element_basic_type(this);
 9902     int opc = this->ideal_Opcode();
 9903     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9904                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9905   %}
 9906   ins_pipe( pipe_slow );
 9907 %}
 9908 
 9909 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9910   predicate(n->as_ShiftV()->is_var_shift());
 9911   match(Set dst (RShiftVS (Binary dst src2) mask));
 9912   match(Set dst (RShiftVI (Binary dst src2) mask));
 9913   match(Set dst (RShiftVL (Binary dst src2) mask));
 9914   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9915   ins_encode %{
 9916     int vlen_enc = vector_length_encoding(this);
 9917     BasicType bt = Matcher::vector_element_basic_type(this);
 9918     int opc = this->ideal_Opcode();
 9919     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9920                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9921   %}
 9922   ins_pipe( pipe_slow );
 9923 %}
 9924 
 9925 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9926   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9927   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9928   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9929   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9930   ins_encode %{
 9931     int vlen_enc = vector_length_encoding(this);
 9932     BasicType bt = Matcher::vector_element_basic_type(this);
 9933     int opc = this->ideal_Opcode();
 9934     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9935                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9936   %}
 9937   ins_pipe( pipe_slow );
 9938 %}
 9939 
 9940 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9941   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9942   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9943   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9944   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9945   ins_encode %{
 9946     int vlen_enc = vector_length_encoding(this);
 9947     BasicType bt = Matcher::vector_element_basic_type(this);
 9948     int opc = this->ideal_Opcode();
 9949     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9950                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9951   %}
 9952   ins_pipe( pipe_slow );
 9953 %}
 9954 
 9955 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9956   predicate(!n->as_ShiftV()->is_var_shift());
 9957   match(Set dst (URShiftVS (Binary dst src2) mask));
 9958   match(Set dst (URShiftVI (Binary dst src2) mask));
 9959   match(Set dst (URShiftVL (Binary dst src2) mask));
 9960   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9961   ins_encode %{
 9962     int vlen_enc = vector_length_encoding(this);
 9963     BasicType bt = Matcher::vector_element_basic_type(this);
 9964     int opc = this->ideal_Opcode();
 9965     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9966                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9967   %}
 9968   ins_pipe( pipe_slow );
 9969 %}
 9970 
 9971 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9972   predicate(n->as_ShiftV()->is_var_shift());
 9973   match(Set dst (URShiftVS (Binary dst src2) mask));
 9974   match(Set dst (URShiftVI (Binary dst src2) mask));
 9975   match(Set dst (URShiftVL (Binary dst src2) mask));
 9976   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9977   ins_encode %{
 9978     int vlen_enc = vector_length_encoding(this);
 9979     BasicType bt = Matcher::vector_element_basic_type(this);
 9980     int opc = this->ideal_Opcode();
 9981     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9982                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9983   %}
 9984   ins_pipe( pipe_slow );
 9985 %}
 9986 
 9987 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9988   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9989   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9990   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9991   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9992   ins_encode %{
 9993     int vlen_enc = vector_length_encoding(this);
 9994     BasicType bt = Matcher::vector_element_basic_type(this);
 9995     int opc = this->ideal_Opcode();
 9996     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9997                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9998   %}
 9999   ins_pipe( pipe_slow );
10000 %}
10001 
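      // Masked vector max/min with a register or memory second operand.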
10002 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
10003   match(Set dst (MaxV (Binary dst src2) mask));
10004   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10005   ins_encode %{
10006     int vlen_enc = vector_length_encoding(this);
10007     BasicType bt = Matcher::vector_element_basic_type(this);
10008     int opc = this->ideal_Opcode();
10009     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10010                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10011   %}
10012   ins_pipe( pipe_slow );
10013 %}
10014 
10015 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
10016   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
10017   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
10018   ins_encode %{
10019     int vlen_enc = vector_length_encoding(this);
10020     BasicType bt = Matcher::vector_element_basic_type(this);
10021     int opc = this->ideal_Opcode();
10022     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10023                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10024   %}
10025   ins_pipe( pipe_slow );
10026 %}
10027 
10028 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
10029   match(Set dst (MinV (Binary dst src2) mask));
10030   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10031   ins_encode %{
10032     int vlen_enc = vector_length_encoding(this);
10033     BasicType bt = Matcher::vector_element_basic_type(this);
10034     int opc = this->ideal_Opcode();
10035     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10036                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
10037   %}
10038   ins_pipe( pipe_slow );
10039 %}
10040 
10041 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
10042   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
10043   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
10044   ins_encode %{
10045     int vlen_enc = vector_length_encoding(this);
10046     BasicType bt = Matcher::vector_element_basic_type(this);
10047     int opc = this->ideal_Opcode();
10048     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10049                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
10050   %}
10051   ins_pipe( pipe_slow );
10052 %}
10053 
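      // Masked vector rearrange (permutation under a shuffle held in $src2).
      // The merge flag is false here, so lanes not selected by $mask are
      // zeroed rather than preserved.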
10054 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
10055   match(Set dst (VectorRearrange (Binary dst src2) mask));
10056   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
10057   ins_encode %{
10058     int vlen_enc = vector_length_encoding(this);
10059     BasicType bt = Matcher::vector_element_basic_type(this);
10060     int opc = this->ideal_Opcode();
10061     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10062                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
10063   %}
10064   ins_pipe( pipe_slow );
10065 %}
10066 
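      // Masked vector absolute value.  The destination doubles as the source,
      // so only the lanes selected by $mask are overwritten with their
      // absolute value.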
10067 instruct vabs_masked(vec dst, kReg mask) %{
10068   match(Set dst (AbsVB dst mask));
10069   match(Set dst (AbsVS dst mask));
10070   match(Set dst (AbsVI dst mask));
10071   match(Set dst (AbsVL dst mask));
10072   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
10073   ins_encode %{
10074     int vlen_enc = vector_length_encoding(this);
10075     BasicType bt = Matcher::vector_element_basic_type(this);
10076     int opc = this->ideal_Opcode();
10077     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10078                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
10079   %}
10080   ins_pipe( pipe_slow );
10081 %}
10082 
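      // Masked fused multiply-add: for lanes selected by $mask the result is
      // dst * src2 + src3, with either a register or a memory third operand.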
10083 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
10084   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
10085   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
10086   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10087   ins_encode %{
10088     int vlen_enc = vector_length_encoding(this);
10089     BasicType bt = Matcher::vector_element_basic_type(this);
10090     int opc = this->ideal_Opcode();
10091     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10092                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
10093   %}
10094   ins_pipe( pipe_slow );
10095 %}
10096 
10097 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
10098   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
10099   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
10100   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
10101   ins_encode %{
10102     int vlen_enc = vector_length_encoding(this);
10103     BasicType bt = Matcher::vector_element_basic_type(this);
10104     int opc = this->ideal_Opcode();
10105     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
10106                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
10107   %}
10108   ins_pipe( pipe_slow );
10109 %}
10110 
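      // Masked vector compare producing a predicate (k-register) result.  The
      // boolean-test constant is translated into the matching integer or
      // floating point comparison predicate, and $mask predicates the compare
      // so that only active lanes can set bits in $dst.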
10111 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask) %{
10112   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
10113   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask" %}
10114   ins_encode %{
10115     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
10116     int vlen_enc = vector_length_encoding(this, $src1);
10117     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
10118 
10119     // Dispatch the masked comparison on the source element type.
10120     switch (src1_elem_bt) {
10121       case T_BYTE: {
10122         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
10123         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10124         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10125         break;
10126       }
10127       case T_SHORT: {
10128         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
10129         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10130         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10131         break;
10132       }
10133       case T_INT: {
10134         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
10135         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10136         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10137         break;
10138       }
10139       case T_LONG: {
10140         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
10141         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
10142         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
10143         break;
10144       }
10145       case T_FLOAT: {
10146         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10147         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10148         break;
10149       }
10150       case T_DOUBLE: {
10151         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
10152         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
10153         break;
10154       }
10155       default: assert(false, "%s", type2name(src1_elem_bt)); break;
10156     }
10157   %}
10158   ins_pipe( pipe_slow );
10159 %}
10160 
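      // MaskAll broadcasts a scalar condition into every lane of a predicate
      // register.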
10161 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
10162   predicate(Matcher::vector_length(n) <= 32);
10163   match(Set dst (MaskAll src));
10164   format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
10165   ins_encode %{
10166     int mask_len = Matcher::vector_length(this);
10167     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
10168   %}
10169   ins_pipe( pipe_slow );
10170 %}
10171 
10172 #ifdef _LP64
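      // Mask negation: XOR-ing with MaskAll(-1), an all-ones mask, is a
      // bitwise NOT of the predicate.  Mask lengths below 8 additionally need
      // AVX512DQ and temporary registers.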
10173 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
10174   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
10175   match(Set dst (XorVMask src (MaskAll cnt)));
10176   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
10177   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
10178   ins_encode %{
10179     uint masklen = Matcher::vector_length(this);
10180     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
10181   %}
10182   ins_pipe( pipe_slow );
10183 %}
10184 
10185 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
10186   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
10187             (Matcher::vector_length(n) == 16) ||
10188             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
10189   match(Set dst (XorVMask src (MaskAll cnt)));
10190   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
10191   ins_encode %{
10192     uint masklen = Matcher::vector_length(this);
10193     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
10194   %}
10195   ins_pipe( pipe_slow );
10196 %}
10197 
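      // VectorLongToMask converts a long bit pattern into a vector mask.  The
      // AVX forms expand the bits into a vector register using GPR (and, for
      // more than 8 lanes, XMM) temporaries; with true predicate masks (EVEX)
      // a single kmov suffices.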
10198 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
10199   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
10200   match(Set dst (VectorLongToMask src));
10201   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
10202   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
10203   ins_encode %{
10204     int mask_len = Matcher::vector_length(this);
10205     int vec_enc  = vector_length_encoding(mask_len);
10206     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10207                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
10208   %}
10209   ins_pipe( pipe_slow );
10210 %}
10211 
10213 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
10214   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
10215   match(Set dst (VectorLongToMask src));
10216   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
10217   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
10218   ins_encode %{
10219     int mask_len = Matcher::vector_length(this);
10220     assert(mask_len <= 32, "invalid mask length");
10221     int vec_enc  = vector_length_encoding(mask_len);
10222     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
10223                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
10224   %}
10225   ins_pipe( pipe_slow );
10226 %}
10227 
10228 instruct long_to_mask_evex(kReg dst, rRegL src) %{
10229   predicate(n->bottom_type()->isa_vectmask());
10230   match(Set dst (VectorLongToMask src));
10231   format %{ "long_to_mask_evex $dst, $src\t!" %}
10232   ins_encode %{
10233     __ kmov($dst$$KRegister, $src$$Register);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 #endif
10238 
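      // Logical AND/OR/XOR directly on predicate registers.  Without AVX512DQ
      // the byte-granular k-register instructions are unavailable, so mask
      // lengths below 16 are widened to 16 bits.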
10239 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
10240   match(Set dst (AndVMask src1 src2));
10241   match(Set dst (OrVMask src1 src2));
10242   match(Set dst (XorVMask src1 src2));
10243   effect(TEMP kscratch);
10244   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
10245   ins_encode %{
10246     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
10247     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
10248     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "mask types must match");
10249     uint masklen = Matcher::vector_length(this);
10250     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
10251     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
10252   %}
10253   ins_pipe( pipe_slow );
10254 %}
10255 
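      // Masked three-input boolean function (vpternlog): the immediate $func
      // is the truth table, applied only to lanes selected by $mask.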
10256 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
10257   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10258   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10259   ins_encode %{
10260     int vlen_enc = vector_length_encoding(this);
10261     BasicType bt = Matcher::vector_element_basic_type(this);
10262     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10263                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
10264   %}
10265   ins_pipe( pipe_slow );
10266 %}
10267 
10268 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
10269   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
10270   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
10271   ins_encode %{
10272     int vlen_enc = vector_length_encoding(this);
10273     BasicType bt = Matcher::vector_element_basic_type(this);
10274     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
10275                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
10276   %}
10277   ins_pipe( pipe_slow );
10278 %}
10279 
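      // CastVV only adjusts the compile-time vector or mask type; it emits no
      // code.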
10280 instruct castMM(kReg dst)
10281 %{
10282   match(Set dst (CastVV dst));
10283 
10284   size(0);
10285   format %{ "# castVV of $dst" %}
10286   ins_encode(/* empty encoding */);
10287   ins_cost(0);
10288   ins_pipe(empty);
10289 %}
10290 
10291 instruct castVV(vec dst)
10292 %{
10293   match(Set dst (CastVV dst));
10294 
10295   size(0);
10296   format %{ "# castVV of $dst" %}
10297   ins_encode(/* empty encoding */);
10298   ins_cost(0);
10299   ins_pipe(empty);
10300 %}
10301 
10302 instruct castVVLeg(legVec dst)
10303 %{
10304   match(Set dst (CastVV dst));
10305 
10306   size(0);
10307   format %{ "# castVV of $dst" %}
10308   ins_encode(/* empty encoding */);
10309   ins_cost(0);
10310   ins_pipe(empty);
10311 %}
10312 
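      // IsInfinite via vfpclass: immediate 0x18 selects the +Inf (bit 3) and
      // -Inf (bit 4) classes, and the resulting predicate bit is copied into
      // a general purpose register.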
10313 instruct FloatClassCheck_reg_reg_vfpclass(rRegI dst, regF src, kReg ktmp, rFlagsReg cr)
10314 %{
10315   match(Set dst (IsInfiniteF src));
10316   effect(TEMP ktmp, KILL cr);
10317   format %{ "float_class_check $dst, $src" %}
10318   ins_encode %{
10319     __ vfpclassss($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10320     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10321   %}
10322   ins_pipe(pipe_slow);
10323 %}
10324 
10325 instruct DoubleClassCheck_reg_reg_vfpclass(rRegI dst, regD src, kReg ktmp, rFlagsReg cr)
10326 %{
10327   match(Set dst (IsInfiniteD src));
10328   effect(TEMP ktmp, KILL cr);
10329   format %{ "double_class_check $dst, $src" %}
10330   ins_encode %{
10331     __ vfpclasssd($ktmp$$KRegister, $src$$XMMRegister, 0x18);
10332     __ kmovbl($dst$$Register, $ktmp$$KRegister);
10333   %}
10334   ins_pipe(pipe_slow);
10335 %}
10336 
10337